]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | /* |
374ca955 A |
2 | ********************************************************************** |
3 | * Copyright (c) 2001-2004, International Business Machines Corporation | |
4 | * and others. All Rights Reserved. | |
b75a7d8f A |
5 | ********************************************************************** |
6 | * Date Name Description | |
7 | * 07/23/01 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "strmatch.h" | |
16 | #include "rbt_data.h" | |
17 | #include "util.h" | |
18 | #include "unicode/uniset.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
374ca955 | 22 | static const UChar EMPTY[] = { 0 }; // empty string: "" |
b75a7d8f | 23 | |
374ca955 | 24 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) |
b75a7d8f A |
25 | |
26 | StringMatcher::StringMatcher(const UnicodeString& theString, | |
27 | int32_t start, | |
28 | int32_t limit, | |
29 | int32_t segmentNum, | |
30 | const TransliterationRuleData& theData) : | |
31 | data(&theData), | |
32 | segmentNumber(segmentNum), | |
33 | matchStart(-1), | |
34 | matchLimit(-1) | |
35 | { | |
36 | theString.extractBetween(start, limit, pattern); | |
37 | } | |
38 | ||
39 | StringMatcher::StringMatcher(const StringMatcher& o) : | |
374ca955 | 40 | UnicodeFunctor(o), |
b75a7d8f | 41 | UnicodeMatcher(o), |
374ca955 | 42 | UnicodeReplacer(o), |
b75a7d8f A |
43 | pattern(o.pattern), |
44 | data(o.data), | |
45 | segmentNumber(o.segmentNumber), | |
46 | matchStart(o.matchStart), | |
47 | matchLimit(o.matchLimit) | |
48 | { | |
49 | } | |
50 | ||
51 | /** | |
52 | * Destructor | |
53 | */ | |
54 | StringMatcher::~StringMatcher() { | |
55 | } | |
56 | ||
57 | /** | |
58 | * Implement UnicodeFunctor | |
59 | */ | |
60 | UnicodeFunctor* StringMatcher::clone() const { | |
61 | return new StringMatcher(*this); | |
62 | } | |
63 | ||
64 | /** | |
65 | * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer | |
66 | * and return the pointer. | |
67 | */ | |
68 | UnicodeMatcher* StringMatcher::toMatcher() const { | |
69 | return (UnicodeMatcher*) this; | |
70 | } | |
71 | ||
72 | /** | |
73 | * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer | |
74 | * and return the pointer. | |
75 | */ | |
76 | UnicodeReplacer* StringMatcher::toReplacer() const { | |
77 | return (UnicodeReplacer*) this; | |
78 | } | |
79 | ||
80 | /** | |
81 | * Implement UnicodeMatcher | |
82 | */ | |
83 | UMatchDegree StringMatcher::matches(const Replaceable& text, | |
84 | int32_t& offset, | |
85 | int32_t limit, | |
86 | UBool incremental) { | |
87 | int32_t i; | |
88 | int32_t cursor = offset; | |
89 | if (limit < cursor) { | |
90 | // Match in the reverse direction | |
91 | for (i=pattern.length()-1; i>=0; --i) { | |
92 | UChar keyChar = pattern.charAt(i); | |
93 | UnicodeMatcher* subm = data->lookupMatcher(keyChar); | |
94 | if (subm == 0) { | |
95 | if (cursor > limit && | |
96 | keyChar == text.charAt(cursor)) { | |
97 | --cursor; | |
98 | } else { | |
99 | return U_MISMATCH; | |
100 | } | |
101 | } else { | |
102 | UMatchDegree m = | |
103 | subm->matches(text, cursor, limit, incremental); | |
104 | if (m != U_MATCH) { | |
105 | return m; | |
106 | } | |
107 | } | |
108 | } | |
109 | // Record the match position, but adjust for a normal | |
110 | // forward start, limit, and only if a prior match does not | |
111 | // exist -- we want the rightmost match. | |
112 | if (matchStart < 0) { | |
113 | matchStart = cursor+1; | |
114 | matchLimit = offset+1; | |
115 | } | |
116 | } else { | |
117 | for (i=0; i<pattern.length(); ++i) { | |
118 | if (incremental && cursor == limit) { | |
119 | // We've reached the context limit without a mismatch and | |
120 | // without completing our match. | |
121 | return U_PARTIAL_MATCH; | |
122 | } | |
123 | UChar keyChar = pattern.charAt(i); | |
124 | UnicodeMatcher* subm = data->lookupMatcher(keyChar); | |
125 | if (subm == 0) { | |
126 | // Don't need the cursor < limit check if | |
127 | // incremental is TRUE (because it's done above); do need | |
128 | // it otherwise. | |
129 | if (cursor < limit && | |
130 | keyChar == text.charAt(cursor)) { | |
131 | ++cursor; | |
132 | } else { | |
133 | return U_MISMATCH; | |
134 | } | |
135 | } else { | |
136 | UMatchDegree m = | |
137 | subm->matches(text, cursor, limit, incremental); | |
138 | if (m != U_MATCH) { | |
139 | return m; | |
140 | } | |
141 | } | |
142 | } | |
143 | // Record the match position | |
144 | matchStart = offset; | |
145 | matchLimit = cursor; | |
146 | } | |
147 | ||
148 | offset = cursor; | |
149 | return U_MATCH; | |
150 | } | |
151 | ||
152 | /** | |
153 | * Implement UnicodeMatcher | |
154 | */ | |
155 | UnicodeString& StringMatcher::toPattern(UnicodeString& result, | |
156 | UBool escapeUnprintable) const | |
157 | { | |
158 | result.truncate(0); | |
159 | UnicodeString str, quoteBuf; | |
160 | if (segmentNumber > 0) { | |
161 | result.append((UChar)40); /*(*/ | |
162 | } | |
163 | for (int32_t i=0; i<pattern.length(); ++i) { | |
164 | UChar keyChar = pattern.charAt(i); | |
165 | const UnicodeMatcher* m = data->lookupMatcher(keyChar); | |
166 | if (m == 0) { | |
167 | ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); | |
168 | } else { | |
169 | ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), | |
170 | TRUE, escapeUnprintable, quoteBuf); | |
171 | } | |
172 | } | |
173 | if (segmentNumber > 0) { | |
174 | result.append((UChar)41); /*)*/ | |
175 | } | |
176 | // Flush quoteBuf out to result | |
177 | ICU_Utility::appendToRule(result, -1, | |
178 | TRUE, escapeUnprintable, quoteBuf); | |
179 | return result; | |
180 | } | |
181 | ||
182 | /** | |
183 | * Implement UnicodeMatcher | |
184 | */ | |
185 | UBool StringMatcher::matchesIndexValue(uint8_t v) const { | |
186 | if (pattern.length() == 0) { | |
187 | return TRUE; | |
188 | } | |
189 | UChar32 c = pattern.char32At(0); | |
190 | const UnicodeMatcher *m = data->lookupMatcher(c); | |
191 | return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); | |
192 | } | |
193 | ||
194 | /** | |
195 | * Implement UnicodeMatcher | |
196 | */ | |
197 | void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { | |
198 | UChar32 ch; | |
199 | for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) { | |
374ca955 A |
200 | ch = pattern.char32At(i); |
201 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); | |
202 | if (matcher == NULL) { | |
203 | toUnionTo.add(ch); | |
204 | } else { | |
205 | matcher->addMatchSetTo(toUnionTo); | |
206 | } | |
b75a7d8f A |
207 | } |
208 | } | |
209 | ||
210 | /** | |
211 | * UnicodeReplacer API | |
212 | */ | |
213 | int32_t StringMatcher::replace(Replaceable& text, | |
214 | int32_t start, | |
215 | int32_t limit, | |
374ca955 | 216 | int32_t& /*cursor*/) { |
b75a7d8f A |
217 | |
218 | int32_t outLen = 0; | |
219 | ||
220 | // Copy segment with out-of-band data | |
221 | int32_t dest = limit; | |
222 | // If there was no match, that means that a quantifier | |
223 | // matched zero-length. E.g., x (a)* y matched "xy". | |
224 | if (matchStart >= 0) { | |
225 | if (matchStart != matchLimit) { | |
226 | text.copy(matchStart, matchLimit, dest); | |
227 | outLen = matchLimit - matchStart; | |
228 | } | |
229 | } | |
230 | ||
231 | text.handleReplaceBetween(start, limit, EMPTY); // delete original text | |
232 | ||
233 | return outLen; | |
234 | } | |
235 | ||
236 | /** | |
237 | * UnicodeReplacer API | |
238 | */ | |
239 | UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, | |
374ca955 | 240 | UBool /*escapeUnprintable*/) const { |
b75a7d8f A |
241 | // assert(segmentNumber > 0); |
242 | rule.truncate(0); | |
243 | rule.append((UChar)0x0024 /*$*/); | |
244 | ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); | |
245 | return rule; | |
246 | } | |
247 | ||
248 | /** | |
249 | * Remove any match info. This must be called before performing a | |
250 | * set of matches with this segment. | |
251 | */ | |
252 | void StringMatcher::resetMatch() { | |
253 | matchStart = matchLimit = -1; | |
254 | } | |
255 | ||
256 | /** | |
257 | * Union the set of all characters that may output by this object | |
258 | * into the given set. | |
259 | * @param toUnionTo the set into which to union the output characters | |
260 | */ | |
374ca955 | 261 | void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { |
b75a7d8f A |
262 | // The output of this replacer varies; it is the source text between |
263 | // matchStart and matchLimit. Since this varies depending on the | |
264 | // input text, we can't compute it here. We can either do nothing | |
265 | // or we can add ALL characters to the set. It's probably more useful | |
266 | // to do nothing. | |
267 | } | |
268 | ||
269 | /** | |
270 | * Implement UnicodeFunctor | |
271 | */ | |
272 | void StringMatcher::setData(const TransliterationRuleData* d) { | |
273 | data = d; | |
274 | int32_t i = 0; | |
275 | while (i<pattern.length()) { | |
276 | UChar32 c = pattern.char32At(i); | |
277 | UnicodeFunctor* f = data->lookup(c); | |
278 | if (f != NULL) { | |
279 | f->setData(data); | |
280 | } | |
281 | i += UTF_CHAR_LENGTH(c); | |
282 | } | |
283 | } | |
284 | ||
285 | U_NAMESPACE_END | |
286 | ||
287 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
288 | ||
289 | //eof |