]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ********************************************************************** | |
51004dcb | 5 | * Copyright (c) 2002-2012, International Business Machines Corporation |
b75a7d8f A |
6 | * and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 01/21/2002 aliu Creation. | |
10 | ********************************************************************** | |
11 | */ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION | |
16 | ||
4388f060 A |
17 | #include "unicode/uniset.h" |
18 | #include "unicode/utf16.h" | |
b75a7d8f A |
19 | #include "strrepl.h" |
20 | #include "rbt_data.h" | |
21 | #include "util.h" | |
b75a7d8f A |
22 | |
23 | U_NAMESPACE_BEGIN | |
24 | ||
374ca955 A |
25 | UnicodeReplacer::~UnicodeReplacer() {} |
26 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) | |
b75a7d8f A |
27 | |
28 | /** | |
29 | * Construct a StringReplacer that sets the emits the given output | |
30 | * text and sets the cursor to the given position. | |
31 | * @param theOutput text that will replace input text when the | |
32 | * replace() method is called. May contain stand-in characters | |
33 | * that represent nested replacers. | |
34 | * @param theCursorPos cursor position that will be returned by | |
35 | * the replace() method | |
36 | * @param theData transliterator context object that translates | |
37 | * stand-in characters to UnicodeReplacer objects | |
38 | */ | |
39 | StringReplacer::StringReplacer(const UnicodeString& theOutput, | |
40 | int32_t theCursorPos, | |
41 | const TransliterationRuleData* theData) { | |
42 | output = theOutput; | |
43 | cursorPos = theCursorPos; | |
44 | hasCursor = TRUE; | |
45 | data = theData; | |
46 | isComplex = TRUE; | |
47 | } | |
48 | ||
49 | /** | |
50 | * Construct a StringReplacer that sets the emits the given output | |
51 | * text and does not modify the cursor. | |
52 | * @param theOutput text that will replace input text when the | |
53 | * replace() method is called. May contain stand-in characters | |
54 | * that represent nested replacers. | |
55 | * @param theData transliterator context object that translates | |
56 | * stand-in characters to UnicodeReplacer objects | |
57 | */ | |
58 | StringReplacer::StringReplacer(const UnicodeString& theOutput, | |
59 | const TransliterationRuleData* theData) { | |
60 | output = theOutput; | |
61 | cursorPos = 0; | |
62 | hasCursor = FALSE; | |
63 | data = theData; | |
64 | isComplex = TRUE; | |
65 | } | |
66 | ||
67 | /** | |
68 | * Copy constructor. | |
69 | */ | |
374ca955 A |
70 | StringReplacer::StringReplacer(const StringReplacer& other) : |
71 | UnicodeFunctor(other), | |
72 | UnicodeReplacer(other) | |
73 | { | |
b75a7d8f A |
74 | output = other.output; |
75 | cursorPos = other.cursorPos; | |
76 | hasCursor = other.hasCursor; | |
77 | data = other.data; | |
78 | isComplex = other.isComplex; | |
79 | } | |
80 | ||
81 | /** | |
82 | * Destructor | |
83 | */ | |
84 | StringReplacer::~StringReplacer() { | |
85 | } | |
86 | ||
87 | /** | |
88 | * Implement UnicodeFunctor | |
89 | */ | |
340931cb | 90 | StringReplacer* StringReplacer::clone() const { |
b75a7d8f A |
91 | return new StringReplacer(*this); |
92 | } | |
93 | ||
94 | /** | |
95 | * Implement UnicodeFunctor | |
96 | */ | |
97 | UnicodeReplacer* StringReplacer::toReplacer() const { | |
51004dcb | 98 | return const_cast<StringReplacer *>(this); |
b75a7d8f A |
99 | } |
100 | ||
101 | /** | |
102 | * UnicodeReplacer API | |
103 | */ | |
104 | int32_t StringReplacer::replace(Replaceable& text, | |
105 | int32_t start, | |
106 | int32_t limit, | |
107 | int32_t& cursor) { | |
108 | int32_t outLen; | |
109 | int32_t newStart = 0; | |
110 | ||
111 | // NOTE: It should be possible to _always_ run the complex | |
112 | // processing code; just slower. If not, then there is a bug | |
113 | // in the complex processing code. | |
114 | ||
115 | // Simple (no nested replacers) Processing Code : | |
116 | if (!isComplex) { | |
117 | text.handleReplaceBetween(start, limit, output); | |
118 | outLen = output.length(); | |
119 | ||
120 | // Setup default cursor position (for cursorPos within output) | |
121 | newStart = cursorPos; | |
122 | } | |
123 | ||
124 | // Complex (nested replacers) Processing Code : | |
125 | else { | |
126 | /* When there are segments to be copied, use the Replaceable.copy() | |
127 | * API in order to retain out-of-band data. Copy everything to the | |
128 | * end of the string, then copy them back over the key. This preserves | |
129 | * the integrity of indices into the key and surrounding context while | |
130 | * generating the output text. | |
131 | */ | |
132 | UnicodeString buf; | |
133 | int32_t oOutput; // offset into 'output' | |
134 | isComplex = FALSE; | |
135 | ||
136 | // The temporary buffer starts at tempStart, and extends | |
137 | // to destLimit. The start of the buffer has a single | |
138 | // character from before the key. This provides style | |
139 | // data when addition characters are filled into the | |
140 | // temporary buffer. If there is nothing to the left, use | |
141 | // the non-character U+FFFF, which Replaceable subclasses | |
142 | // should treat specially as a "no-style character." | |
143 | // destStart points to the point after the style context | |
144 | // character, so it is tempStart+1 or tempStart+2. | |
145 | int32_t tempStart = text.length(); // start of temp buffer | |
146 | int32_t destStart = tempStart; // copy new text to here | |
147 | if (start > 0) { | |
4388f060 | 148 | int32_t len = U16_LENGTH(text.char32At(start-1)); |
b75a7d8f A |
149 | text.copy(start-len, start, tempStart); |
150 | destStart += len; | |
151 | } else { | |
152 | UnicodeString str((UChar) 0xFFFF); | |
153 | text.handleReplaceBetween(tempStart, tempStart, str); | |
154 | destStart++; | |
155 | } | |
156 | int32_t destLimit = destStart; | |
157 | ||
158 | for (oOutput=0; oOutput<output.length(); ) { | |
159 | if (oOutput == cursorPos) { | |
160 | // Record the position of the cursor | |
161 | newStart = destLimit - destStart; // relative to start | |
162 | } | |
163 | UChar32 c = output.char32At(oOutput); | |
164 | UnicodeReplacer* r = data->lookupReplacer(c); | |
165 | if (r == NULL) { | |
166 | // Accumulate straight (non-segment) text. | |
167 | buf.append(c); | |
168 | } else { | |
169 | isComplex = TRUE; | |
170 | ||
171 | // Insert any accumulated straight text. | |
172 | if (buf.length() > 0) { | |
173 | text.handleReplaceBetween(destLimit, destLimit, buf); | |
174 | destLimit += buf.length(); | |
175 | buf.truncate(0); | |
176 | } | |
177 | ||
178 | // Delegate output generation to replacer object | |
179 | int32_t len = r->replace(text, destLimit, destLimit, cursor); | |
180 | destLimit += len; | |
181 | } | |
4388f060 | 182 | oOutput += U16_LENGTH(c); |
b75a7d8f A |
183 | } |
184 | // Insert any accumulated straight text. | |
185 | if (buf.length() > 0) { | |
186 | text.handleReplaceBetween(destLimit, destLimit, buf); | |
187 | destLimit += buf.length(); | |
188 | } | |
189 | if (oOutput == cursorPos) { | |
190 | // Record the position of the cursor | |
191 | newStart = destLimit - destStart; // relative to start | |
192 | } | |
193 | ||
194 | outLen = destLimit - destStart; | |
195 | ||
196 | // Copy new text to start, and delete it | |
197 | text.copy(destStart, destLimit, start); | |
4388f060 | 198 | text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); |
b75a7d8f A |
199 | |
200 | // Delete the old text (the key) | |
4388f060 | 201 | text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); |
b75a7d8f A |
202 | } |
203 | ||
204 | if (hasCursor) { | |
205 | // Adjust the cursor for positions outside the key. These | |
206 | // refer to code points rather than code units. If cursorPos | |
207 | // is within the output string, then use newStart, which has | |
208 | // already been set above. | |
209 | if (cursorPos < 0) { | |
210 | newStart = start; | |
211 | int32_t n = cursorPos; | |
212 | // Outside the output string, cursorPos counts code points | |
213 | while (n < 0 && newStart > 0) { | |
4388f060 | 214 | newStart -= U16_LENGTH(text.char32At(newStart-1)); |
b75a7d8f A |
215 | ++n; |
216 | } | |
217 | newStart += n; | |
218 | } else if (cursorPos > output.length()) { | |
219 | newStart = start + outLen; | |
220 | int32_t n = cursorPos - output.length(); | |
221 | // Outside the output string, cursorPos counts code points | |
222 | while (n > 0 && newStart < text.length()) { | |
4388f060 | 223 | newStart += U16_LENGTH(text.char32At(newStart)); |
b75a7d8f A |
224 | --n; |
225 | } | |
226 | newStart += n; | |
227 | } else { | |
228 | // Cursor is within output string. It has been set up above | |
229 | // to be relative to start. | |
230 | newStart += start; | |
231 | } | |
232 | ||
233 | cursor = newStart; | |
234 | } | |
235 | ||
236 | return outLen; | |
237 | } | |
238 | ||
239 | /** | |
240 | * UnicodeReplacer API | |
241 | */ | |
242 | UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, | |
243 | UBool escapeUnprintable) const { | |
244 | rule.truncate(0); | |
245 | UnicodeString quoteBuf; | |
246 | ||
247 | int32_t cursor = cursorPos; | |
248 | ||
249 | // Handle a cursor preceding the output | |
250 | if (hasCursor && cursor < 0) { | |
251 | while (cursor++ < 0) { | |
252 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); | |
253 | } | |
254 | // Fall through and append '|' below | |
255 | } | |
256 | ||
257 | for (int32_t i=0; i<output.length(); ++i) { | |
258 | if (hasCursor && i == cursor) { | |
259 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); | |
260 | } | |
261 | UChar c = output.charAt(i); // Ok to use 16-bits here | |
262 | ||
263 | UnicodeReplacer* r = data->lookupReplacer(c); | |
264 | if (r == NULL) { | |
265 | ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); | |
266 | } else { | |
267 | UnicodeString buf; | |
268 | r->toReplacerPattern(buf, escapeUnprintable); | |
269 | buf.insert(0, (UChar)0x20); | |
270 | buf.append((UChar)0x20); | |
271 | ICU_Utility::appendToRule(rule, buf, | |
272 | TRUE, escapeUnprintable, quoteBuf); | |
273 | } | |
274 | } | |
275 | ||
276 | // Handle a cursor after the output. Use > rather than >= because | |
277 | // if cursor == output.length() it is at the end of the output, | |
278 | // which is the default position, so we need not emit it. | |
279 | if (hasCursor && cursor > output.length()) { | |
280 | cursor -= output.length(); | |
281 | while (cursor-- > 0) { | |
282 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); | |
283 | } | |
284 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); | |
285 | } | |
286 | // Flush quoteBuf out to result | |
287 | ICU_Utility::appendToRule(rule, -1, | |
288 | TRUE, escapeUnprintable, quoteBuf); | |
289 | ||
290 | return rule; | |
291 | } | |
292 | ||
293 | /** | |
294 | * Implement UnicodeReplacer | |
295 | */ | |
296 | void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { | |
297 | UChar32 ch; | |
4388f060 | 298 | for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { |
374ca955 A |
299 | ch = output.char32At(i); |
300 | UnicodeReplacer* r = data->lookupReplacer(ch); | |
301 | if (r == NULL) { | |
302 | toUnionTo.add(ch); | |
303 | } else { | |
304 | r->addReplacementSetTo(toUnionTo); | |
305 | } | |
b75a7d8f A |
306 | } |
307 | } | |
308 | ||
309 | /** | |
310 | * UnicodeFunctor API | |
311 | */ | |
312 | void StringReplacer::setData(const TransliterationRuleData* d) { | |
313 | data = d; | |
314 | int32_t i = 0; | |
315 | while (i<output.length()) { | |
316 | UChar32 c = output.char32At(i); | |
317 | UnicodeFunctor* f = data->lookup(c); | |
318 | if (f != NULL) { | |
319 | f->setData(data); | |
320 | } | |
4388f060 | 321 | i += U16_LENGTH(c); |
b75a7d8f A |
322 | } |
323 | } | |
324 | ||
325 | U_NAMESPACE_END | |
326 | ||
327 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
328 | ||
329 | //eof |