]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/strrepl.cpp
ICU-511.27.tar.gz
[apple/icu.git] / icuSources / i18n / strrepl.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2002-2012, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 01/21/2002 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uniset.h"
16 #include "unicode/utf16.h"
17 #include "strrepl.h"
18 #include "rbt_data.h"
19 #include "util.h"
20
21 U_NAMESPACE_BEGIN
22
23 UnicodeReplacer::~UnicodeReplacer() {}
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
25
26 /**
27 * Construct a StringReplacer that sets the emits the given output
28 * text and sets the cursor to the given position.
29 * @param theOutput text that will replace input text when the
30 * replace() method is called. May contain stand-in characters
31 * that represent nested replacers.
32 * @param theCursorPos cursor position that will be returned by
33 * the replace() method
34 * @param theData transliterator context object that translates
35 * stand-in characters to UnicodeReplacer objects
36 */
37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
38 int32_t theCursorPos,
39 const TransliterationRuleData* theData) {
40 output = theOutput;
41 cursorPos = theCursorPos;
42 hasCursor = TRUE;
43 data = theData;
44 isComplex = TRUE;
45 }
46
47 /**
48 * Construct a StringReplacer that sets the emits the given output
49 * text and does not modify the cursor.
50 * @param theOutput text that will replace input text when the
51 * replace() method is called. May contain stand-in characters
52 * that represent nested replacers.
53 * @param theData transliterator context object that translates
54 * stand-in characters to UnicodeReplacer objects
55 */
56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
57 const TransliterationRuleData* theData) {
58 output = theOutput;
59 cursorPos = 0;
60 hasCursor = FALSE;
61 data = theData;
62 isComplex = TRUE;
63 }
64
65 /**
66 * Copy constructor.
67 */
68 StringReplacer::StringReplacer(const StringReplacer& other) :
69 UnicodeFunctor(other),
70 UnicodeReplacer(other)
71 {
72 output = other.output;
73 cursorPos = other.cursorPos;
74 hasCursor = other.hasCursor;
75 data = other.data;
76 isComplex = other.isComplex;
77 }
78
79 /**
80 * Destructor
81 */
82 StringReplacer::~StringReplacer() {
83 }
84
85 /**
86 * Implement UnicodeFunctor
87 */
88 UnicodeFunctor* StringReplacer::clone() const {
89 return new StringReplacer(*this);
90 }
91
92 /**
93 * Implement UnicodeFunctor
94 */
95 UnicodeReplacer* StringReplacer::toReplacer() const {
96 return const_cast<StringReplacer *>(this);
97 }
98
99 /**
100 * UnicodeReplacer API
101 */
102 int32_t StringReplacer::replace(Replaceable& text,
103 int32_t start,
104 int32_t limit,
105 int32_t& cursor) {
106 int32_t outLen;
107 int32_t newStart = 0;
108
109 // NOTE: It should be possible to _always_ run the complex
110 // processing code; just slower. If not, then there is a bug
111 // in the complex processing code.
112
113 // Simple (no nested replacers) Processing Code :
114 if (!isComplex) {
115 text.handleReplaceBetween(start, limit, output);
116 outLen = output.length();
117
118 // Setup default cursor position (for cursorPos within output)
119 newStart = cursorPos;
120 }
121
122 // Complex (nested replacers) Processing Code :
123 else {
124 /* When there are segments to be copied, use the Replaceable.copy()
125 * API in order to retain out-of-band data. Copy everything to the
126 * end of the string, then copy them back over the key. This preserves
127 * the integrity of indices into the key and surrounding context while
128 * generating the output text.
129 */
130 UnicodeString buf;
131 int32_t oOutput; // offset into 'output'
132 isComplex = FALSE;
133
134 // The temporary buffer starts at tempStart, and extends
135 // to destLimit. The start of the buffer has a single
136 // character from before the key. This provides style
137 // data when addition characters are filled into the
138 // temporary buffer. If there is nothing to the left, use
139 // the non-character U+FFFF, which Replaceable subclasses
140 // should treat specially as a "no-style character."
141 // destStart points to the point after the style context
142 // character, so it is tempStart+1 or tempStart+2.
143 int32_t tempStart = text.length(); // start of temp buffer
144 int32_t destStart = tempStart; // copy new text to here
145 if (start > 0) {
146 int32_t len = U16_LENGTH(text.char32At(start-1));
147 text.copy(start-len, start, tempStart);
148 destStart += len;
149 } else {
150 UnicodeString str((UChar) 0xFFFF);
151 text.handleReplaceBetween(tempStart, tempStart, str);
152 destStart++;
153 }
154 int32_t destLimit = destStart;
155
156 for (oOutput=0; oOutput<output.length(); ) {
157 if (oOutput == cursorPos) {
158 // Record the position of the cursor
159 newStart = destLimit - destStart; // relative to start
160 }
161 UChar32 c = output.char32At(oOutput);
162 UnicodeReplacer* r = data->lookupReplacer(c);
163 if (r == NULL) {
164 // Accumulate straight (non-segment) text.
165 buf.append(c);
166 } else {
167 isComplex = TRUE;
168
169 // Insert any accumulated straight text.
170 if (buf.length() > 0) {
171 text.handleReplaceBetween(destLimit, destLimit, buf);
172 destLimit += buf.length();
173 buf.truncate(0);
174 }
175
176 // Delegate output generation to replacer object
177 int32_t len = r->replace(text, destLimit, destLimit, cursor);
178 destLimit += len;
179 }
180 oOutput += U16_LENGTH(c);
181 }
182 // Insert any accumulated straight text.
183 if (buf.length() > 0) {
184 text.handleReplaceBetween(destLimit, destLimit, buf);
185 destLimit += buf.length();
186 }
187 if (oOutput == cursorPos) {
188 // Record the position of the cursor
189 newStart = destLimit - destStart; // relative to start
190 }
191
192 outLen = destLimit - destStart;
193
194 // Copy new text to start, and delete it
195 text.copy(destStart, destLimit, start);
196 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
197
198 // Delete the old text (the key)
199 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
200 }
201
202 if (hasCursor) {
203 // Adjust the cursor for positions outside the key. These
204 // refer to code points rather than code units. If cursorPos
205 // is within the output string, then use newStart, which has
206 // already been set above.
207 if (cursorPos < 0) {
208 newStart = start;
209 int32_t n = cursorPos;
210 // Outside the output string, cursorPos counts code points
211 while (n < 0 && newStart > 0) {
212 newStart -= U16_LENGTH(text.char32At(newStart-1));
213 ++n;
214 }
215 newStart += n;
216 } else if (cursorPos > output.length()) {
217 newStart = start + outLen;
218 int32_t n = cursorPos - output.length();
219 // Outside the output string, cursorPos counts code points
220 while (n > 0 && newStart < text.length()) {
221 newStart += U16_LENGTH(text.char32At(newStart));
222 --n;
223 }
224 newStart += n;
225 } else {
226 // Cursor is within output string. It has been set up above
227 // to be relative to start.
228 newStart += start;
229 }
230
231 cursor = newStart;
232 }
233
234 return outLen;
235 }
236
237 /**
238 * UnicodeReplacer API
239 */
240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
241 UBool escapeUnprintable) const {
242 rule.truncate(0);
243 UnicodeString quoteBuf;
244
245 int32_t cursor = cursorPos;
246
247 // Handle a cursor preceding the output
248 if (hasCursor && cursor < 0) {
249 while (cursor++ < 0) {
250 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
251 }
252 // Fall through and append '|' below
253 }
254
255 for (int32_t i=0; i<output.length(); ++i) {
256 if (hasCursor && i == cursor) {
257 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
258 }
259 UChar c = output.charAt(i); // Ok to use 16-bits here
260
261 UnicodeReplacer* r = data->lookupReplacer(c);
262 if (r == NULL) {
263 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
264 } else {
265 UnicodeString buf;
266 r->toReplacerPattern(buf, escapeUnprintable);
267 buf.insert(0, (UChar)0x20);
268 buf.append((UChar)0x20);
269 ICU_Utility::appendToRule(rule, buf,
270 TRUE, escapeUnprintable, quoteBuf);
271 }
272 }
273
274 // Handle a cursor after the output. Use > rather than >= because
275 // if cursor == output.length() it is at the end of the output,
276 // which is the default position, so we need not emit it.
277 if (hasCursor && cursor > output.length()) {
278 cursor -= output.length();
279 while (cursor-- > 0) {
280 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
281 }
282 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
283 }
284 // Flush quoteBuf out to result
285 ICU_Utility::appendToRule(rule, -1,
286 TRUE, escapeUnprintable, quoteBuf);
287
288 return rule;
289 }
290
291 /**
292 * Implement UnicodeReplacer
293 */
294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
295 UChar32 ch;
296 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
297 ch = output.char32At(i);
298 UnicodeReplacer* r = data->lookupReplacer(ch);
299 if (r == NULL) {
300 toUnionTo.add(ch);
301 } else {
302 r->addReplacementSetTo(toUnionTo);
303 }
304 }
305 }
306
307 /**
308 * UnicodeFunctor API
309 */
310 void StringReplacer::setData(const TransliterationRuleData* d) {
311 data = d;
312 int32_t i = 0;
313 while (i<output.length()) {
314 UChar32 c = output.char32At(i);
315 UnicodeFunctor* f = data->lookup(c);
316 if (f != NULL) {
317 f->setData(data);
318 }
319 i += U16_LENGTH(c);
320 }
321 }
322
323 U_NAMESPACE_END
324
325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
326
327 //eof