]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/strrepl.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / strrepl.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2002-2004, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 01/21/2002 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "strrepl.h"
16 #include "rbt_data.h"
17 #include "util.h"
18 #include "unicode/uniset.h"
19
20 U_NAMESPACE_BEGIN
21
22 static const UChar EMPTY[] = { 0 }; // empty string: ""
23
24 UnicodeReplacer::~UnicodeReplacer() {}
25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
26
27 /**
28 * Construct a StringReplacer that sets the emits the given output
29 * text and sets the cursor to the given position.
30 * @param theOutput text that will replace input text when the
31 * replace() method is called. May contain stand-in characters
32 * that represent nested replacers.
33 * @param theCursorPos cursor position that will be returned by
34 * the replace() method
35 * @param theData transliterator context object that translates
36 * stand-in characters to UnicodeReplacer objects
37 */
38 StringReplacer::StringReplacer(const UnicodeString& theOutput,
39 int32_t theCursorPos,
40 const TransliterationRuleData* theData) {
41 output = theOutput;
42 cursorPos = theCursorPos;
43 hasCursor = TRUE;
44 data = theData;
45 isComplex = TRUE;
46 }
47
48 /**
49 * Construct a StringReplacer that sets the emits the given output
50 * text and does not modify the cursor.
51 * @param theOutput text that will replace input text when the
52 * replace() method is called. May contain stand-in characters
53 * that represent nested replacers.
54 * @param theData transliterator context object that translates
55 * stand-in characters to UnicodeReplacer objects
56 */
57 StringReplacer::StringReplacer(const UnicodeString& theOutput,
58 const TransliterationRuleData* theData) {
59 output = theOutput;
60 cursorPos = 0;
61 hasCursor = FALSE;
62 data = theData;
63 isComplex = TRUE;
64 }
65
66 /**
67 * Copy constructor.
68 */
69 StringReplacer::StringReplacer(const StringReplacer& other) :
70 UnicodeFunctor(other),
71 UnicodeReplacer(other)
72 {
73 output = other.output;
74 cursorPos = other.cursorPos;
75 hasCursor = other.hasCursor;
76 data = other.data;
77 isComplex = other.isComplex;
78 }
79
80 /**
81 * Destructor
82 */
83 StringReplacer::~StringReplacer() {
84 }
85
86 /**
87 * Implement UnicodeFunctor
88 */
89 UnicodeFunctor* StringReplacer::clone() const {
90 return new StringReplacer(*this);
91 }
92
93 /**
94 * Implement UnicodeFunctor
95 */
96 UnicodeReplacer* StringReplacer::toReplacer() const {
97 return (UnicodeReplacer*) this;
98 }
99
100 /**
101 * UnicodeReplacer API
102 */
103 int32_t StringReplacer::replace(Replaceable& text,
104 int32_t start,
105 int32_t limit,
106 int32_t& cursor) {
107 int32_t outLen;
108 int32_t newStart = 0;
109
110 // NOTE: It should be possible to _always_ run the complex
111 // processing code; just slower. If not, then there is a bug
112 // in the complex processing code.
113
114 // Simple (no nested replacers) Processing Code :
115 if (!isComplex) {
116 text.handleReplaceBetween(start, limit, output);
117 outLen = output.length();
118
119 // Setup default cursor position (for cursorPos within output)
120 newStart = cursorPos;
121 }
122
123 // Complex (nested replacers) Processing Code :
124 else {
125 /* When there are segments to be copied, use the Replaceable.copy()
126 * API in order to retain out-of-band data. Copy everything to the
127 * end of the string, then copy them back over the key. This preserves
128 * the integrity of indices into the key and surrounding context while
129 * generating the output text.
130 */
131 UnicodeString buf;
132 int32_t oOutput; // offset into 'output'
133 isComplex = FALSE;
134
135 // The temporary buffer starts at tempStart, and extends
136 // to destLimit. The start of the buffer has a single
137 // character from before the key. This provides style
138 // data when addition characters are filled into the
139 // temporary buffer. If there is nothing to the left, use
140 // the non-character U+FFFF, which Replaceable subclasses
141 // should treat specially as a "no-style character."
142 // destStart points to the point after the style context
143 // character, so it is tempStart+1 or tempStart+2.
144 int32_t tempStart = text.length(); // start of temp buffer
145 int32_t destStart = tempStart; // copy new text to here
146 if (start > 0) {
147 int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
148 text.copy(start-len, start, tempStart);
149 destStart += len;
150 } else {
151 UnicodeString str((UChar) 0xFFFF);
152 text.handleReplaceBetween(tempStart, tempStart, str);
153 destStart++;
154 }
155 int32_t destLimit = destStart;
156
157 for (oOutput=0; oOutput<output.length(); ) {
158 if (oOutput == cursorPos) {
159 // Record the position of the cursor
160 newStart = destLimit - destStart; // relative to start
161 }
162 UChar32 c = output.char32At(oOutput);
163 UnicodeReplacer* r = data->lookupReplacer(c);
164 if (r == NULL) {
165 // Accumulate straight (non-segment) text.
166 buf.append(c);
167 } else {
168 isComplex = TRUE;
169
170 // Insert any accumulated straight text.
171 if (buf.length() > 0) {
172 text.handleReplaceBetween(destLimit, destLimit, buf);
173 destLimit += buf.length();
174 buf.truncate(0);
175 }
176
177 // Delegate output generation to replacer object
178 int32_t len = r->replace(text, destLimit, destLimit, cursor);
179 destLimit += len;
180 }
181 oOutput += UTF_CHAR_LENGTH(c);
182 }
183 // Insert any accumulated straight text.
184 if (buf.length() > 0) {
185 text.handleReplaceBetween(destLimit, destLimit, buf);
186 destLimit += buf.length();
187 }
188 if (oOutput == cursorPos) {
189 // Record the position of the cursor
190 newStart = destLimit - destStart; // relative to start
191 }
192
193 outLen = destLimit - destStart;
194
195 // Copy new text to start, and delete it
196 text.copy(destStart, destLimit, start);
197 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);
198
199 // Delete the old text (the key)
200 text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
201 }
202
203 if (hasCursor) {
204 // Adjust the cursor for positions outside the key. These
205 // refer to code points rather than code units. If cursorPos
206 // is within the output string, then use newStart, which has
207 // already been set above.
208 if (cursorPos < 0) {
209 newStart = start;
210 int32_t n = cursorPos;
211 // Outside the output string, cursorPos counts code points
212 while (n < 0 && newStart > 0) {
213 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
214 ++n;
215 }
216 newStart += n;
217 } else if (cursorPos > output.length()) {
218 newStart = start + outLen;
219 int32_t n = cursorPos - output.length();
220 // Outside the output string, cursorPos counts code points
221 while (n > 0 && newStart < text.length()) {
222 newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
223 --n;
224 }
225 newStart += n;
226 } else {
227 // Cursor is within output string. It has been set up above
228 // to be relative to start.
229 newStart += start;
230 }
231
232 cursor = newStart;
233 }
234
235 return outLen;
236 }
237
238 /**
239 * UnicodeReplacer API
240 */
241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
242 UBool escapeUnprintable) const {
243 rule.truncate(0);
244 UnicodeString quoteBuf;
245
246 int32_t cursor = cursorPos;
247
248 // Handle a cursor preceding the output
249 if (hasCursor && cursor < 0) {
250 while (cursor++ < 0) {
251 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
252 }
253 // Fall through and append '|' below
254 }
255
256 for (int32_t i=0; i<output.length(); ++i) {
257 if (hasCursor && i == cursor) {
258 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
259 }
260 UChar c = output.charAt(i); // Ok to use 16-bits here
261
262 UnicodeReplacer* r = data->lookupReplacer(c);
263 if (r == NULL) {
264 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
265 } else {
266 UnicodeString buf;
267 r->toReplacerPattern(buf, escapeUnprintable);
268 buf.insert(0, (UChar)0x20);
269 buf.append((UChar)0x20);
270 ICU_Utility::appendToRule(rule, buf,
271 TRUE, escapeUnprintable, quoteBuf);
272 }
273 }
274
275 // Handle a cursor after the output. Use > rather than >= because
276 // if cursor == output.length() it is at the end of the output,
277 // which is the default position, so we need not emit it.
278 if (hasCursor && cursor > output.length()) {
279 cursor -= output.length();
280 while (cursor-- > 0) {
281 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
282 }
283 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
284 }
285 // Flush quoteBuf out to result
286 ICU_Utility::appendToRule(rule, -1,
287 TRUE, escapeUnprintable, quoteBuf);
288
289 return rule;
290 }
291
292 /**
293 * Implement UnicodeReplacer
294 */
295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
296 UChar32 ch;
297 for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
298 ch = output.char32At(i);
299 UnicodeReplacer* r = data->lookupReplacer(ch);
300 if (r == NULL) {
301 toUnionTo.add(ch);
302 } else {
303 r->addReplacementSetTo(toUnionTo);
304 }
305 }
306 }
307
308 /**
309 * UnicodeFunctor API
310 */
311 void StringReplacer::setData(const TransliterationRuleData* d) {
312 data = d;
313 int32_t i = 0;
314 while (i<output.length()) {
315 UChar32 c = output.char32At(i);
316 UnicodeFunctor* f = data->lookup(c);
317 if (f != NULL) {
318 f->setData(data);
319 }
320 i += UTF_CHAR_LENGTH(c);
321 }
322 }
323
324 U_NAMESPACE_END
325
326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
327
328 //eof