]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/strrepl.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / strrepl.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
51004dcb 5* Copyright (c) 2002-2012, International Business Machines Corporation
b75a7d8f
A
6* and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 01/21/2002 aliu Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
4388f060
A
17#include "unicode/uniset.h"
18#include "unicode/utf16.h"
b75a7d8f
A
19#include "strrepl.h"
20#include "rbt_data.h"
21#include "util.h"
b75a7d8f
A
22
23U_NAMESPACE_BEGIN
24
374ca955
A
25UnicodeReplacer::~UnicodeReplacer() {}
26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
b75a7d8f
A
27
28/**
29 * Construct a StringReplacer that sets the emits the given output
30 * text and sets the cursor to the given position.
31 * @param theOutput text that will replace input text when the
32 * replace() method is called. May contain stand-in characters
33 * that represent nested replacers.
34 * @param theCursorPos cursor position that will be returned by
35 * the replace() method
36 * @param theData transliterator context object that translates
37 * stand-in characters to UnicodeReplacer objects
38 */
39StringReplacer::StringReplacer(const UnicodeString& theOutput,
40 int32_t theCursorPos,
41 const TransliterationRuleData* theData) {
42 output = theOutput;
43 cursorPos = theCursorPos;
44 hasCursor = TRUE;
45 data = theData;
46 isComplex = TRUE;
47}
48
49/**
50 * Construct a StringReplacer that sets the emits the given output
51 * text and does not modify the cursor.
52 * @param theOutput text that will replace input text when the
53 * replace() method is called. May contain stand-in characters
54 * that represent nested replacers.
55 * @param theData transliterator context object that translates
56 * stand-in characters to UnicodeReplacer objects
57 */
58StringReplacer::StringReplacer(const UnicodeString& theOutput,
59 const TransliterationRuleData* theData) {
60 output = theOutput;
61 cursorPos = 0;
62 hasCursor = FALSE;
63 data = theData;
64 isComplex = TRUE;
65}
66
67/**
68 * Copy constructor.
69 */
374ca955
A
70StringReplacer::StringReplacer(const StringReplacer& other) :
71 UnicodeFunctor(other),
72 UnicodeReplacer(other)
73{
b75a7d8f
A
74 output = other.output;
75 cursorPos = other.cursorPos;
76 hasCursor = other.hasCursor;
77 data = other.data;
78 isComplex = other.isComplex;
79}
80
81/**
82 * Destructor
83 */
84StringReplacer::~StringReplacer() {
85}
86
87/**
88 * Implement UnicodeFunctor
89 */
90UnicodeFunctor* StringReplacer::clone() const {
91 return new StringReplacer(*this);
92}
93
94/**
95 * Implement UnicodeFunctor
96 */
97UnicodeReplacer* StringReplacer::toReplacer() const {
51004dcb 98 return const_cast<StringReplacer *>(this);
b75a7d8f
A
99}
100
101/**
102 * UnicodeReplacer API
103 */
104int32_t StringReplacer::replace(Replaceable& text,
105 int32_t start,
106 int32_t limit,
107 int32_t& cursor) {
108 int32_t outLen;
109 int32_t newStart = 0;
110
111 // NOTE: It should be possible to _always_ run the complex
112 // processing code; just slower. If not, then there is a bug
113 // in the complex processing code.
114
115 // Simple (no nested replacers) Processing Code :
116 if (!isComplex) {
117 text.handleReplaceBetween(start, limit, output);
118 outLen = output.length();
119
120 // Setup default cursor position (for cursorPos within output)
121 newStart = cursorPos;
122 }
123
124 // Complex (nested replacers) Processing Code :
125 else {
126 /* When there are segments to be copied, use the Replaceable.copy()
127 * API in order to retain out-of-band data. Copy everything to the
128 * end of the string, then copy them back over the key. This preserves
129 * the integrity of indices into the key and surrounding context while
130 * generating the output text.
131 */
132 UnicodeString buf;
133 int32_t oOutput; // offset into 'output'
134 isComplex = FALSE;
135
136 // The temporary buffer starts at tempStart, and extends
137 // to destLimit. The start of the buffer has a single
138 // character from before the key. This provides style
139 // data when addition characters are filled into the
140 // temporary buffer. If there is nothing to the left, use
141 // the non-character U+FFFF, which Replaceable subclasses
142 // should treat specially as a "no-style character."
143 // destStart points to the point after the style context
144 // character, so it is tempStart+1 or tempStart+2.
145 int32_t tempStart = text.length(); // start of temp buffer
146 int32_t destStart = tempStart; // copy new text to here
147 if (start > 0) {
4388f060 148 int32_t len = U16_LENGTH(text.char32At(start-1));
b75a7d8f
A
149 text.copy(start-len, start, tempStart);
150 destStart += len;
151 } else {
152 UnicodeString str((UChar) 0xFFFF);
153 text.handleReplaceBetween(tempStart, tempStart, str);
154 destStart++;
155 }
156 int32_t destLimit = destStart;
157
158 for (oOutput=0; oOutput<output.length(); ) {
159 if (oOutput == cursorPos) {
160 // Record the position of the cursor
161 newStart = destLimit - destStart; // relative to start
162 }
163 UChar32 c = output.char32At(oOutput);
164 UnicodeReplacer* r = data->lookupReplacer(c);
165 if (r == NULL) {
166 // Accumulate straight (non-segment) text.
167 buf.append(c);
168 } else {
169 isComplex = TRUE;
170
171 // Insert any accumulated straight text.
172 if (buf.length() > 0) {
173 text.handleReplaceBetween(destLimit, destLimit, buf);
174 destLimit += buf.length();
175 buf.truncate(0);
176 }
177
178 // Delegate output generation to replacer object
179 int32_t len = r->replace(text, destLimit, destLimit, cursor);
180 destLimit += len;
181 }
4388f060 182 oOutput += U16_LENGTH(c);
b75a7d8f
A
183 }
184 // Insert any accumulated straight text.
185 if (buf.length() > 0) {
186 text.handleReplaceBetween(destLimit, destLimit, buf);
187 destLimit += buf.length();
188 }
189 if (oOutput == cursorPos) {
190 // Record the position of the cursor
191 newStart = destLimit - destStart; // relative to start
192 }
193
194 outLen = destLimit - destStart;
195
196 // Copy new text to start, and delete it
197 text.copy(destStart, destLimit, start);
4388f060 198 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
b75a7d8f
A
199
200 // Delete the old text (the key)
4388f060 201 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
b75a7d8f
A
202 }
203
204 if (hasCursor) {
205 // Adjust the cursor for positions outside the key. These
206 // refer to code points rather than code units. If cursorPos
207 // is within the output string, then use newStart, which has
208 // already been set above.
209 if (cursorPos < 0) {
210 newStart = start;
211 int32_t n = cursorPos;
212 // Outside the output string, cursorPos counts code points
213 while (n < 0 && newStart > 0) {
4388f060 214 newStart -= U16_LENGTH(text.char32At(newStart-1));
b75a7d8f
A
215 ++n;
216 }
217 newStart += n;
218 } else if (cursorPos > output.length()) {
219 newStart = start + outLen;
220 int32_t n = cursorPos - output.length();
221 // Outside the output string, cursorPos counts code points
222 while (n > 0 && newStart < text.length()) {
4388f060 223 newStart += U16_LENGTH(text.char32At(newStart));
b75a7d8f
A
224 --n;
225 }
226 newStart += n;
227 } else {
228 // Cursor is within output string. It has been set up above
229 // to be relative to start.
230 newStart += start;
231 }
232
233 cursor = newStart;
234 }
235
236 return outLen;
237}
238
239/**
240 * UnicodeReplacer API
241 */
242UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
243 UBool escapeUnprintable) const {
244 rule.truncate(0);
245 UnicodeString quoteBuf;
246
247 int32_t cursor = cursorPos;
248
249 // Handle a cursor preceding the output
250 if (hasCursor && cursor < 0) {
251 while (cursor++ < 0) {
252 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
253 }
254 // Fall through and append '|' below
255 }
256
257 for (int32_t i=0; i<output.length(); ++i) {
258 if (hasCursor && i == cursor) {
259 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
260 }
261 UChar c = output.charAt(i); // Ok to use 16-bits here
262
263 UnicodeReplacer* r = data->lookupReplacer(c);
264 if (r == NULL) {
265 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
266 } else {
267 UnicodeString buf;
268 r->toReplacerPattern(buf, escapeUnprintable);
269 buf.insert(0, (UChar)0x20);
270 buf.append((UChar)0x20);
271 ICU_Utility::appendToRule(rule, buf,
272 TRUE, escapeUnprintable, quoteBuf);
273 }
274 }
275
276 // Handle a cursor after the output. Use > rather than >= because
277 // if cursor == output.length() it is at the end of the output,
278 // which is the default position, so we need not emit it.
279 if (hasCursor && cursor > output.length()) {
280 cursor -= output.length();
281 while (cursor-- > 0) {
282 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
283 }
284 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
285 }
286 // Flush quoteBuf out to result
287 ICU_Utility::appendToRule(rule, -1,
288 TRUE, escapeUnprintable, quoteBuf);
289
290 return rule;
291}
292
293/**
294 * Implement UnicodeReplacer
295 */
296void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
297 UChar32 ch;
4388f060 298 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
374ca955
A
299 ch = output.char32At(i);
300 UnicodeReplacer* r = data->lookupReplacer(ch);
301 if (r == NULL) {
302 toUnionTo.add(ch);
303 } else {
304 r->addReplacementSetTo(toUnionTo);
305 }
b75a7d8f
A
306 }
307}
308
309/**
310 * UnicodeFunctor API
311 */
312void StringReplacer::setData(const TransliterationRuleData* d) {
313 data = d;
314 int32_t i = 0;
315 while (i<output.length()) {
316 UChar32 c = output.char32At(i);
317 UnicodeFunctor* f = data->lookup(c);
318 if (f != NULL) {
319 f->setData(data);
320 }
4388f060 321 i += U16_LENGTH(c);
b75a7d8f
A
322 }
323}
324
325U_NAMESPACE_END
326
327#endif /* #if !UCONFIG_NO_TRANSLITERATION */
328
329//eof