]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/edits.cpp
ICU-59180.0.1.tar.gz
[apple/icu.git] / icuSources / common / edits.cpp
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // edits.cpp
5 // created: 2017feb08 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/edits.h"
9 #include "cmemory.h"
10 #include "uassert.h"
11
12 U_NAMESPACE_BEGIN
13
14 namespace {
15
16 // 0000uuuuuuuuuuuu records u+1 unchanged text units.
17 const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
18 const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
19
20 // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
21 // No length change.
22 const int32_t MAX_SHORT_WIDTH = 6;
23 const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
24 const int32_t MAX_SHORT_CHANGE = 0x6fff;
25
26 // 0111mmmmmmnnnnnn records a replacement of m text units with n.
27 // m or n = 61: actual length follows in the next edits array unit.
28 // m or n = 62..63: actual length follows in the next two edits array units.
29 // Bit 30 of the actual length is in the head unit.
30 // Trailing units have bit 15 set.
31 const int32_t LENGTH_IN_1TRAIL = 61;
32 const int32_t LENGTH_IN_2TRAIL = 62;
33
34 } // namespace
35
36 Edits::~Edits() {
37 if(array != stackArray) {
38 uprv_free(array);
39 }
40 }
41
42 void Edits::reset() {
43 length = delta = 0;
44 }
45
46 void Edits::addUnchanged(int32_t unchangedLength) {
47 if(U_FAILURE(errorCode) || unchangedLength == 0) { return; }
48 if(unchangedLength < 0) {
49 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
50 return;
51 }
52 // Merge into previous unchanged-text record, if any.
53 int32_t last = lastUnit();
54 if(last < MAX_UNCHANGED) {
55 int32_t remaining = MAX_UNCHANGED - last;
56 if (remaining >= unchangedLength) {
57 setLastUnit(last + unchangedLength);
58 return;
59 }
60 setLastUnit(MAX_UNCHANGED);
61 unchangedLength -= remaining;
62 }
63 // Split large lengths into multiple units.
64 while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
65 append(MAX_UNCHANGED);
66 unchangedLength -= MAX_UNCHANGED_LENGTH;
67 }
68 // Write a small (remaining) length.
69 if(unchangedLength > 0) {
70 append(unchangedLength - 1);
71 }
72 }
73
74 void Edits::addReplace(int32_t oldLength, int32_t newLength) {
75 if(U_FAILURE(errorCode)) { return; }
76 if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
77 // Replacement of short oldLength text units by same-length new text.
78 // Merge into previous short-replacement record, if any.
79 int32_t last = lastUnit();
80 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
81 (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
82 setLastUnit(last + 1);
83 return;
84 }
85 append(oldLength << 12);
86 return;
87 }
88
89 if(oldLength < 0 || newLength < 0) {
90 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
91 return;
92 }
93 if (oldLength == 0 && newLength == 0) {
94 return;
95 }
96 int32_t newDelta = newLength - oldLength;
97 if (newDelta != 0) {
98 if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
99 (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
100 // Integer overflow or underflow.
101 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
102 return;
103 }
104 delta += newDelta;
105 }
106
107 int32_t head = 0x7000;
108 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
109 head |= oldLength << 6;
110 head |= newLength;
111 append(head);
112 } else if ((capacity - length) >= 5 || growArray()) {
113 int32_t limit = length + 1;
114 if(oldLength < LENGTH_IN_1TRAIL) {
115 head |= oldLength << 6;
116 } else if(oldLength <= 0x7fff) {
117 head |= LENGTH_IN_1TRAIL << 6;
118 array[limit++] = (uint16_t)(0x8000 | oldLength);
119 } else {
120 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
121 array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));
122 array[limit++] = (uint16_t)(0x8000 | oldLength);
123 }
124 if(newLength < LENGTH_IN_1TRAIL) {
125 head |= newLength;
126 } else if(newLength <= 0x7fff) {
127 head |= LENGTH_IN_1TRAIL;
128 array[limit++] = (uint16_t)(0x8000 | newLength);
129 } else {
130 head |= LENGTH_IN_2TRAIL + (newLength >> 30);
131 array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));
132 array[limit++] = (uint16_t)(0x8000 | newLength);
133 }
134 array[length] = (uint16_t)head;
135 length = limit;
136 }
137 }
138
139 void Edits::append(int32_t r) {
140 if(length < capacity || growArray()) {
141 array[length++] = (uint16_t)r;
142 }
143 }
144
145 UBool Edits::growArray() {
146 int32_t newCapacity;
147 if (array == stackArray) {
148 newCapacity = 2000;
149 } else if (capacity == INT32_MAX) {
150 // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
151 // with a result-string-buffer overflow.
152 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
153 return FALSE;
154 } else if (capacity >= (INT32_MAX / 2)) {
155 newCapacity = INT32_MAX;
156 } else {
157 newCapacity = 2 * capacity;
158 }
159 // Grow by at least 5 units so that a maximal change record will fit.
160 if ((newCapacity - capacity) < 5) {
161 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
162 return FALSE;
163 }
164 uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
165 if (newArray == NULL) {
166 errorCode = U_MEMORY_ALLOCATION_ERROR;
167 return FALSE;
168 }
169 uprv_memcpy(newArray, array, (size_t)length * 2);
170 if (array != stackArray) {
171 uprv_free(array);
172 }
173 array = newArray;
174 capacity = newCapacity;
175 return TRUE;
176 }
177
178 UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
179 if (U_FAILURE(outErrorCode)) { return TRUE; }
180 if (U_SUCCESS(errorCode)) { return FALSE; }
181 outErrorCode = errorCode;
182 return TRUE;
183 }
184
185 UBool Edits::hasChanges() const {
186 if (delta != 0) {
187 return TRUE;
188 }
189 for (int32_t i = 0; i < length; ++i) {
190 if (array[i] > MAX_UNCHANGED) {
191 return TRUE;
192 }
193 }
194 return FALSE;
195 }
196
197 Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
198 array(a), index(0), length(len), remaining(0),
199 onlyChanges_(oc), coarse(crs),
200 changed(FALSE), oldLength_(0), newLength_(0),
201 srcIndex(0), replIndex(0), destIndex(0) {}
202
203 int32_t Edits::Iterator::readLength(int32_t head) {
204 if (head < LENGTH_IN_1TRAIL) {
205 return head;
206 } else if (head < LENGTH_IN_2TRAIL) {
207 U_ASSERT(index < length);
208 U_ASSERT(array[index] >= 0x8000);
209 return array[index++] & 0x7fff;
210 } else {
211 U_ASSERT((index + 2) <= length);
212 U_ASSERT(array[index] >= 0x8000);
213 U_ASSERT(array[index + 1] >= 0x8000);
214 int32_t len = ((head & 1) << 30) |
215 ((int32_t)(array[index] & 0x7fff) << 15) |
216 (array[index + 1] & 0x7fff);
217 index += 2;
218 return len;
219 }
220 }
221
222 void Edits::Iterator::updateIndexes() {
223 srcIndex += oldLength_;
224 if (changed) {
225 replIndex += newLength_;
226 }
227 destIndex += newLength_;
228 }
229
230 UBool Edits::Iterator::noNext() {
231 // No change beyond the string.
232 changed = FALSE;
233 oldLength_ = newLength_ = 0;
234 return FALSE;
235 }
236
237 UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
238 if (U_FAILURE(errorCode)) { return FALSE; }
239 // We have an errorCode in case we need to start guarding against integer overflows.
240 // It is also convenient for caller loops if we bail out when an error was set elsewhere.
241 updateIndexes();
242 if (remaining > 0) {
243 // Fine-grained iterator: Continue a sequence of equal-length changes.
244 --remaining;
245 return TRUE;
246 }
247 if (index >= length) {
248 return noNext();
249 }
250 int32_t u = array[index++];
251 if (u <= MAX_UNCHANGED) {
252 // Combine adjacent unchanged ranges.
253 changed = FALSE;
254 oldLength_ = u + 1;
255 while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
256 ++index;
257 oldLength_ += u + 1;
258 }
259 newLength_ = oldLength_;
260 if (onlyChanges) {
261 updateIndexes();
262 if (index >= length) {
263 return noNext();
264 }
265 // already fetched u > MAX_UNCHANGED at index
266 ++index;
267 } else {
268 return TRUE;
269 }
270 }
271 changed = TRUE;
272 if (u <= MAX_SHORT_CHANGE) {
273 if (coarse) {
274 int32_t w = u >> 12;
275 int32_t len = (u & 0xfff) + 1;
276 oldLength_ = newLength_ = len * w;
277 } else {
278 // Split a sequence of equal-length changes that was compressed into one unit.
279 oldLength_ = newLength_ = u >> 12;
280 remaining = u & 0xfff;
281 return TRUE;
282 }
283 } else {
284 U_ASSERT(u <= 0x7fff);
285 oldLength_ = readLength((u >> 6) & 0x3f);
286 newLength_ = readLength(u & 0x3f);
287 if (!coarse) {
288 return TRUE;
289 }
290 }
291 // Combine adjacent changes.
292 while (index < length && (u = array[index]) > MAX_UNCHANGED) {
293 ++index;
294 if (u <= MAX_SHORT_CHANGE) {
295 int32_t w = u >> 12;
296 int32_t len = (u & 0xfff) + 1;
297 len = len * w;
298 oldLength_ += len;
299 newLength_ += len;
300 } else {
301 U_ASSERT(u <= 0x7fff);
302 int32_t oldLen = readLength((u >> 6) & 0x3f);
303 int32_t newLen = readLength(u & 0x3f);
304 oldLength_ += oldLen;
305 newLength_ += newLen;
306 }
307 }
308 return TRUE;
309 }
310
311 UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
312 if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
313 if (i < srcIndex) {
314 // Reset the iterator to the start.
315 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
316 } else if (i < (srcIndex + oldLength_)) {
317 // The index is in the current span.
318 return TRUE;
319 }
320 while (next(FALSE, errorCode)) {
321 if (i < (srcIndex + oldLength_)) {
322 // The index is in the current span.
323 return TRUE;
324 }
325 if (remaining > 0) {
326 // Is the index in one of the remaining compressed edits?
327 // srcIndex is the start of the current span, before the remaining ones.
328 int32_t len = (remaining + 1) * oldLength_;
329 if (i < (srcIndex + len)) {
330 int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
331 len = n * oldLength_;
332 srcIndex += len;
333 replIndex += len;
334 destIndex += len;
335 remaining -= n;
336 return TRUE;
337 }
338 // Make next() skip all of these edits at once.
339 oldLength_ = newLength_ = len;
340 remaining = 0;
341 }
342 }
343 return FALSE;
344 }
345
346 U_NAMESPACE_END