]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2017 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | // edits.cpp | |
5 | // created: 2017feb08 Markus W. Scherer | |
6 | ||
7 | #include "unicode/utypes.h" | |
8 | #include "unicode/edits.h" | |
9 | #include "cmemory.h" | |
10 | #include "uassert.h" | |
11 | ||
12 | U_NAMESPACE_BEGIN | |
13 | ||
14 | namespace { | |
15 | ||
16 | // 0000uuuuuuuuuuuu records u+1 unchanged text units. | |
17 | const int32_t MAX_UNCHANGED_LENGTH = 0x1000; | |
18 | const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; | |
19 | ||
20 | // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units. | |
21 | // No length change. | |
22 | const int32_t MAX_SHORT_WIDTH = 6; | |
23 | const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff; | |
24 | const int32_t MAX_SHORT_CHANGE = 0x6fff; | |
25 | ||
26 | // 0111mmmmmmnnnnnn records a replacement of m text units with n. | |
27 | // m or n = 61: actual length follows in the next edits array unit. | |
28 | // m or n = 62..63: actual length follows in the next two edits array units. | |
29 | // Bit 30 of the actual length is in the head unit. | |
30 | // Trailing units have bit 15 set. | |
31 | const int32_t LENGTH_IN_1TRAIL = 61; | |
32 | const int32_t LENGTH_IN_2TRAIL = 62; | |
33 | ||
34 | } // namespace | |
35 | ||
36 | Edits::~Edits() { | |
37 | if(array != stackArray) { | |
38 | uprv_free(array); | |
39 | } | |
40 | } | |
41 | ||
42 | void Edits::reset() { | |
43 | length = delta = 0; | |
44 | } | |
45 | ||
46 | void Edits::addUnchanged(int32_t unchangedLength) { | |
47 | if(U_FAILURE(errorCode) || unchangedLength == 0) { return; } | |
48 | if(unchangedLength < 0) { | |
49 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
50 | return; | |
51 | } | |
52 | // Merge into previous unchanged-text record, if any. | |
53 | int32_t last = lastUnit(); | |
54 | if(last < MAX_UNCHANGED) { | |
55 | int32_t remaining = MAX_UNCHANGED - last; | |
56 | if (remaining >= unchangedLength) { | |
57 | setLastUnit(last + unchangedLength); | |
58 | return; | |
59 | } | |
60 | setLastUnit(MAX_UNCHANGED); | |
61 | unchangedLength -= remaining; | |
62 | } | |
63 | // Split large lengths into multiple units. | |
64 | while(unchangedLength >= MAX_UNCHANGED_LENGTH) { | |
65 | append(MAX_UNCHANGED); | |
66 | unchangedLength -= MAX_UNCHANGED_LENGTH; | |
67 | } | |
68 | // Write a small (remaining) length. | |
69 | if(unchangedLength > 0) { | |
70 | append(unchangedLength - 1); | |
71 | } | |
72 | } | |
73 | ||
74 | void Edits::addReplace(int32_t oldLength, int32_t newLength) { | |
75 | if(U_FAILURE(errorCode)) { return; } | |
76 | if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) { | |
77 | // Replacement of short oldLength text units by same-length new text. | |
78 | // Merge into previous short-replacement record, if any. | |
79 | int32_t last = lastUnit(); | |
80 | if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && | |
81 | (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) { | |
82 | setLastUnit(last + 1); | |
83 | return; | |
84 | } | |
85 | append(oldLength << 12); | |
86 | return; | |
87 | } | |
88 | ||
89 | if(oldLength < 0 || newLength < 0) { | |
90 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
91 | return; | |
92 | } | |
93 | if (oldLength == 0 && newLength == 0) { | |
94 | return; | |
95 | } | |
96 | int32_t newDelta = newLength - oldLength; | |
97 | if (newDelta != 0) { | |
98 | if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) || | |
99 | (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) { | |
100 | // Integer overflow or underflow. | |
101 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
102 | return; | |
103 | } | |
104 | delta += newDelta; | |
105 | } | |
106 | ||
107 | int32_t head = 0x7000; | |
108 | if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { | |
109 | head |= oldLength << 6; | |
110 | head |= newLength; | |
111 | append(head); | |
112 | } else if ((capacity - length) >= 5 || growArray()) { | |
113 | int32_t limit = length + 1; | |
114 | if(oldLength < LENGTH_IN_1TRAIL) { | |
115 | head |= oldLength << 6; | |
116 | } else if(oldLength <= 0x7fff) { | |
117 | head |= LENGTH_IN_1TRAIL << 6; | |
118 | array[limit++] = (uint16_t)(0x8000 | oldLength); | |
119 | } else { | |
120 | head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; | |
121 | array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15)); | |
122 | array[limit++] = (uint16_t)(0x8000 | oldLength); | |
123 | } | |
124 | if(newLength < LENGTH_IN_1TRAIL) { | |
125 | head |= newLength; | |
126 | } else if(newLength <= 0x7fff) { | |
127 | head |= LENGTH_IN_1TRAIL; | |
128 | array[limit++] = (uint16_t)(0x8000 | newLength); | |
129 | } else { | |
130 | head |= LENGTH_IN_2TRAIL + (newLength >> 30); | |
131 | array[limit++] = (uint16_t)(0x8000 | (newLength >> 15)); | |
132 | array[limit++] = (uint16_t)(0x8000 | newLength); | |
133 | } | |
134 | array[length] = (uint16_t)head; | |
135 | length = limit; | |
136 | } | |
137 | } | |
138 | ||
139 | void Edits::append(int32_t r) { | |
140 | if(length < capacity || growArray()) { | |
141 | array[length++] = (uint16_t)r; | |
142 | } | |
143 | } | |
144 | ||
145 | UBool Edits::growArray() { | |
146 | int32_t newCapacity; | |
147 | if (array == stackArray) { | |
148 | newCapacity = 2000; | |
149 | } else if (capacity == INT32_MAX) { | |
150 | // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API | |
151 | // with a result-string-buffer overflow. | |
152 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
153 | return FALSE; | |
154 | } else if (capacity >= (INT32_MAX / 2)) { | |
155 | newCapacity = INT32_MAX; | |
156 | } else { | |
157 | newCapacity = 2 * capacity; | |
158 | } | |
159 | // Grow by at least 5 units so that a maximal change record will fit. | |
160 | if ((newCapacity - capacity) < 5) { | |
161 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
162 | return FALSE; | |
163 | } | |
164 | uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2); | |
165 | if (newArray == NULL) { | |
166 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
167 | return FALSE; | |
168 | } | |
169 | uprv_memcpy(newArray, array, (size_t)length * 2); | |
170 | if (array != stackArray) { | |
171 | uprv_free(array); | |
172 | } | |
173 | array = newArray; | |
174 | capacity = newCapacity; | |
175 | return TRUE; | |
176 | } | |
177 | ||
178 | UBool Edits::copyErrorTo(UErrorCode &outErrorCode) { | |
179 | if (U_FAILURE(outErrorCode)) { return TRUE; } | |
180 | if (U_SUCCESS(errorCode)) { return FALSE; } | |
181 | outErrorCode = errorCode; | |
182 | return TRUE; | |
183 | } | |
184 | ||
185 | UBool Edits::hasChanges() const { | |
186 | if (delta != 0) { | |
187 | return TRUE; | |
188 | } | |
189 | for (int32_t i = 0; i < length; ++i) { | |
190 | if (array[i] > MAX_UNCHANGED) { | |
191 | return TRUE; | |
192 | } | |
193 | } | |
194 | return FALSE; | |
195 | } | |
196 | ||
197 | Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : | |
198 | array(a), index(0), length(len), remaining(0), | |
199 | onlyChanges_(oc), coarse(crs), | |
200 | changed(FALSE), oldLength_(0), newLength_(0), | |
201 | srcIndex(0), replIndex(0), destIndex(0) {} | |
202 | ||
203 | int32_t Edits::Iterator::readLength(int32_t head) { | |
204 | if (head < LENGTH_IN_1TRAIL) { | |
205 | return head; | |
206 | } else if (head < LENGTH_IN_2TRAIL) { | |
207 | U_ASSERT(index < length); | |
208 | U_ASSERT(array[index] >= 0x8000); | |
209 | return array[index++] & 0x7fff; | |
210 | } else { | |
211 | U_ASSERT((index + 2) <= length); | |
212 | U_ASSERT(array[index] >= 0x8000); | |
213 | U_ASSERT(array[index + 1] >= 0x8000); | |
214 | int32_t len = ((head & 1) << 30) | | |
215 | ((int32_t)(array[index] & 0x7fff) << 15) | | |
216 | (array[index + 1] & 0x7fff); | |
217 | index += 2; | |
218 | return len; | |
219 | } | |
220 | } | |
221 | ||
222 | void Edits::Iterator::updateIndexes() { | |
223 | srcIndex += oldLength_; | |
224 | if (changed) { | |
225 | replIndex += newLength_; | |
226 | } | |
227 | destIndex += newLength_; | |
228 | } | |
229 | ||
230 | UBool Edits::Iterator::noNext() { | |
231 | // No change beyond the string. | |
232 | changed = FALSE; | |
233 | oldLength_ = newLength_ = 0; | |
234 | return FALSE; | |
235 | } | |
236 | ||
237 | UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { | |
238 | if (U_FAILURE(errorCode)) { return FALSE; } | |
239 | // We have an errorCode in case we need to start guarding against integer overflows. | |
240 | // It is also convenient for caller loops if we bail out when an error was set elsewhere. | |
241 | updateIndexes(); | |
242 | if (remaining > 0) { | |
243 | // Fine-grained iterator: Continue a sequence of equal-length changes. | |
244 | --remaining; | |
245 | return TRUE; | |
246 | } | |
247 | if (index >= length) { | |
248 | return noNext(); | |
249 | } | |
250 | int32_t u = array[index++]; | |
251 | if (u <= MAX_UNCHANGED) { | |
252 | // Combine adjacent unchanged ranges. | |
253 | changed = FALSE; | |
254 | oldLength_ = u + 1; | |
255 | while (index < length && (u = array[index]) <= MAX_UNCHANGED) { | |
256 | ++index; | |
257 | oldLength_ += u + 1; | |
258 | } | |
259 | newLength_ = oldLength_; | |
260 | if (onlyChanges) { | |
261 | updateIndexes(); | |
262 | if (index >= length) { | |
263 | return noNext(); | |
264 | } | |
265 | // already fetched u > MAX_UNCHANGED at index | |
266 | ++index; | |
267 | } else { | |
268 | return TRUE; | |
269 | } | |
270 | } | |
271 | changed = TRUE; | |
272 | if (u <= MAX_SHORT_CHANGE) { | |
273 | if (coarse) { | |
274 | int32_t w = u >> 12; | |
275 | int32_t len = (u & 0xfff) + 1; | |
276 | oldLength_ = newLength_ = len * w; | |
277 | } else { | |
278 | // Split a sequence of equal-length changes that was compressed into one unit. | |
279 | oldLength_ = newLength_ = u >> 12; | |
280 | remaining = u & 0xfff; | |
281 | return TRUE; | |
282 | } | |
283 | } else { | |
284 | U_ASSERT(u <= 0x7fff); | |
285 | oldLength_ = readLength((u >> 6) & 0x3f); | |
286 | newLength_ = readLength(u & 0x3f); | |
287 | if (!coarse) { | |
288 | return TRUE; | |
289 | } | |
290 | } | |
291 | // Combine adjacent changes. | |
292 | while (index < length && (u = array[index]) > MAX_UNCHANGED) { | |
293 | ++index; | |
294 | if (u <= MAX_SHORT_CHANGE) { | |
295 | int32_t w = u >> 12; | |
296 | int32_t len = (u & 0xfff) + 1; | |
297 | len = len * w; | |
298 | oldLength_ += len; | |
299 | newLength_ += len; | |
300 | } else { | |
301 | U_ASSERT(u <= 0x7fff); | |
302 | int32_t oldLen = readLength((u >> 6) & 0x3f); | |
303 | int32_t newLen = readLength(u & 0x3f); | |
304 | oldLength_ += oldLen; | |
305 | newLength_ += newLen; | |
306 | } | |
307 | } | |
308 | return TRUE; | |
309 | } | |
310 | ||
311 | UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) { | |
312 | if (U_FAILURE(errorCode) || i < 0) { return FALSE; } | |
313 | if (i < srcIndex) { | |
314 | // Reset the iterator to the start. | |
315 | index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; | |
316 | } else if (i < (srcIndex + oldLength_)) { | |
317 | // The index is in the current span. | |
318 | return TRUE; | |
319 | } | |
320 | while (next(FALSE, errorCode)) { | |
321 | if (i < (srcIndex + oldLength_)) { | |
322 | // The index is in the current span. | |
323 | return TRUE; | |
324 | } | |
325 | if (remaining > 0) { | |
326 | // Is the index in one of the remaining compressed edits? | |
327 | // srcIndex is the start of the current span, before the remaining ones. | |
328 | int32_t len = (remaining + 1) * oldLength_; | |
329 | if (i < (srcIndex + len)) { | |
330 | int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining | |
331 | len = n * oldLength_; | |
332 | srcIndex += len; | |
333 | replIndex += len; | |
334 | destIndex += len; | |
335 | remaining -= n; | |
336 | return TRUE; | |
337 | } | |
338 | // Make next() skip all of these edits at once. | |
339 | oldLength_ = newLength_ = len; | |
340 | remaining = 0; | |
341 | } | |
342 | } | |
343 | return FALSE; | |
344 | } | |
345 | ||
346 | U_NAMESPACE_END |