1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2017feb08 Markus W. Scherer
7 #include "unicode/utypes.h"
8 #include "unicode/edits.h"
16 // 0000uuuuuuuuuuuu records u+1 unchanged text units.
17 const int32_t MAX_UNCHANGED_LENGTH
= 0x1000;
18 const int32_t MAX_UNCHANGED
= MAX_UNCHANGED_LENGTH
- 1;
20 // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
22 const int32_t MAX_SHORT_WIDTH
= 6;
23 const int32_t MAX_SHORT_CHANGE_LENGTH
= 0xfff;
24 const int32_t MAX_SHORT_CHANGE
= 0x6fff;
26 // 0111mmmmmmnnnnnn records a replacement of m text units with n.
27 // m or n = 61: actual length follows in the next edits array unit.
28 // m or n = 62..63: actual length follows in the next two edits array units.
29 // Bit 30 of the actual length is in the head unit.
30 // Trailing units have bit 15 set.
31 const int32_t LENGTH_IN_1TRAIL
= 61;
32 const int32_t LENGTH_IN_2TRAIL
= 62;
37 if(array
!= stackArray
) {
46 void Edits::addUnchanged(int32_t unchangedLength
) {
47 if(U_FAILURE(errorCode
) || unchangedLength
== 0) { return; }
48 if(unchangedLength
< 0) {
49 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
52 // Merge into previous unchanged-text record, if any.
53 int32_t last
= lastUnit();
54 if(last
< MAX_UNCHANGED
) {
55 int32_t remaining
= MAX_UNCHANGED
- last
;
56 if (remaining
>= unchangedLength
) {
57 setLastUnit(last
+ unchangedLength
);
60 setLastUnit(MAX_UNCHANGED
);
61 unchangedLength
-= remaining
;
63 // Split large lengths into multiple units.
64 while(unchangedLength
>= MAX_UNCHANGED_LENGTH
) {
65 append(MAX_UNCHANGED
);
66 unchangedLength
-= MAX_UNCHANGED_LENGTH
;
68 // Write a small (remaining) length.
69 if(unchangedLength
> 0) {
70 append(unchangedLength
- 1);
74 void Edits::addReplace(int32_t oldLength
, int32_t newLength
) {
75 if(U_FAILURE(errorCode
)) { return; }
76 if(oldLength
== newLength
&& 0 < oldLength
&& oldLength
<= MAX_SHORT_WIDTH
) {
77 // Replacement of short oldLength text units by same-length new text.
78 // Merge into previous short-replacement record, if any.
79 int32_t last
= lastUnit();
80 if(MAX_UNCHANGED
< last
&& last
< MAX_SHORT_CHANGE
&&
81 (last
>> 12) == oldLength
&& (last
& 0xfff) < MAX_SHORT_CHANGE_LENGTH
) {
82 setLastUnit(last
+ 1);
85 append(oldLength
<< 12);
89 if(oldLength
< 0 || newLength
< 0) {
90 errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
93 if (oldLength
== 0 && newLength
== 0) {
96 int32_t newDelta
= newLength
- oldLength
;
98 if ((newDelta
> 0 && delta
>= 0 && newDelta
> (INT32_MAX
- delta
)) ||
99 (newDelta
< 0 && delta
< 0 && newDelta
< (INT32_MIN
- delta
))) {
100 // Integer overflow or underflow.
101 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
107 int32_t head
= 0x7000;
108 if (oldLength
< LENGTH_IN_1TRAIL
&& newLength
< LENGTH_IN_1TRAIL
) {
109 head
|= oldLength
<< 6;
112 } else if ((capacity
- length
) >= 5 || growArray()) {
113 int32_t limit
= length
+ 1;
114 if(oldLength
< LENGTH_IN_1TRAIL
) {
115 head
|= oldLength
<< 6;
116 } else if(oldLength
<= 0x7fff) {
117 head
|= LENGTH_IN_1TRAIL
<< 6;
118 array
[limit
++] = (uint16_t)(0x8000 | oldLength
);
120 head
|= (LENGTH_IN_2TRAIL
+ (oldLength
>> 30)) << 6;
121 array
[limit
++] = (uint16_t)(0x8000 | (oldLength
>> 15));
122 array
[limit
++] = (uint16_t)(0x8000 | oldLength
);
124 if(newLength
< LENGTH_IN_1TRAIL
) {
126 } else if(newLength
<= 0x7fff) {
127 head
|= LENGTH_IN_1TRAIL
;
128 array
[limit
++] = (uint16_t)(0x8000 | newLength
);
130 head
|= LENGTH_IN_2TRAIL
+ (newLength
>> 30);
131 array
[limit
++] = (uint16_t)(0x8000 | (newLength
>> 15));
132 array
[limit
++] = (uint16_t)(0x8000 | newLength
);
134 array
[length
] = (uint16_t)head
;
139 void Edits::append(int32_t r
) {
140 if(length
< capacity
|| growArray()) {
141 array
[length
++] = (uint16_t)r
;
145 UBool
Edits::growArray() {
147 if (array
== stackArray
) {
149 } else if (capacity
== INT32_MAX
) {
150 // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
151 // with a result-string-buffer overflow.
152 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
154 } else if (capacity
>= (INT32_MAX
/ 2)) {
155 newCapacity
= INT32_MAX
;
157 newCapacity
= 2 * capacity
;
159 // Grow by at least 5 units so that a maximal change record will fit.
160 if ((newCapacity
- capacity
) < 5) {
161 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
164 uint16_t *newArray
= (uint16_t *)uprv_malloc((size_t)newCapacity
* 2);
165 if (newArray
== NULL
) {
166 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
169 uprv_memcpy(newArray
, array
, (size_t)length
* 2);
170 if (array
!= stackArray
) {
174 capacity
= newCapacity
;
178 UBool
Edits::copyErrorTo(UErrorCode
&outErrorCode
) {
179 if (U_FAILURE(outErrorCode
)) { return TRUE
; }
180 if (U_SUCCESS(errorCode
)) { return FALSE
; }
181 outErrorCode
= errorCode
;
185 UBool
Edits::hasChanges() const {
189 for (int32_t i
= 0; i
< length
; ++i
) {
190 if (array
[i
] > MAX_UNCHANGED
) {
197 Edits::Iterator::Iterator(const uint16_t *a
, int32_t len
, UBool oc
, UBool crs
) :
198 array(a
), index(0), length(len
), remaining(0),
199 onlyChanges_(oc
), coarse(crs
),
200 changed(FALSE
), oldLength_(0), newLength_(0),
201 srcIndex(0), replIndex(0), destIndex(0) {}
203 int32_t Edits::Iterator::readLength(int32_t head
) {
204 if (head
< LENGTH_IN_1TRAIL
) {
206 } else if (head
< LENGTH_IN_2TRAIL
) {
207 U_ASSERT(index
< length
);
208 U_ASSERT(array
[index
] >= 0x8000);
209 return array
[index
++] & 0x7fff;
211 U_ASSERT((index
+ 2) <= length
);
212 U_ASSERT(array
[index
] >= 0x8000);
213 U_ASSERT(array
[index
+ 1] >= 0x8000);
214 int32_t len
= ((head
& 1) << 30) |
215 ((int32_t)(array
[index
] & 0x7fff) << 15) |
216 (array
[index
+ 1] & 0x7fff);
222 void Edits::Iterator::updateIndexes() {
223 srcIndex
+= oldLength_
;
225 replIndex
+= newLength_
;
227 destIndex
+= newLength_
;
230 UBool
Edits::Iterator::noNext() {
231 // No change beyond the string.
233 oldLength_
= newLength_
= 0;
237 UBool
Edits::Iterator::next(UBool onlyChanges
, UErrorCode
&errorCode
) {
238 if (U_FAILURE(errorCode
)) { return FALSE
; }
239 // We have an errorCode in case we need to start guarding against integer overflows.
240 // It is also convenient for caller loops if we bail out when an error was set elsewhere.
243 // Fine-grained iterator: Continue a sequence of equal-length changes.
247 if (index
>= length
) {
250 int32_t u
= array
[index
++];
251 if (u
<= MAX_UNCHANGED
) {
252 // Combine adjacent unchanged ranges.
255 while (index
< length
&& (u
= array
[index
]) <= MAX_UNCHANGED
) {
259 newLength_
= oldLength_
;
262 if (index
>= length
) {
265 // already fetched u > MAX_UNCHANGED at index
272 if (u
<= MAX_SHORT_CHANGE
) {
275 int32_t len
= (u
& 0xfff) + 1;
276 oldLength_
= newLength_
= len
* w
;
278 // Split a sequence of equal-length changes that was compressed into one unit.
279 oldLength_
= newLength_
= u
>> 12;
280 remaining
= u
& 0xfff;
284 U_ASSERT(u
<= 0x7fff);
285 oldLength_
= readLength((u
>> 6) & 0x3f);
286 newLength_
= readLength(u
& 0x3f);
291 // Combine adjacent changes.
292 while (index
< length
&& (u
= array
[index
]) > MAX_UNCHANGED
) {
294 if (u
<= MAX_SHORT_CHANGE
) {
296 int32_t len
= (u
& 0xfff) + 1;
301 U_ASSERT(u
<= 0x7fff);
302 int32_t oldLen
= readLength((u
>> 6) & 0x3f);
303 int32_t newLen
= readLength(u
& 0x3f);
304 oldLength_
+= oldLen
;
305 newLength_
+= newLen
;
311 UBool
Edits::Iterator::findSourceIndex(int32_t i
, UErrorCode
&errorCode
) {
312 if (U_FAILURE(errorCode
) || i
< 0) { return FALSE
; }
314 // Reset the iterator to the start.
315 index
= remaining
= oldLength_
= newLength_
= srcIndex
= replIndex
= destIndex
= 0;
316 } else if (i
< (srcIndex
+ oldLength_
)) {
317 // The index is in the current span.
320 while (next(FALSE
, errorCode
)) {
321 if (i
< (srcIndex
+ oldLength_
)) {
322 // The index is in the current span.
326 // Is the index in one of the remaining compressed edits?
327 // srcIndex is the start of the current span, before the remaining ones.
328 int32_t len
= (remaining
+ 1) * oldLength_
;
329 if (i
< (srcIndex
+ len
)) {
330 int32_t n
= (i
- srcIndex
) / oldLength_
; // 1 <= n <= remaining
331 len
= n
* oldLength_
;
338 // Make next() skip all of these edits at once.
339 oldLength_
= newLength_
= len
;