]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/edits.h
ICU-62109.0.1.tar.gz
[apple/icu.git] / icuSources / common / unicode / edits.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // edits.h
5 // created: 2016dec30 Markus W. Scherer
6
7 #ifndef __EDITS_H__
8 #define __EDITS_H__
9
10 #include "unicode/utypes.h"
11 #include "unicode/uobject.h"
12
13 /**
14 * \file
15 * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
16 */
17
18 #if U_SHOW_CPLUSPLUS_API
19 U_NAMESPACE_BEGIN
20
21 class UnicodeString;
22
23 /**
24 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions
25 * in linear progression. Does not support moving/reordering of text.
26 *
27 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to
28 * instances of this class using {@link #addReplace(int, int)} (for change edits) and
29 * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity,
30 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one
31 * mapping between code points in the source and destination strings.
32 *
33 * After all edits have been added, instances of this class should be considered immutable, and an
34 * {@link Edits::Iterator} can be used for queries.
35 *
36 * There are four flavors of Edits::Iterator:
37 *
38 * <ul>
39 * <li>{@link #getFineIterator()} retains full granularity of change edits.
40 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling
41 * next() on the iterator, skips over no-change edits (unchanged regions).
42 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change
43 * edits are automatically merged during the construction phase.)
44 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when
45 * calling next() on the iterator, skips over no-change edits (unchanged regions).
46 * </ul>
47 *
48 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the
49 * following fine edits:
50 * <ul>
51 * <li>abc ⇨ abc (no-change)
52 * <li>ß ⇨ ss (change)
53 * <li>D ⇨ d (change)
54 * <li>e ⇨ e (no-change)
55 * <li>F ⇨ f (change)
56 * </ul>
57 * and the following coarse edits (note how adjacent change edits get merged together):
58 * <ul>
59 * <li>abc ⇨ abc (no-change)
60 * <li>ßD ⇨ ssd (change)
61 * <li>e ⇨ e (no-change)
62 * <li>F ⇨ f (change)
63 * </ul>
64 *
65 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their
66 * {@link Edits::Iterator#next()} methods are called. They are identical to the non-change iterators when
67 * their {@link Edits::Iterator#findSourceIndex(int)} or {@link Edits::Iterator#findDestinationIndex(int)}
68 * methods are used to walk through the string.
69 *
70 * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in
71 * UCharacterCaseTest.java.
72 *
73 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
74 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
75 *
76 * @stable ICU 59
77 */
78 class U_COMMON_API Edits U_FINAL : public UMemory {
79 public:
80 /**
81 * Constructs an empty object.
82 * @stable ICU 59
83 */
84 Edits() :
85 array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
86 errorCode_(U_ZERO_ERROR) {}
87 /**
88 * Copy constructor.
89 * @param other source edits
90 * @draft ICU 60
91 */
92 Edits(const Edits &other) :
93 array(stackArray), capacity(STACK_CAPACITY), length(other.length),
94 delta(other.delta), numChanges(other.numChanges),
95 errorCode_(other.errorCode_) {
96 copyArray(other);
97 }
98 /**
99 * Move constructor, might leave src empty.
100 * This object will have the same contents that the source object had.
101 * @param src source edits
102 * @draft ICU 60
103 */
104 Edits(Edits &&src) U_NOEXCEPT :
105 array(stackArray), capacity(STACK_CAPACITY), length(src.length),
106 delta(src.delta), numChanges(src.numChanges),
107 errorCode_(src.errorCode_) {
108 moveArray(src);
109 }
110
111 /**
112 * Destructor.
113 * @stable ICU 59
114 */
115 ~Edits();
116
117 /**
118 * Assignment operator.
119 * @param other source edits
120 * @return *this
121 * @draft ICU 60
122 */
123 Edits &operator=(const Edits &other);
124
125 /**
126 * Move assignment operator, might leave src empty.
127 * This object will have the same contents that the source object had.
128 * The behavior is undefined if *this and src are the same object.
129 * @param src source edits
130 * @return *this
131 * @draft ICU 60
132 */
133 Edits &operator=(Edits &&src) U_NOEXCEPT;
134
135 /**
136 * Resets the data but may not release memory.
137 * @stable ICU 59
138 */
139 void reset() U_NOEXCEPT;
140
141 /**
142 * Adds a no-change edit: a record for an unchanged segment of text.
143 * Normally called from inside ICU string transformation functions, not user code.
144 * @stable ICU 59
145 */
146 void addUnchanged(int32_t unchangedLength);
147 /**
148 * Adds a change edit: a record for a text replacement/insertion/deletion.
149 * Normally called from inside ICU string transformation functions, not user code.
150 * @stable ICU 59
151 */
152 void addReplace(int32_t oldLength, int32_t newLength);
153 /**
154 * Sets the UErrorCode if an error occurred while recording edits.
155 * Preserves older error codes in the outErrorCode.
156 * Normally called from inside ICU string transformation functions, not user code.
157 * @param outErrorCode Set to an error code if it does not contain one already
158 * and an error occurred while recording edits.
159 * Otherwise unchanged.
160 * @return TRUE if U_FAILURE(outErrorCode)
161 * @stable ICU 59
162 */
163 UBool copyErrorTo(UErrorCode &outErrorCode);
164
165 /**
166 * How much longer is the new text compared with the old text?
167 * @return new length minus old length
168 * @stable ICU 59
169 */
170 int32_t lengthDelta() const { return delta; }
171 /**
172 * @return TRUE if there are any change edits
173 * @stable ICU 59
174 */
175 UBool hasChanges() const { return numChanges != 0; }
176
177 #ifndef U_HIDE_DRAFT_API
178 /**
179 * @return the number of change edits
180 * @draft ICU 60
181 */
182 int32_t numberOfChanges() const { return numChanges; }
183 #endif // U_HIDE_DRAFT_API
184
185 /**
186 * Access to the list of edits.
187 *
188 * At any moment in time, an instance of this class points to a single edit: a "window" into a span
189 * of the source string and the corresponding span of the destination string. The source string span
190 * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string
191 * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.
192 *
193 * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)},
194 * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator
195 * to make it point to the corresponding edit.
196 *
197 * For more information, see the documentation for {@link Edits}.
198 *
199 * @see getCoarseIterator
200 * @see getFineIterator
201 * @stable ICU 59
202 */
203 struct U_COMMON_API Iterator U_FINAL : public UMemory {
204 /**
205 * Default constructor, empty iterator.
206 * @draft ICU 60
207 */
208 Iterator() :
209 array(nullptr), index(0), length(0),
210 remaining(0), onlyChanges_(FALSE), coarse(FALSE),
211 dir(0), changed(FALSE), oldLength_(0), newLength_(0),
212 srcIndex(0), replIndex(0), destIndex(0) {}
213 /**
214 * Copy constructor.
215 * @stable ICU 59
216 */
217 Iterator(const Iterator &other) = default;
218 /**
219 * Assignment operator.
220 * @stable ICU 59
221 */
222 Iterator &operator=(const Iterator &other) = default;
223
224 /**
225 * Advances the iterator to the next edit.
226 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
227 * or else the function returns immediately. Check for U_FAILURE()
228 * on output or use with function chaining. (See User Guide for details.)
229 * @return TRUE if there is another edit
230 * @stable ICU 59
231 */
232 UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
233
234 /**
235 * Moves the iterator to the edit that contains the source index.
236 * The source index may be found in a no-change edit
237 * even if normal iteration would skip no-change edits.
238 * Normal iteration can continue from a found edit.
239 *
240 * The iterator state before this search logically does not matter.
241 * (It may affect the performance of the search.)
242 *
243 * The iterator state after this search is undefined
244 * if the source index is out of bounds for the source string.
245 *
246 * @param i source index
247 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
248 * or else the function returns immediately. Check for U_FAILURE()
249 * on output or use with function chaining. (See User Guide for details.)
250 * @return TRUE if the edit for the source index was found
251 * @stable ICU 59
252 */
253 UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
254 return findIndex(i, TRUE, errorCode) == 0;
255 }
256
257 #ifndef U_HIDE_DRAFT_API
258 /**
259 * Moves the iterator to the edit that contains the destination index.
260 * The destination index may be found in a no-change edit
261 * even if normal iteration would skip no-change edits.
262 * Normal iteration can continue from a found edit.
263 *
264 * The iterator state before this search logically does not matter.
265 * (It may affect the performance of the search.)
266 *
267 * The iterator state after this search is undefined
268 * if the source index is out of bounds for the source string.
269 *
270 * @param i destination index
271 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
272 * or else the function returns immediately. Check for U_FAILURE()
273 * on output or use with function chaining. (See User Guide for details.)
274 * @return TRUE if the edit for the destination index was found
275 * @draft ICU 60
276 */
277 UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
278 return findIndex(i, FALSE, errorCode) == 0;
279 }
280
281 /**
282 * Computes the destination index corresponding to the given source index.
283 * If the source index is inside a change edit (not at its start),
284 * then the destination index at the end of that edit is returned,
285 * since there is no information about index mapping inside a change edit.
286 *
287 * (This means that indexes to the start and middle of an edit,
288 * for example around a grapheme cluster, are mapped to indexes
289 * encompassing the entire edit.
290 * The alternative, mapping an interior index to the start,
291 * would map such an interval to an empty one.)
292 *
293 * This operation will usually but not always modify this object.
294 * The iterator state after this search is undefined.
295 *
296 * @param i source index
297 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
298 * or else the function returns immediately. Check for U_FAILURE()
299 * on output or use with function chaining. (See User Guide for details.)
300 * @return destination index; undefined if i is not 0..string length
301 * @draft ICU 60
302 */
303 int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
304
305 /**
306 * Computes the source index corresponding to the given destination index.
307 * If the destination index is inside a change edit (not at its start),
308 * then the source index at the end of that edit is returned,
309 * since there is no information about index mapping inside a change edit.
310 *
311 * (This means that indexes to the start and middle of an edit,
312 * for example around a grapheme cluster, are mapped to indexes
313 * encompassing the entire edit.
314 * The alternative, mapping an interior index to the start,
315 * would map such an interval to an empty one.)
316 *
317 * This operation will usually but not always modify this object.
318 * The iterator state after this search is undefined.
319 *
320 * @param i destination index
321 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
322 * or else the function returns immediately. Check for U_FAILURE()
323 * on output or use with function chaining. (See User Guide for details.)
324 * @return source index; undefined if i is not 0..string length
325 * @draft ICU 60
326 */
327 int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
328 #endif // U_HIDE_DRAFT_API
329
330 /**
331 * Returns whether the edit currently represented by the iterator is a change edit.
332 *
333 * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
334 * FALSE if oldLength units remain unchanged.
335 * @stable ICU 59
336 */
337 UBool hasChange() const { return changed; }
338
339 /**
340 * The length of the current span in the source string, which starts at {@link #sourceIndex}.
341 *
342 * @return the number of units in the original string which are replaced or remain unchanged.
343 * @stable ICU 59
344 */
345 int32_t oldLength() const { return oldLength_; }
346
347 /**
348 * The length of the current span in the destination string, which starts at
349 * {@link #destinationIndex}, or in the replacement string, which starts at
350 * {@link #replacementIndex}.
351 *
352 * @return the number of units in the modified string, if hasChange() is TRUE.
353 * Same as oldLength if hasChange() is FALSE.
354 * @stable ICU 59
355 */
356 int32_t newLength() const { return newLength_; }
357
358 /**
359 * The start index of the current span in the source string; the span has length
360 * {@link #oldLength}.
361 *
362 * @return the current index into the source string
363 * @stable ICU 59
364 */
365 int32_t sourceIndex() const { return srcIndex; }
366
367 /**
368 * The start index of the current span in the replacement string; the span has length
369 * {@link #newLength}. Well-defined only if the current edit is a change edit.
370 * <p>
371 * The <em>replacement string</em> is the concatenation of all substrings of the destination
372 * string corresponding to change edits.
373 * <p>
374 * This method is intended to be used together with operations that write only replacement
375 * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified
376 * in-place.
377 *
378 * @return the current index into the replacement-characters-only string,
379 * not counting unchanged spans
380 * @stable ICU 59
381 */
382 int32_t replacementIndex() const {
383 // TODO: Throw an exception if we aren't in a change edit?
384 return replIndex;
385 }
386
387 /**
388 * The start index of the current span in the destination string; the span has length
389 * {@link #newLength}.
390 *
391 * @return the current index into the full destination string
392 * @stable ICU 59
393 */
394 int32_t destinationIndex() const { return destIndex; }
395
396 #ifndef U_HIDE_INTERNAL_API
397 /**
398 * A string representation of the current edit represented by the iterator for debugging. You
399 * should not depend on the contents of the return string.
400 * @internal
401 */
402 UnicodeString& toString(UnicodeString& appendTo) const;
403 #endif // U_HIDE_INTERNAL_API
404
405 private:
406 friend class Edits;
407
408 Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
409
410 int32_t readLength(int32_t head);
411 void updateNextIndexes();
412 void updatePreviousIndexes();
413 UBool noNext();
414 UBool next(UBool onlyChanges, UErrorCode &errorCode);
415 UBool previous(UErrorCode &errorCode);
416 /** @return -1: error or i<0; 0: found; 1: i>=string length */
417 int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
418
419 const uint16_t *array;
420 int32_t index, length;
421 // 0 if we are not within compressed equal-length changes.
422 // Otherwise the number of remaining changes, including the current one.
423 int32_t remaining;
424 UBool onlyChanges_, coarse;
425
426 int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)
427 UBool changed;
428 int32_t oldLength_, newLength_;
429 int32_t srcIndex, replIndex, destIndex;
430 };
431
432 /**
433 * Returns an Iterator for coarse-grained change edits
434 * (adjacent change edits are treated as one).
435 * Can be used to perform simple string updates.
436 * Skips no-change edits.
437 * @return an Iterator that merges adjacent changes.
438 * @stable ICU 59
439 */
440 Iterator getCoarseChangesIterator() const {
441 return Iterator(array, length, TRUE, TRUE);
442 }
443
444 /**
445 * Returns an Iterator for coarse-grained change and no-change edits
446 * (adjacent change edits are treated as one).
447 * Can be used to perform simple string updates.
448 * Adjacent change edits are treated as one edit.
449 * @return an Iterator that merges adjacent changes.
450 * @stable ICU 59
451 */
452 Iterator getCoarseIterator() const {
453 return Iterator(array, length, FALSE, TRUE);
454 }
455
456 /**
457 * Returns an Iterator for fine-grained change edits
458 * (full granularity of change edits is retained).
459 * Can be used for modifying styled text.
460 * Skips no-change edits.
461 * @return an Iterator that separates adjacent changes.
462 * @stable ICU 59
463 */
464 Iterator getFineChangesIterator() const {
465 return Iterator(array, length, TRUE, FALSE);
466 }
467
468 /**
469 * Returns an Iterator for fine-grained change and no-change edits
470 * (full granularity of change edits is retained).
471 * Can be used for modifying styled text.
472 * @return an Iterator that separates adjacent changes.
473 * @stable ICU 59
474 */
475 Iterator getFineIterator() const {
476 return Iterator(array, length, FALSE, FALSE);
477 }
478
479 #ifndef U_HIDE_DRAFT_API
480 /**
481 * Merges the two input Edits and appends the result to this object.
482 *
483 * Consider two string transformations (for example, normalization and case mapping)
484 * where each records Edits in addition to writing an output string.<br>
485 * Edits ab reflect how substrings of input string a
486 * map to substrings of intermediate string b.<br>
487 * Edits bc reflect how substrings of intermediate string b
488 * map to substrings of output string c.<br>
489 * This function merges ab and bc such that the additional edits
490 * recorded in this object reflect how substrings of input string a
491 * map to substrings of output string c.
492 *
493 * If unrelated Edits are passed in where the output string of the first
494 * has a different length than the input string of the second,
495 * then a U_ILLEGAL_ARGUMENT_ERROR is reported.
496 *
497 * @param ab reflects how substrings of input string a
498 * map to substrings of intermediate string b.
499 * @param bc reflects how substrings of intermediate string b
500 * map to substrings of output string c.
501 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
502 * or else the function returns immediately. Check for U_FAILURE()
503 * on output or use with function chaining. (See User Guide for details.)
504 * @return *this, with the merged edits appended
505 * @draft ICU 60
506 */
507 Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
508 #endif // U_HIDE_DRAFT_API
509
510 private:
511 void releaseArray() U_NOEXCEPT;
512 Edits &copyArray(const Edits &other);
513 Edits &moveArray(Edits &src) U_NOEXCEPT;
514
515 void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
516 int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
517
518 void append(int32_t r);
519 UBool growArray();
520
521 static const int32_t STACK_CAPACITY = 100;
522 uint16_t *array;
523 int32_t capacity;
524 int32_t length;
525 int32_t delta;
526 int32_t numChanges;
527 UErrorCode errorCode_;
528 uint16_t stackArray[STACK_CAPACITY];
529 };
530
531 U_NAMESPACE_END
532 #endif // U_SHOW_CPLUSPLUS_API
533
534 #endif // __EDITS_H__