1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ********************************************************************
6 * Copyright (C) 1997-2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ********************************************************************
15 #include "unicode/utypes.h"
17 #if U_SHOW_CPLUSPLUS_API
19 #include "unicode/uobject.h"
20 #include "unicode/unistr.h"
23 * \brief C++ API: Character Iterator
28 * Abstract class that defines an API for forward-only iteration
30 * This is a minimal interface for iteration without random access
31 * or backwards iteration. It is especially useful for wrapping
32 * streams with converters into an object for collation or
35 * <p>Characters can be accessed in two ways: as code units or as code points.
37 * Unicode code points are 21-bit integers and are the scalar values
38 * of Unicode characters. ICU uses the type UChar32 for them.
39 * Unicode code units are the storage units of a given
40 * Unicode/UCS Transformation Format (a character encoding scheme).
41 * With UTF-16, all code points can be represented with either one
42 * or two code units ("surrogates").
43 * String storage is typically based on code units, while properties
44 * of characters are typically determined using code point values.
45 * Some processes may be designed to work with sequences of code units,
46 * or it may be known that all characters that are important to an
47 * algorithm can be represented with single code units.
48 * Other processes will need to use the code point access functions.</p>
50 * <p>ForwardCharacterIterator provides nextPostInc() to access
51 * a code unit and advance an internal position into the text object,
52 * similar to a <code>return text[position++]</code>.<br>
53 * It provides next32PostInc() to access a code point and advance an internal
56 * <p>next32PostInc() assumes that the current position is that of
57 * the beginning of a code point, i.e., of its first code unit.
58 * After next32PostInc(), this will be true again.
59 * In general, access to code units and code points in the same
60 * iteration loop should not be mixed. In UTF-16, if the current position
61 * is on a second code unit (Low Surrogate), then only that code unit
62 * is returned even by next32PostInc().</p>
64 * <p>For iteration with either function, there are two ways to
65 * check for the end of the iteration. When there are no more
66 * characters in the text object:
68 * <li>The hasNext() function returns FALSE.</li>
69 * <li>nextPostInc() and next32PostInc() return DONE
70 * when one attempts to read beyond the end of the text object.</li>
75 * void function1(ForwardCharacterIterator &it) {
77 * while(it.hasNext()) {
78 * c=it.next32PostInc();
83 * void function2(ForwardCharacterIterator &it) {
85 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
94 class U_COMMON_API ForwardCharacterIterator
: public UObject
{
97 * Value returned by most of ForwardCharacterIterator's functions
98 * when the iterator has reached the limits of its iteration.
101 enum { DONE
= 0xffff };
107 virtual ~ForwardCharacterIterator();
110 * Returns true when both iterators refer to the same
111 * character in the same character-storage object.
112 * @param that The ForwardCharacterIterator to be compared for equality
113 * @return true when both iterators refer to the same
114 * character in the same character-storage object
117 virtual UBool
operator==(const ForwardCharacterIterator
& that
) const = 0;
120 * Returns true when the iterators refer to different
121 * text-storage objects, or to different characters in the
122 * same text-storage object.
123 * @param that The ForwardCharacterIterator to be compared for inequality
124 * @return true when the iterators refer to different
125 * text-storage objects, or to different characters in the
126 * same text-storage object
129 inline UBool
operator!=(const ForwardCharacterIterator
& that
) const;
132 * Generates a hash code for this iterator.
133 * @return the hash code.
136 virtual int32_t hashCode(void) const = 0;
139 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
140 * RTTI").<P> Despite the fact that this function is public,
141 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
142 * @return a UClassID for this ForwardCharacterIterator
145 virtual UClassID
getDynamicClassID(void) const = 0;
148 * Gets the current code unit for returning and advances to the next code unit
149 * in the iteration range
150 * (toward endIndex()). If there are
151 * no more code units to return, returns DONE.
152 * @return the current code unit.
155 virtual char16_t nextPostInc(void) = 0;
158 * Gets the current code point for returning and advances to the next code point
159 * in the iteration range
160 * (toward endIndex()). If there are
161 * no more code points to return, returns DONE.
162 * @return the current code point.
165 virtual UChar32
next32PostInc(void) = 0;
168 * Returns FALSE if there are no more code units or code points
169 * at or after the current position in the iteration range.
170 * This is used with nextPostInc() or next32PostInc() in forward iteration.
172 * @returns FALSE if there are no more code units or code points
173 * at or after the current position in the iteration range.
176 virtual UBool
hasNext() = 0;
179 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
180 ForwardCharacterIterator();
182 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
183 ForwardCharacterIterator(const ForwardCharacterIterator
&other
);
186 * Assignment operator to be overridden in the implementing class.
189 ForwardCharacterIterator
&operator=(const ForwardCharacterIterator
&) { return *this; } // Intentionally copies nothing: the parameter is unnamed and the body only returns *this.
193 * Abstract class that defines an API for iteration
195 * This is an interface for forward and backward iteration
196 * and random access into a text object.
198 * <p>The API provides backward compatibility to the Java and older ICU
199 * CharacterIterator classes but extends them significantly:
201 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
202 * <li>While the old API functions provided forward iteration with
203 * "pre-increment" semantics, the new one also provides functions
204 * with "post-increment" semantics. They are more efficient and should
205 * be the preferred iterator functions for new implementations.
206 * The backward iteration always had "pre-decrement" semantics, which
207 * are efficient.</li>
208 * <li>Just like ForwardCharacterIterator, it provides access to
209 * both code units and code points. Code point access versions are available
210 * for the old and the new iteration semantics.</li>
211 * <li>There are new functions for setting and moving the current position
212 * without returning a character, for efficiency.</li>
215 * See ForwardCharacterIterator for examples for using the new forward iteration
216 * functions. For backward iteration, there is also a hasPrevious() function
217 * that can be used analogously to hasNext().
218 * The old functions work as before and are shown below.</p>
220 * <p>Examples for some of the new functions:</p>
222 * Forward iteration with hasNext():
224 * void forward1(CharacterIterator &it) {
226 * for(it.setToStart(); it.hasNext();) {
227 * c=it.next32PostInc();
232 * Forward iteration more similar to loops with the old forward iteration,
233 * showing a way to convert simple for() loops:
235 * void forward2(CharacterIterator &it) {
237 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
242 * Backward iteration with setToEnd() and hasPrevious():
244 * void backward1(CharacterIterator &it) {
246 * for(it.setToEnd(); it.hasPrevious();) {
252 * Backward iteration with a more traditional for() loop:
254 * void backward2(CharacterIterator &it) {
256 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
262 * Example for random access:
264 * void random(CharacterIterator &it) {
265 * // set to the third code point from the beginning
266 * it.move32(3, CharacterIterator::kStart);
267 * // get a code point from here without moving the position
268 * UChar32 c=it.current32();
269 * // get the position
270 * int32_t pos=it.getIndex();
271 * // get the previous code unit
272 * char16_t u=it.previous();
273 * // move back one more code unit
274 * it.move(-1, CharacterIterator::kCurrent);
275 * // set the position back to where it was
276 * // and read the same code point c and move beyond it
278 * if(c!=it.next32PostInc()) {
279 * exit(1); // CharacterIterator inconsistent
284 * <p>Examples, especially for the old API:</p>
286 * Function processing characters, in this example simple output
289 * void processChar( char16_t c )
295 * Traverse the text from start to finish
298 * void traverseForward(CharacterIterator& iter)
300 * for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
306 * Traverse the text backwards, from end to start
309 * void traverseBackward(CharacterIterator& iter)
311 * for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
317 * Traverse both forward and backward from a given position in the text.
318 * Calls to notBoundary() in this example represents some additional stopping criteria.
321 * void traverseOut(CharacterIterator& iter, int32_t pos)
324 * for (c = iter.setIndex(pos);
325 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
326 * c = iter.next()) {}
327 * int32_t end = iter.getIndex();
328 * for (c = iter.setIndex(pos);
329 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
330 * c = iter.previous()) {}
331 * int32_t start = iter.getIndex() + 1;
333 * cout << "start: " << start << " end: " << end << endl;
334 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
340 * Creating a StringCharacterIterator and calling the test functions
343 * void CharacterIterator_Example( void )
345 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
346 * UnicodeString text("Ein kleiner Satz.");
347 * StringCharacterIterator iterator(text);
348 * cout << "----- traverseForward: -----------" << endl;
349 * traverseForward( iterator );
350 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
351 * traverseBackward( iterator );
352 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
353 * traverseOut( iterator, 7 );
354 * cout << endl << endl << "-----" << endl;
361 class U_COMMON_API CharacterIterator
: public ForwardCharacterIterator
{
364 * Origin enumeration for the move() and move32() functions.
367 enum EOrigin
{ kStart
, kCurrent
, kEnd
};
373 virtual ~CharacterIterator();
376 * Returns a pointer to a new CharacterIterator of the same
377 * concrete class as this one, and referring to the same
378 * character in the same text-storage object as this one. The
379 * caller is responsible for deleting the new clone.
380 * @return a pointer to a new CharacterIterator
383 virtual CharacterIterator
* clone() const = 0;
386 * Sets the iterator to refer to the first code unit in its
387 * iteration range, and returns that code unit.
388 * This can be used to begin an iteration with next().
389 * @return the first code unit in its iteration range.
392 virtual char16_t first(void) = 0;
395 * Sets the iterator to refer to the first code unit in its
396 * iteration range, returns that code unit, and moves the position
397 * to the second code unit. This is an alternative to setToStart()
398 * for forward iteration with nextPostInc().
399 * @return the first code unit in its iteration range.
402 virtual char16_t firstPostInc(void);
405 * Sets the iterator to refer to the first code point in its
406 * iteration range, and returns that code point.
407 * This can be used to begin an iteration with next32().
408 * Note that an iteration with next32PostInc(), beginning with,
409 * e.g., setToStart() or firstPostInc(), is more efficient.
410 * @return the first code point in its iteration range.
413 virtual UChar32
first32(void) = 0;
416 * Sets the iterator to refer to the first code point in its
417 * iteration range, returns that code point, and moves the position
418 * to the second code point. This is an alternative to setToStart()
419 * for forward iteration with next32PostInc().
420 * @return the first code point in its iteration range.
423 virtual UChar32
first32PostInc(void);
426 * Sets the iterator to refer to the first code unit or code point in its
427 * iteration range. This can be used to begin a forward
428 * iteration with nextPostInc() or next32PostInc().
429 * @return the start position of the iteration range
432 inline int32_t setToStart();
435 * Sets the iterator to refer to the last code unit in its
436 * iteration range, and returns that code unit.
437 * This can be used to begin an iteration with previous().
438 * @return the last code unit.
441 virtual char16_t last(void) = 0;
444 * Sets the iterator to refer to the last code point in its
445 * iteration range, and returns that code point.
446 * This can be used to begin an iteration with previous32().
447 * @return the last code point.
450 virtual UChar32
last32(void) = 0;
453 * Sets the iterator to the end of its iteration range, just behind
454 * the last code unit or code point. This can be used to begin a backward
455 * iteration with previous() or previous32().
456 * @return the end position of the iteration range
459 inline int32_t setToEnd();
462 * Sets the iterator to refer to the "position"-th code unit
463 * in the text-storage object the iterator refers to, and
464 * returns that code unit.
465 * @param position the "position"-th code unit in the text-storage object
466 * @return the "position"-th code unit.
469 virtual char16_t setIndex(int32_t position
) = 0;
472 * Sets the iterator to refer to the beginning of the code point
473 * that contains the "position"-th code unit
474 * in the text-storage object the iterator refers to, and
475 * returns that code point.
476 * The current position is adjusted to the beginning of the code point
477 * (its first code unit).
478 * @param position the "position"-th code unit in the text-storage object
479 * @return the code point that contains the "position"-th code unit.
482 virtual UChar32
setIndex32(int32_t position
) = 0;
485 * Returns the code unit the iterator currently refers to.
486 * @return the current code unit.
489 virtual char16_t current(void) const = 0;
492 * Returns the code point the iterator currently refers to.
493 * @return the current code point.
496 virtual UChar32
current32(void) const = 0;
499 * Advances to the next code unit in the iteration range
500 * (toward endIndex()), and returns that code unit. If there are
501 * no more code units to return, returns DONE.
502 * @return the next code unit.
505 virtual char16_t next(void) = 0;
508 * Advances to the next code point in the iteration range
509 * (toward endIndex()), and returns that code point. If there are
510 * no more code points to return, returns DONE.
511 * Note that iteration with "pre-increment" semantics is less
512 * efficient than iteration with "post-increment" semantics
513 * that is provided by next32PostInc().
514 * @return the next code point.
517 virtual UChar32
next32(void) = 0;
520 * Advances to the previous code unit in the iteration range
521 * (toward startIndex()), and returns that code unit. If there are
522 * no more code units to return, returns DONE.
523 * @return the previous code unit.
526 virtual char16_t previous(void) = 0;
529 * Advances to the previous code point in the iteration range
530 * (toward startIndex()), and returns that code point. If there are
531 * no more code points to return, returns DONE.
532 * @return the previous code point.
535 virtual UChar32
previous32(void) = 0;
538 * Returns FALSE if there are no more code units or code points
539 * before the current position in the iteration range.
540 * This is used with previous() or previous32() in backward iteration.
542 * @return FALSE if there are no more code units or code points
543 * before the current position in the iteration range, return TRUE otherwise.
546 virtual UBool
hasPrevious() = 0;
549 * Returns the numeric index in the underlying text-storage
550 * object of the character returned by first(). Since it's
551 * possible to create an iterator that iterates across only
552 * part of a text-storage object, this number isn't necessarily 0.
554 * @returns the numeric index in the underlying text-storage
555 * object of the character returned by first().
558 inline int32_t startIndex(void) const;
561 * Returns the numeric index in the underlying text-storage
562 * object of the position immediately BEYOND the character
563 * returned by last().
564 * @return the numeric index in the underlying text-storage
565 * object of the position immediately BEYOND the character
566 * returned by last().
569 inline int32_t endIndex(void) const;
572 * Returns the numeric index in the underlying text-storage
573 * object of the character the iterator currently refers to
574 * (i.e., the character returned by current()).
575 * @return the numeric index in the text-storage object of
576 * the character the iterator currently refers to
579 inline int32_t getIndex(void) const;
582 * Returns the length of the entire text in the underlying
583 * text-storage object.
584 * @return the length of the entire text in the text-storage object
587 inline int32_t getLength() const;
590 * Moves the current position relative to the start or end of the
591 * iteration range, or relative to the current position itself.
592 * The movement is expressed in numbers of code units forward
593 * or backward by specifying a positive or negative delta.
594 * @param delta the position relative to origin. A positive delta means forward;
595 * a negative delta means backward.
596 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
597 * @return the new position
600 virtual int32_t move(int32_t delta
, EOrigin origin
) = 0;
603 * Moves the current position relative to the start or end of the
604 * iteration range, or relative to the current position itself.
605 * The movement is expressed in numbers of code points forward
606 * or backward by specifying a positive or negative delta.
607 * @param delta the position relative to origin. A positive delta means forward;
608 * a negative delta means backward.
609 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
610 * @return the new position
614 // One of the system headers right now is sometimes defining a conflicting macro we don't use
617 virtual int32_t move32(int32_t delta
, EOrigin origin
) = 0;
620 * Copies the text under iteration into the UnicodeString
621 * referred to by "result".
622 * @param result Receives a copy of the text under iteration.
625 virtual void getText(UnicodeString
& result
) = 0;
635 * Constructor, just setting the length field in this base class.
638 CharacterIterator(int32_t length
);
641 * Constructor, just setting the length and position fields in this base class.
644 CharacterIterator(int32_t length
, int32_t position
);
647 * Constructor, just setting the length, start, end, and position fields in this base class.
650 CharacterIterator(int32_t length
, int32_t textBegin
, int32_t textEnd
, int32_t position
);
655 * @param that The CharacterIterator to be copied
658 CharacterIterator(const CharacterIterator
&that
);
661 * Assignment operator. Sets this CharacterIterator to have the same behavior,
662 * as the one passed in.
663 * @param that The CharacterIterator passed in.
664 * @return the newly set CharacterIterator.
667 CharacterIterator
&operator=(const CharacterIterator
&that
);
670 * Base class text length field.
671 * This is necessary for correct getText() and hashCode().
677 * Base class field for the current position.
683 * Base class field for the start of the iteration range.
689 * Base class field for the end of the iteration range.
696 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator
& that
) const {
697 return !operator==(that
);
701 CharacterIterator::setToStart() {
702 return move(0, kStart
);
706 CharacterIterator::setToEnd() {
707 return move(0, kEnd
);
711 CharacterIterator::startIndex(void) const {
716 CharacterIterator::endIndex(void) const {
721 CharacterIterator::getIndex(void) const {
726 CharacterIterator::getLength(void) const {
732 #endif /* U_SHOW_CPLUSPLUS_API */