1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ********************************************************************
6 * Copyright (C) 1997-2011, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ********************************************************************
15 #include "unicode/utypes.h"
16 #include "unicode/uobject.h"
17 #include "unicode/unistr.h"
20 * \brief C++ API: Character Iterator
23 #if U_SHOW_CPLUSPLUS_API
26 * Abstract class that defines an API for forward-only iteration
28 * This is a minimal interface for iteration without random access
29 * or backwards iteration. It is especially useful for wrapping
30 * streams with converters into an object for collation or
33 * <p>Characters can be accessed in two ways: as code units or as
35 * Unicode code points are 21-bit integers and are the scalar values
36 * of Unicode characters. ICU uses the type UChar32 for them.
37 * Unicode code units are the storage units of a given
38 * Unicode/UCS Transformation Format (a character encoding scheme).
39 * With UTF-16, all code points can be represented with either one
40 * or two code units ("surrogates").
41 * String storage is typically based on code units, while properties
42 * of characters are typically determined using code point values.
43 * Some processes may be designed to work with sequences of code units,
44 * or it may be known that all characters that are important to an
45 * algorithm can be represented with single code units.
46 * Other processes will need to use the code point access functions.</p>
48 * <p>ForwardCharacterIterator provides nextPostInc() to access
49 * a code unit and advance an internal position into the text object,
50 * similar to a <code>return text[position++]</code>.<br>
51 * It provides next32PostInc() to access a code point and advance an internal
54 * <p>next32PostInc() assumes that the current position is that of
55 * the beginning of a code point, i.e., of its first code unit.
56 * After next32PostInc(), this will be true again.
57 * In general, access to code units and code points in the same
58 * iteration loop should not be mixed. In UTF-16, if the current position
59 * is on a second code unit (Low Surrogate), then only that code unit
60 * is returned even by next32PostInc().</p>
62 * <p>For iteration with either function, there are two ways to
63 * check for the end of the iteration. When there are no more
64 * characters in the text object:
66 * <li>The hasNext() function returns FALSE.</li>
67 * <li>nextPostInc() and next32PostInc() return DONE
68 * when one attempts to read beyond the end of the text object.</li>
73 * void function1(ForwardCharacterIterator &it) {
75 * while(it.hasNext()) {
76 * c=it.next32PostInc();
81 * void function2(ForwardCharacterIterator &it) {
83 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
92 class U_COMMON_API ForwardCharacterIterator
: public UObject
{
95 * Value returned by most of ForwardCharacterIterator's functions
96 * when the iterator has reached the limits of its iteration.
99 enum { DONE
= 0xffff };
105 virtual ~ForwardCharacterIterator();
108 * Returns true when both iterators refer to the same
109 * character in the same character-storage object.
110 * @param that The ForwardCharacterIterator to be compared for equality
111 * @return true when both iterators refer to the same
112 * character in the same character-storage object
115 virtual UBool
operator==(const ForwardCharacterIterator
& that
) const = 0;
118 * Returns true when the iterators refer to different
119 * text-storage objects, or to different characters in the
120 * same text-storage object.
121 * @param that The ForwardCharacterIterator to be compared for inequality
122 * @return true when the iterators refer to different
123 * text-storage objects, or to different characters in the
124 * same text-storage object
127 inline UBool
operator!=(const ForwardCharacterIterator
& that
) const;
130 * Generates a hash code for this iterator.
131 * @return the hash code.
134 virtual int32_t hashCode(void) const = 0;
137 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
138 * RTTI").<P> Despite the fact that this function is public,
139 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
140 * @return a UClassID for this ForwardCharacterIterator
143 virtual UClassID
getDynamicClassID(void) const = 0;
146 * Gets the current code unit for returning and advances to the next code unit
147 * in the iteration range
148 * (toward endIndex()). If there are
149 * no more code units to return, returns DONE.
150 * @return the current code unit.
153 virtual char16_t nextPostInc(void) = 0;
156 * Gets the current code point for returning and advances to the next code point
157 * in the iteration range
158 * (toward endIndex()). If there are
159 * no more code points to return, returns DONE.
160 * @return the current code point.
163 virtual UChar32
next32PostInc(void) = 0;
166 * Returns FALSE if there are no more code units or code points
167 * at or after the current position in the iteration range.
168 * This is used with nextPostInc() or next32PostInc() in forward
170 * @returns FALSE if there are no more code units or code points
171 * at or after the current position in the iteration range.
174 virtual UBool
hasNext() = 0;
177 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
178 ForwardCharacterIterator();
180 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
181 ForwardCharacterIterator(const ForwardCharacterIterator
&other
);
184 * Assignment operator to be overridden in the implementing class.
// Performs no member copying here: the source operand is unnamed and unused, and the operator simply returns *this.
187 ForwardCharacterIterator
&operator=(const ForwardCharacterIterator
&) { return *this; }
191 * Abstract class that defines an API for iteration
193 * This is an interface for forward and backward iteration
194 * and random access into a text object.
196 * <p>The API provides backward compatibility to the Java and older ICU
197 * CharacterIterator classes but extends them significantly:
199 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
200 * <li>While the old API functions provided forward iteration with
201 * "pre-increment" semantics, the new one also provides functions
202 * with "post-increment" semantics. They are more efficient and should
203 * be the preferred iterator functions for new implementations.
204 * The backward iteration always had "pre-decrement" semantics, which
205 * are efficient.</li>
206 * <li>Just like ForwardCharacterIterator, it provides access to
207 * both code units and code points. Code point access versions are available
208 * for the old and the new iteration semantics.</li>
209 * <li>There are new functions for setting and moving the current position
210 * without returning a character, for efficiency.</li>
213 * See ForwardCharacterIterator for examples for using the new forward iteration
214 * functions. For backward iteration, there is also a hasPrevious() function
215 * that can be used analogously to hasNext().
216 * The old functions work as before and are shown below.</p>
218 * <p>Examples for some of the new functions:</p>
220 * Forward iteration with hasNext():
222 * void forward1(CharacterIterator &it) {
224 * for(it.setToStart(); it.hasNext();) {
225 * c=it.next32PostInc();
230 * Forward iteration more similar to loops with the old forward iteration,
231 * showing a way to convert simple for() loops:
233 * void forward2(CharacterIterator &it) {
235 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
240 * Backward iteration with setToEnd() and hasPrevious():
242 * void backward1(CharacterIterator &it) {
244 * for(it.setToEnd(); it.hasPrevious();) {
250 * Backward iteration with a more traditional for() loop:
252 * void backward2(CharacterIterator &it) {
254 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
260 * Example for random access:
262 * void random(CharacterIterator &it) {
263 * // set to the third code point from the beginning
264 * it.move32(3, CharacterIterator::kStart);
265 * // get a code point from here without moving the position
266 * UChar32 c=it.current32();
267 * // get the position
268 * int32_t pos=it.getIndex();
269 * // get the previous code unit
270 * char16_t u=it.previous();
271 * // move back one more code unit
272 * it.move(-1, CharacterIterator::kCurrent);
273 * // set the position back to where it was
274 * // and read the same code point c and move beyond it
276 * if(c!=it.next32PostInc()) {
277 * exit(1); // CharacterIterator inconsistent
282 * <p>Examples, especially for the old API:</p>
284 * Function processing characters, in this example simple output
287 * void processChar( char16_t c )
293 * Traverse the text from start to finish
296 * void traverseForward(CharacterIterator& iter)
298 * for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
304 * Traverse the text backwards, from end to start
307 * void traverseBackward(CharacterIterator& iter)
309 * for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
315 * Traverse both forward and backward from a given position in the text.
316 * Calls to notBoundary() in this example represent some additional stopping criteria.
319 * void traverseOut(CharacterIterator& iter, int32_t pos)
322 * for (c = iter.setIndex(pos);
323 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
324 * c = iter.next()) {}
325 * int32_t end = iter.getIndex();
326 * for (c = iter.setIndex(pos);
327 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
328 * c = iter.previous()) {}
329 * int32_t start = iter.getIndex() + 1;
331 * cout << "start: " << start << " end: " << end << endl;
332 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
338 * Creating a StringCharacterIterator and calling the test functions
341 * void CharacterIterator_Example( void )
343 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
344 * UnicodeString text("Ein kleiner Satz.");
345 * StringCharacterIterator iterator(text);
346 * cout << "----- traverseForward: -----------" << endl;
347 * traverseForward( iterator );
348 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
349 * traverseBackward( iterator );
350 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
351 * traverseOut( iterator, 7 );
352 * cout << endl << endl << "-----" << endl;
359 class U_COMMON_API CharacterIterator
: public ForwardCharacterIterator
{
362 * Origin enumeration for the move() and move32() functions.
365 enum EOrigin
{ kStart
, kCurrent
, kEnd
};
371 virtual ~CharacterIterator();
374 * Returns a pointer to a new CharacterIterator of the same
375 * concrete class as this one, and referring to the same
376 * character in the same text-storage object as this one. The
377 * caller is responsible for deleting the new clone.
378 * @return a pointer to a new CharacterIterator
381 virtual CharacterIterator
* clone(void) const = 0;
384 * Sets the iterator to refer to the first code unit in its
385 * iteration range, and returns that code unit.
386 * This can be used to begin an iteration with next().
387 * @return the first code unit in its iteration range.
390 virtual char16_t first(void) = 0;
393 * Sets the iterator to refer to the first code unit in its
394 * iteration range, returns that code unit, and moves the position
395 * to the second code unit. This is an alternative to setToStart()
396 * for forward iteration with nextPostInc().
397 * @return the first code unit in its iteration range.
400 virtual char16_t firstPostInc(void);
403 * Sets the iterator to refer to the first code point in its
404 * iteration range, and returns that code point.
405 * This can be used to begin an iteration with next32().
406 * Note that an iteration with next32PostInc(), beginning with,
407 * e.g., setToStart() or firstPostInc(), is more efficient.
408 * @return the first code point in its iteration range.
411 virtual UChar32
first32(void) = 0;
414 * Sets the iterator to refer to the first code point in its
415 * iteration range, returns that code point, and moves the position
416 * to the second code point. This is an alternative to setToStart()
417 * for forward iteration with next32PostInc().
418 * @return the first code point in its iteration range.
421 virtual UChar32
first32PostInc(void);
424 * Sets the iterator to refer to the first code unit or code point in its
425 * iteration range. This can be used to begin a forward
426 * iteration with nextPostInc() or next32PostInc().
427 * @return the start position of the iteration range
430 inline int32_t setToStart();
433 * Sets the iterator to refer to the last code unit in its
434 * iteration range, and returns that code unit.
435 * This can be used to begin an iteration with previous().
436 * @return the last code unit.
439 virtual char16_t last(void) = 0;
442 * Sets the iterator to refer to the last code point in its
443 * iteration range, and returns that code point.
444 * This can be used to begin an iteration with previous32().
445 * @return the last code point.
448 virtual UChar32
last32(void) = 0;
451 * Sets the iterator to the end of its iteration range, just behind
452 * the last code unit or code point. This can be used to begin a backward
453 * iteration with previous() or previous32().
454 * @return the end position of the iteration range
457 inline int32_t setToEnd();
460 * Sets the iterator to refer to the "position"-th code unit
461 * in the text-storage object the iterator refers to, and
462 * returns that code unit.
463 * @param position the "position"-th code unit in the text-storage object
464 * @return the "position"-th code unit.
467 virtual char16_t setIndex(int32_t position
) = 0;
470 * Sets the iterator to refer to the beginning of the code point
471 * that contains the "position"-th code unit
472 * in the text-storage object the iterator refers to, and
473 * returns that code point.
474 * The current position is adjusted to the beginning of the code point
475 * (its first code unit).
476 * @param position the "position"-th code unit in the text-storage object
477 * @return the code point that contains the "position"-th code unit.
480 virtual UChar32
setIndex32(int32_t position
) = 0;
483 * Returns the code unit the iterator currently refers to.
484 * @return the current code unit.
487 virtual char16_t current(void) const = 0;
490 * Returns the code point the iterator currently refers to.
491 * @return the current code point.
494 virtual UChar32
current32(void) const = 0;
497 * Advances to the next code unit in the iteration range
498 * (toward endIndex()), and returns that code unit. If there are
499 * no more code units to return, returns DONE.
500 * @return the next code unit.
503 virtual char16_t next(void) = 0;
506 * Advances to the next code point in the iteration range
507 * (toward endIndex()), and returns that code point. If there are
508 * no more code points to return, returns DONE.
509 * Note that iteration with "pre-increment" semantics is less
510 * efficient than iteration with "post-increment" semantics
511 * that is provided by next32PostInc().
512 * @return the next code point.
515 virtual UChar32
next32(void) = 0;
518 * Advances to the previous code unit in the iteration range
519 * (toward startIndex()), and returns that code unit. If there are
520 * no more code units to return, returns DONE.
521 * @return the previous code unit.
524 virtual char16_t previous(void) = 0;
527 * Advances to the previous code point in the iteration range
528 * (toward startIndex()), and returns that code point. If there are
529 * no more code points to return, returns DONE.
530 * @return the previous code point.
533 virtual UChar32
previous32(void) = 0;
536 * Returns FALSE if there are no more code units or code points
537 * before the current position in the iteration range.
538 * This is used with previous() or previous32() in backward
540 * @return FALSE if there are no more code units or code points
541 * before the current position in the iteration range, return TRUE otherwise.
544 virtual UBool
hasPrevious() = 0;
547 * Returns the numeric index in the underlying text-storage
548 * object of the character returned by first(). Since it's
549 * possible to create an iterator that iterates across only
550 * part of a text-storage object, this number isn't
552 * @returns the numeric index in the underlying text-storage
553 * object of the character returned by first().
556 inline int32_t startIndex(void) const;
559 * Returns the numeric index in the underlying text-storage
560 * object of the position immediately BEYOND the character
561 * returned by last().
562 * @return the numeric index in the underlying text-storage
563 * object of the position immediately BEYOND the character
564 * returned by last().
567 inline int32_t endIndex(void) const;
570 * Returns the numeric index in the underlying text-storage
571 * object of the character the iterator currently refers to
572 * (i.e., the character returned by current()).
573 * @return the numeric index in the text-storage object of
574 * the character the iterator currently refers to
577 inline int32_t getIndex(void) const;
580 * Returns the length of the entire text in the underlying
581 * text-storage object.
582 * @return the length of the entire text in the text-storage object
585 inline int32_t getLength() const;
588 * Moves the current position relative to the start or end of the
589 * iteration range, or relative to the current position itself.
590 * The movement is expressed in numbers of code units forward
591 * or backward by specifying a positive or negative delta.
592 * @param delta the position relative to origin. A positive delta means forward;
593 * a negative delta means backward.
594 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
595 * @return the new position
598 virtual int32_t move(int32_t delta
, EOrigin origin
) = 0;
601 * Moves the current position relative to the start or end of the
602 * iteration range, or relative to the current position itself.
603 * The movement is expressed in numbers of code points forward
604 * or backward by specifying a positive or negative delta.
605 * @param delta the position relative to origin. A positive delta means forward;
606 * a negative delta means backward.
607 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
608 * @return the new position
612 // One of the system headers right now is sometimes defining a conflicting macro we don't use
615 virtual int32_t move32(int32_t delta
, EOrigin origin
) = 0;
618 * Copies the text under iteration into the UnicodeString
619 * referred to by "result".
620 * @param result Receives a copy of the text under iteration.
623 virtual void getText(UnicodeString
& result
) = 0;
633 * Constructor, just setting the length field in this base class.
636 CharacterIterator(int32_t length
);
639 * Constructor, just setting the length and position fields in this base class.
642 CharacterIterator(int32_t length
, int32_t position
);
645 * Constructor, just setting the length, start, end, and position fields in this base class.
648 CharacterIterator(int32_t length
, int32_t textBegin
, int32_t textEnd
, int32_t position
);
653 * @param that The CharacterIterator to be copied
656 CharacterIterator(const CharacterIterator
&that
);
659 * Assignment operator. Sets this CharacterIterator to have the same behavior
660 * as the one passed in.
661 * @param that The CharacterIterator passed in.
662 * @return the newly set CharacterIterator.
665 CharacterIterator
&operator=(const CharacterIterator
&that
);
668 * Base class text length field.
669 * This is necessary for correct getText() and hashCode().
675 * Base class field for the current position.
681 * Base class field for the start of the iteration range.
687 * Base class field for the end of the iteration range.
694 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator
& that
) const {
695 return !operator==(that
);
699 CharacterIterator::setToStart() {
700 return move(0, kStart
);
704 CharacterIterator::setToEnd() {
705 return move(0, kEnd
);
709 CharacterIterator::startIndex(void) const {
714 CharacterIterator::endIndex(void) const {
719 CharacterIterator::getIndex(void) const {
724 CharacterIterator::getLength(void) const {
729 #endif // U_SHOW_CPLUSPLUS_API