]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/chariter.h
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / unicode / chariter.h
CommitLineData
b75a7d8f
A
1/*
2********************************************************************
3*
374ca955 4* Copyright (C) 1997-2004, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7********************************************************************
8*/
9
10#ifndef CHARITER_H
11#define CHARITER_H
12
13#include "unicode/utypes.h"
14#include "unicode/uobject.h"
15#include "unicode/unistr.h"
16
17U_NAMESPACE_BEGIN
18/**
19 * Abstract class that defines an API for forward-only iteration
20 * on text objects.
21 * This is a minimal interface for iteration without random access
22 * or backwards iteration. It is especially useful for wrapping
23 * streams with converters into an object for collation or
24 * normalization.
25 *
26 * <p>Characters can be accessed in two ways: as code units or as
27 * code points.
28 * Unicode code points are 21-bit integers and are the scalar values
29 * of Unicode characters. ICU uses the type UChar32 for them.
30 * Unicode code units are the storage units of a given
31 * Unicode/UCS Transformation Format (a character encoding scheme).
32 * With UTF-16, all code points can be represented with either one
33 * or two code units ("surrogates").
34 * String storage is typically based on code units, while properties
35 * of characters are typically determined using code point values.
36 * Some processes may be designed to work with sequences of code units,
37 * or it may be known that all characters that are important to an
38 * algorithm can be represented with single code units.
39 * Other processes will need to use the code point access functions.</p>
40 *
41 * <p>ForwardCharacterIterator provides nextPostInc() to access
42 * a code unit and advance an internal position into the text object,
43 * similar to a <code>return text[position++]</code>.<br>
44 * It provides next32PostInc() to access a code point and advance an internal
45 * position.</p>
46 *
47 * <p>next32PostInc() assumes that the current position is that of
48 * the beginning of a code point, i.e., of its first code unit.
49 * After next32PostInc(), this will be true again.
50 * In general, access to code units and code points in the same
51 * iteration loop should not be mixed. In UTF-16, if the current position
52 * is on a second code unit (Low Surrogate), then only that code unit
53 * is returned even by next32PostInc().</p>
54 *
55 * <p>For iteration with either function, there are two ways to
56 * check for the end of the iteration. When there are no more
57 * characters in the text object:
58 * <ul>
59 * <li>The hasNext() function returns FALSE.</li>
60 * <li>nextPostInc() and next32PostInc() return DONE
61 * when one attempts to read beyond the end of the text object.</li>
62 * </ul>
63 *
64 * Example:
65 * \code
66 * void function1(ForwardCharacterIterator &it) {
67 * UChar32 c;
68 * while(it.hasNext()) {
69 * c=it.next32PostInc();
70 * // use c
71 * }
72 * }
73 *
74 * void function2(ForwardCharacterIterator &it) {
75 * UChar c;
76 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
77 * // use c
78 * }
79 * }
80 * \endcode
81 * </p>
82 *
83 * @stable ICU 2.0
84 */
85class U_COMMON_API ForwardCharacterIterator : public UObject {
86public:
374ca955
A
87 /**
88 * Value returned by most of ForwardCharacterIterator's functions
89 * when the iterator has reached the limits of its iteration; its value is 0xffff.
90 * @stable ICU 2.0
91 */
92 enum { DONE = 0xffff };
93
94 /**
95 * Destructor.
96 * @stable ICU 2.0
97 */
98 virtual ~ForwardCharacterIterator();
99
100 /**
101 * Returns true when both iterators refer to the same
102 * character in the same character-storage object.
103 * @param that The ForwardCharacterIterator to be compared for equality
104 * @return true when both iterators refer to the same
105 * character in the same character-storage object
106 * @stable ICU 2.0
107 */
108 virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
109
110 /**
111 * Returns true when the iterators refer to different
112 * text-storage objects, or to different characters in the
113 * same text-storage object. Defined inline as the negation of operator==().
114 * @param that The ForwardCharacterIterator to be compared for inequality
115 * @return true when the iterators refer to different
116 * text-storage objects, or to different characters in the
117 * same text-storage object
118 * @stable ICU 2.0
119 */
120 inline UBool operator!=(const ForwardCharacterIterator& that) const;
121
122 /**
123 * Generates a hash code for this iterator.
124 * @return the hash code.
125 * @stable ICU 2.0
126 */
127 virtual int32_t hashCode(void) const = 0;
128
129 /**
130 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
131 * RTTI").<P> Despite the fact that this function is public,
132 * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
133 * @return a UClassID for this ForwardCharacterIterator
134 * @stable ICU 2.0
135 */
136 virtual UClassID getDynamicClassID(void) const = 0;
137
138 /**
139 * Gets the current code unit for returning and advances to the next code unit
140 * in the iteration range
141 * (toward endIndex()). If there are
142 * no more code units to return, returns DONE.
143 * @return the current code unit.
144 * @stable ICU 2.0
145 */
146 virtual UChar nextPostInc(void) = 0;
147
148 /**
149 * Gets the current code point for returning and advances to the next code point
150 * in the iteration range
151 * (toward endIndex()). If there are
152 * no more code points to return, returns DONE.
153 * @return the current code point.
154 * @stable ICU 2.0
155 */
156 virtual UChar32 next32PostInc(void) = 0;
157
158 /**
159 * Returns FALSE if there are no more code units or code points
160 * at or after the current position in the iteration range.
161 * This is used with nextPostInc() or next32PostInc() in forward
162 * iteration.
163 * @return FALSE if there are no more code units or code points
164 * at or after the current position in the iteration range.
165 * @stable ICU 2.0
166 */
167 virtual UBool hasNext() = 0;
168
b75a7d8f 169protected:
374ca955
A
170 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
171 ForwardCharacterIterator();
172
173 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
174 ForwardCharacterIterator(const ForwardCharacterIterator &other);
175
176 /**
177 * Assignment operator to be overridden in the implementing class. This base version copies nothing and returns *this.
178 * @stable ICU 2.0
179 */
180 ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
b75a7d8f
A
181};
182
183/**
184 * Abstract class that defines an API for iteration
185 * on text objects.
186 * This is an interface for forward and backward iteration
187 * and random access into a text object.
188 *
189 * <p>The API provides backward compatibility to the Java and older ICU
190 * CharacterIterator classes but extends them significantly:
191 * <ol>
192 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
193 * <li>While the old API functions provided forward iteration with
194 * "pre-increment" semantics, the new one also provides functions
195 * with "post-increment" semantics. They are more efficient and should
196 * be the preferred iterator functions for new implementations.
197 * The backward iteration always had "pre-decrement" semantics, which
198 * are efficient.</li>
199 * <li>Just like ForwardCharacterIterator, it provides access to
200 * both code units and code points. Code point access versions are available
201 * for the old and the new iteration semantics.</li>
202 * <li>There are new functions for setting and moving the current position
203 * without returning a character, for efficiency.</li>
204 * </ol>
205 *
206 * See ForwardCharacterIterator for examples for using the new forward iteration
207 * functions. For backward iteration, there is also a hasPrevious() function
208 * that can be used analogously to hasNext().
209 * The old functions work as before and are shown below.</p>
210 *
211 * <p>Examples for some of the new functions:</p>
212 *
213 * Forward iteration with hasNext():
214 * \code
215 * void forward1(CharacterIterator &it) {
216 * UChar32 c;
217 * for(it.setToStart(); it.hasNext();) {
218 * c=it.next32PostInc();
219 * // use c
220 * }
221 * }
222 * \endcode
223 * Forward iteration more similar to loops with the old forward iteration,
224 * showing a way to convert simple for() loops:
225 * \code
226 * void forward2(CharacterIterator &it) {
227 * UChar c;
228 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
229 * // use c
230 * }
231 * }
232 * \endcode
233 * Backward iteration with setToEnd() and hasPrevious():
234 * \code
235 * void backward1(CharacterIterator &it) {
236 * UChar32 c;
237 * for(it.setToEnd(); it.hasPrevious();) {
238 * c=it.previous32();
239 * // use c
240 * }
241 * }
242 * \endcode
243 * Backward iteration with a more traditional for() loop:
244 * \code
245 * void backward2(CharacterIterator &it) {
246 * UChar c;
247 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
248 * // use c
249 * }
250 * }
251 * \endcode
252 *
253 * Example for random access:
254 * \code
255 * void random(CharacterIterator &it) {
256 * // set to the third code point from the beginning
257 * it.move32(3, CharacterIterator::kStart);
258 * // get a code point from here without moving the position
259 * UChar32 c=it.current32();
260 * // get the position
261 * int32_t pos=it.getIndex();
262 * // get the previous code unit
263 * UChar u=it.previous();
264 * // move back one more code unit
265 * it.move(-1, CharacterIterator::kCurrent);
266 * // set the position back to where it was
267 * // and read the same code point c and move beyond it
268 * it.setIndex(pos);
269 * if(c!=it.next32PostInc()) {
270 * exit(1); // CharacterIterator inconsistent
271 * }
272 * }
273 * \endcode
274 *
275 * <p>Examples, especially for the old API:</p>
276 *
277 * Function processing characters, in this example simple output
278 * <pre>
279 * \code
280 * void processChar( UChar c )
281 * {
282 * cout << " " << c;
283 * }
284 * \endcode
285 * </pre>
286 * Traverse the text from start to finish
287 * <pre>
288 * \code
289 * void traverseForward(CharacterIterator& iter)
290 * {
291 * for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
292 * processChar(c);
293 * }
294 * }
295 * \endcode
296 * </pre>
297 * Traverse the text backwards, from end to start
298 * <pre>
299 * \code
300 * void traverseBackward(CharacterIterator& iter)
301 * {
302 * for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
303 * processChar(c);
304 * }
305 * }
306 * \endcode
307 * </pre>
308 * Traverse both forward and backward from a given position in the text.
309 * The isLetter()/isDigit() checks in this example represent some additional stopping criteria.
310 * <pre>
311 * \code
312 * void traverseOut(CharacterIterator& iter, int32_t pos)
313 * {
314 * UChar c;
315 * for (c = iter.setIndex(pos);
316 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
317 * c = iter.next()) {}
318 * int32_t end = iter.getIndex();
319 * for (c = iter.setIndex(pos);
320 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321 * c = iter.previous()) {}
322 * int32_t start = iter.getIndex() + 1;
323 *
324 * cout << "start: " << start << " end: " << end << endl;
325 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
326 * processChar(c);
327 * }
328 * }
329 * \endcode
330 * </pre>
331 * Creating a StringCharacterIterator and calling the test functions
332 * <pre>
333 * \code
334 * void CharacterIterator_Example( void )
335 * {
336 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
337 * UnicodeString text("Ein kleiner Satz.");
338 * StringCharacterIterator iterator(text);
339 * cout << "----- traverseForward: -----------" << endl;
340 * traverseForward( iterator );
341 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
342 * traverseBackward( iterator );
343 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
344 * traverseOut( iterator, 7 );
345 * cout << endl << endl << "-----" << endl;
346 * }
347 * \endcode
348 * </pre>
349 *
350 * @stable ICU 2.0
351 */
352class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
353public:
374ca955
A
354 /**
355 * Origin enumeration for the move() and move32() functions.
356 * @stable ICU 2.0
357 */
358 enum EOrigin { kStart, kCurrent, kEnd };
359
360 /**
361 * Returns a pointer to a new CharacterIterator of the same
362 * concrete class as this one, and referring to the same
363 * character in the same text-storage object as this one. The
364 * caller is responsible for deleting the new clone.
365 * @return a pointer to a new CharacterIterator
366 * @stable ICU 2.0
367 */
368 virtual CharacterIterator* clone(void) const = 0;
369
370 /**
371 * Sets the iterator to refer to the first code unit in its
372 * iteration range, and returns that code unit.
373 * This can be used to begin an iteration with next().
374 * @return the first code unit in its iteration range.
375 * @stable ICU 2.0
376 */
377 virtual UChar first(void) = 0;
378
379 /**
380 * Sets the iterator to refer to the first code unit in its
381 * iteration range, returns that code unit, and moves the position
382 * to the second code unit. This is an alternative to setToStart()
383 * for forward iteration with nextPostInc().
384 * @return the first code unit in its iteration range.
385 * @stable ICU 2.0
386 */
387 virtual UChar firstPostInc(void);
388
389 /**
390 * Sets the iterator to refer to the first code point in its
391 * iteration range, and returns that code point.
392 * This can be used to begin an iteration with next32().
393 * Note that an iteration with next32PostInc(), beginning with,
394 * e.g., setToStart() or firstPostInc(), is more efficient.
395 * @return the first code point in its iteration range.
396 * @stable ICU 2.0
397 */
398 virtual UChar32 first32(void) = 0;
399
400 /**
401 * Sets the iterator to refer to the first code point in its
402 * iteration range, returns that code point, and moves the position
403 * to the second code point. This is an alternative to setToStart()
404 * for forward iteration with next32PostInc().
405 * @return the first code point in its iteration range.
406 * @stable ICU 2.0
407 */
408 virtual UChar32 first32PostInc(void);
409
410 /**
411 * Sets the iterator to refer to the first code unit or code point in its
412 * iteration range. This can be used to begin a forward
413 * iteration with nextPostInc() or next32PostInc(). Defined inline as move(0, kStart).
414 * @return the start position of the iteration range
415 * @stable ICU 2.0
416 */
417 inline int32_t setToStart();
418
419 /**
420 * Sets the iterator to refer to the last code unit in its
421 * iteration range, and returns that code unit.
422 * This can be used to begin an iteration with previous().
423 * @return the last code unit.
424 * @stable ICU 2.0
425 */
426 virtual UChar last(void) = 0;
b75a7d8f 427
374ca955
A
428 /**
429 * Sets the iterator to refer to the last code point in its
430 * iteration range, and returns that code point.
431 * This can be used to begin an iteration with previous32().
432 * @return the last code point.
433 * @stable ICU 2.0
434 */
435 virtual UChar32 last32(void) = 0;
436
437 /**
438 * Sets the iterator to the end of its iteration range, just behind
439 * the last code unit or code point. This can be used to begin a backward
440 * iteration with previous() or previous32(). Defined inline as move(0, kEnd).
441 * @return the end position of the iteration range
442 * @stable ICU 2.0
443 */
444 inline int32_t setToEnd();
445
446 /**
447 * Sets the iterator to refer to the "position"-th code unit
448 * in the text-storage object the iterator refers to, and
449 * returns that code unit.
450 * @param position the "position"-th code unit in the text-storage object
451 * @return the "position"-th code unit.
452 * @stable ICU 2.0
453 */
454 virtual UChar setIndex(int32_t position) = 0;
455
456 /**
457 * Sets the iterator to refer to the beginning of the code point
458 * that contains the "position"-th code unit
459 * in the text-storage object the iterator refers to, and
460 * returns that code point.
461 * The current position is adjusted to the beginning of the code point
462 * (its first code unit).
463 * @param position the "position"-th code unit in the text-storage object
464 * @return the code point that contains the "position"-th code unit.
465 * @stable ICU 2.0
466 */
467 virtual UChar32 setIndex32(int32_t position) = 0;
468
469 /**
470 * Returns the code unit the iterator currently refers to.
471 * @return the current code unit.
472 * @stable ICU 2.0
473 */
474 virtual UChar current(void) const = 0;
b75a7d8f 475
374ca955
A
476 /**
477 * Returns the code point the iterator currently refers to.
478 * @return the current code point.
479 * @stable ICU 2.0
480 */
481 virtual UChar32 current32(void) const = 0;
b75a7d8f 482
374ca955
A
483 /**
484 * Advances to the next code unit in the iteration range
485 * (toward endIndex()), and returns that code unit. If there are
486 * no more code units to return, returns DONE.
487 * @return the next code unit.
488 * @stable ICU 2.0
489 */
490 virtual UChar next(void) = 0;
b75a7d8f 491
374ca955
A
492 /**
493 * Advances to the next code point in the iteration range
494 * (toward endIndex()), and returns that code point. If there are
495 * no more code points to return, returns DONE.
496 * Note that iteration with "pre-increment" semantics is less
497 * efficient than iteration with "post-increment" semantics
498 * that is provided by next32PostInc().
499 * @return the next code point.
500 * @stable ICU 2.0
501 */
502 virtual UChar32 next32(void) = 0;
b75a7d8f 503
374ca955
A
504 /**
505 * Advances to the previous code unit in the iteration range
506 * (toward startIndex()), and returns that code unit. If there are
507 * no more code units to return, returns DONE.
508 * @return the previous code unit.
509 * @stable ICU 2.0
510 */
511 virtual UChar previous(void) = 0;
512
513 /**
514 * Advances to the previous code point in the iteration range
515 * (toward startIndex()), and returns that code point. If there are
516 * no more code points to return, returns DONE.
517 * @return the previous code point.
518 * @stable ICU 2.0
519 */
520 virtual UChar32 previous32(void) = 0;
521
522 /**
523 * Returns FALSE if there are no more code units or code points
524 * before the current position in the iteration range.
525 * This is used with previous() or previous32() in backward
526 * iteration.
527 * @return FALSE if there are no more code units or code points
528 * before the current position in the iteration range, return TRUE otherwise.
529 * @stable ICU 2.0
530 */
531 virtual UBool hasPrevious() = 0;
532
533 /**
534 * Returns the numeric index in the underlying text-storage
535 * object of the character returned by first(). Since it's
536 * possible to create an iterator that iterates across only
537 * part of a text-storage object, this number isn't
538 * necessarily 0.
539 * @return the numeric index in the underlying text-storage
540 * object of the character returned by first().
541 * @stable ICU 2.0
542 */
543 inline int32_t startIndex(void) const;
b75a7d8f 544
374ca955
A
545 /**
546 * Returns the numeric index in the underlying text-storage
547 * object of the position immediately BEYOND the character
548 * returned by last().
549 * @return the numeric index in the underlying text-storage
550 * object of the position immediately BEYOND the character
551 * returned by last().
552 * @stable ICU 2.0
553 */
554 inline int32_t endIndex(void) const;
b75a7d8f 555
374ca955
A
556 /**
557 * Returns the numeric index in the underlying text-storage
558 * object of the character the iterator currently refers to
559 * (i.e., the character returned by current()).
560 * @return the numeric index in the text-storage object of
561 * the character the iterator currently refers to
562 * @stable ICU 2.0
563 */
564 inline int32_t getIndex(void) const;
565
566 /**
567 * Returns the length of the entire text in the underlying
568 * text-storage object.
569 * @return the length of the entire text in the text-storage object
570 * @stable ICU 2.0
571 */
572 inline int32_t getLength() const;
573
574 /**
575 * Moves the current position relative to the start or end of the
576 * iteration range, or relative to the current position itself.
577 * The movement is expressed in numbers of code units forward
578 * or backward by specifying a positive or negative delta.
579 * @param delta the position relative to origin. A positive delta means forward;
580 * a negative delta means backward.
581 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
582 * @return the new position
583 * @stable ICU 2.0
584 */
585 virtual int32_t move(int32_t delta, EOrigin origin) = 0;
586
587 /**
588 * Moves the current position relative to the start or end of the
589 * iteration range, or relative to the current position itself.
590 * The movement is expressed in numbers of code points forward
591 * or backward by specifying a positive or negative delta.
592 * @param delta the position relative to origin. A positive delta means forward;
593 * a negative delta means backward.
594 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
595 * @return the new position
596 * @stable ICU 2.0
597 */
598 virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
599
600 /**
601 * Copies the text under iteration into the UnicodeString
602 * referred to by "result".
603 * @param result Receives a copy of the text under iteration.
604 * @stable ICU 2.0
605 */
606 virtual void getText(UnicodeString& result) = 0;
b75a7d8f
A
607
608protected:
374ca955
A
609 /**
610 * Empty constructor.
611 * @stable ICU 2.0
612 */
613 CharacterIterator();
614
615 /**
616 * Constructor, just setting the length field in this base class.
617 * @stable ICU 2.0
618 */
619 CharacterIterator(int32_t length);
620
621 /**
622 * Constructor, just setting the length and position fields in this base class.
623 * @stable ICU 2.0
624 */
625 CharacterIterator(int32_t length, int32_t position);
626
627 /**
628 * Constructor, just setting the length, start, end, and position fields in this base class.
629 * @stable ICU 2.0
630 */
631 CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
b75a7d8f 632
374ca955
A
633 /**
634 * Copy constructor.
635 *
636 * @param that The CharacterIterator to be copied
637 * @stable ICU 2.0
638 */
639 CharacterIterator(const CharacterIterator &that);
640
641 /**
642 * Assignment operator. Sets this CharacterIterator to have the same behavior
643 * as the one passed in.
644 * @param that The CharacterIterator passed in.
645 * @return the newly set CharacterIterator.
646 * @stable ICU 2.0
647 */
648 CharacterIterator &operator=(const CharacterIterator &that);
649
650 /**
651 * Base class text length field; this is the value returned by getLength().
652 * This is necessary for correct getText() and hashCode().
653 * @stable ICU 2.0
654 */
655 int32_t textLength;
656
657 /**
658 * Base class field for the current position; this is the value returned by getIndex().
659 * @stable ICU 2.0
660 */
661 int32_t pos;
662
663 /**
664 * Base class field for the start of the iteration range; this is the value returned by startIndex().
665 * @stable ICU 2.0
666 */
667 int32_t begin;
668
669 /**
670 * Base class field for the end of the iteration range; this is the value returned by endIndex().
671 * @stable ICU 2.0
672 */
673 int32_t end;
b75a7d8f
A
674};
675
676inline UBool
677ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
374ca955 678 return !operator==(that);
b75a7d8f
A
679}
680
681inline int32_t
682CharacterIterator::setToStart() {
374ca955 683 return move(0, kStart);
b75a7d8f
A
684}
685
686inline int32_t
687CharacterIterator::setToEnd() {
374ca955 688 return move(0, kEnd);
b75a7d8f
A
689}
690
691inline int32_t
692CharacterIterator::startIndex(void) const {
374ca955 693 return begin;
b75a7d8f
A
694}
695
696inline int32_t
697CharacterIterator::endIndex(void) const {
374ca955 698 return end;
b75a7d8f
A
699}
700
701inline int32_t
702CharacterIterator::getIndex(void) const {
374ca955 703 return pos;
b75a7d8f
A
704}
705
706inline int32_t
707CharacterIterator::getLength(void) const {
374ca955 708 return textLength;
b75a7d8f
A
709}
710
711U_NAMESPACE_END
712#endif