2 *******************************************************************************
4 * Copyright (C) 2002-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jan18
14 * created by: Markus W. Scherer
22 * \brief C API: Unicode Character Iteration
27 #include "unicode/utypes.h"
/* Forward declaration of the C++ CharacterIterator class. Within this header it is
   referenced by pointer in the uiter_setCharacterIterator() declaration. */
32 class CharacterIterator
;
/* Introduces the plain name UCharIterator ahead of the struct definition so that the
   function-pointer typedefs declared before the struct can take a UCharIterator*. */
41 typedef struct UCharIterator UCharIterator
; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
44 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
45 * @see UCharIteratorMove
/* Selects which index a getIndex() call reports, or which index a move() delta
   is applied relative to. */
49 typedef enum UCharIteratorOrigin
{
50 UITER_START    /* the start index of the iteration range */
, UITER_CURRENT   /* the current index */
, UITER_LIMIT     /* the limit index of the iteration range */
, UITER_ZERO      /* index 0 */
, UITER_LENGTH    /* the length index; may require counting UChars if storage is not UTF-16 */
51 } UCharIteratorOrigin
;
53 /** Constants for UCharIterator. @stable ICU 2.6 */
56 * Constant value that may be returned by UCharIteratorMove
57 * indicating that the final UTF-16 index is not known, but that the move succeeded.
58 * This can occur when moving relative to limit or length, or
59 * when moving relative to the current index after a setState()
60 * when the current UTF-16 index is not known.
62 * It would be very inefficient to have to count from the beginning of the text
63 * just to get the current/limit/length index after moving relative to it.
64 * The actual index can be determined with getIndex(UITER_CURRENT)
65 * which will count the UChars if necessary.
69 UITER_UNKNOWN_INDEX
=-2
74 * Constant for UCharIterator getState() indicating an error or an unknown state.
76 * Returned by uiter_getState()/UCharIteratorGetState
77 * when an error occurs.
78 * Also, some UCharIterator implementations may not be able to return
79 * a valid state for each position. This will be clearly documented
80 * for each such iterator (none of the public ones here).
/* State word value meaning "no valid state": returned by getState()/uiter_getState()
   on error or when an iterator cannot provide a state for the current position. */
84 #define UITER_NO_STATE ((uint32_t)0xffffffff)
87 * Function type declaration for UCharIterator.getIndex().
89 * Gets the current position, or the start or limit of the iteration range.
92 * This function may perform slowly for UITER_CURRENT after setState() was called,
93 * or for UITER_LENGTH, because an iterator implementation may have to count
94 * UChars if the underlying storage is not UTF-16.
96 * @param iter the UCharIterator structure ("this pointer")
97 * @param origin get the 0, start, limit, length, or current index
98 * @return the requested index, or U_SENTINEL in an error condition
100 * @see UCharIteratorOrigin
/* Function type for the UCharIterator.getIndex slot: reports the index selected by
   origin, or U_SENTINEL in an error condition. */
104 typedef int32_t U_CALLCONV
105 UCharIteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
);
108 * Function type declaration for UCharIterator.move().
110 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
112 * Moves the current position relative to the start or limit of the
113 * iteration range, or relative to the current position itself.
114 * The movement is expressed in numbers of code units forward
115 * or backward by specifying a positive or negative delta.
116 * Out of bounds movement will be pinned to the start or limit.
118 * This function may perform slowly for moving relative to UITER_LENGTH
119 * because an iterator implementation may have to count the rest of the
120 * UChars if the native storage is not UTF-16.
122 * When moving relative to the limit or length, or
123 * relative to the current position after setState() was called,
124 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
125 * determination of the actual UTF-16 index.
126 * The actual index can be determined with getIndex(UITER_CURRENT)
127 * which will count the UChars if necessary.
128 * See UITER_UNKNOWN_INDEX for details.
130 * @param iter the UCharIterator structure ("this pointer")
131 * @param delta can be positive, zero, or negative
132 * @param origin move relative to the 0, start, limit, length, or current index
133 * @return the new index, or U_SENTINEL on an error condition,
134 * or UITER_UNKNOWN_INDEX when the index is not known.
136 * @see UCharIteratorOrigin
138 * @see UITER_UNKNOWN_INDEX
/* Function type for the UCharIterator.move slot: shifts the current position by delta
   code units relative to origin. Returns the new index, U_SENTINEL on an error
   condition, or UITER_UNKNOWN_INDEX when the resulting index is not known. */
141 typedef int32_t U_CALLCONV
142 UCharIteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
);
145 * Function type declaration for UCharIterator.hasNext().
147 * Check if current() and next() can still
148 * return another code unit.
150 * @param iter the UCharIterator structure ("this pointer")
151 * @return boolean value for whether current() and next() can still return another code unit
/* Function type for the UCharIterator.hasNext slot: tells whether current() and next()
   can still return another code unit. */
156 typedef UBool U_CALLCONV
157 UCharIteratorHasNext(UCharIterator
*iter
);
160 * Function type declaration for UCharIterator.hasPrevious().
162 * Check if previous() can still return another code unit.
164 * @param iter the UCharIterator structure ("this pointer")
165 * @return boolean value for whether previous() can still return another code unit
/* Function type for the UCharIterator.hasPrevious slot: tells whether previous()
   can still return another code unit. */
170 typedef UBool U_CALLCONV
171 UCharIteratorHasPrevious(UCharIterator
*iter
);
174 * Function type declaration for UCharIterator.current().
176 * Return the code unit at the current position,
177 * or U_SENTINEL if there is none (index is at the limit).
179 * @param iter the UCharIterator structure ("this pointer")
180 * @return the current code unit
/* Function type for the UCharIterator.current slot: returns the code unit at the
   current position without moving, or U_SENTINEL when the index is at the limit. */
185 typedef UChar32 U_CALLCONV
186 UCharIteratorCurrent(UCharIterator
*iter
);
189 * Function type declaration for UCharIterator.next().
191 * Return the code unit at the current index and increment
192 * the index (post-increment, like s[i++]),
193 * or return U_SENTINEL if there is none (index is at the limit).
195 * @param iter the UCharIterator structure ("this pointer")
196 * @return the current code unit (and post-increment the current index)
/* Function type for the UCharIterator.next slot: returns the code unit at the current
   index and then increments the index, or returns U_SENTINEL at the limit. */
201 typedef UChar32 U_CALLCONV
202 UCharIteratorNext(UCharIterator
*iter
);
205 * Function type declaration for UCharIterator.previous().
207 * Decrement the index and return the code unit from there
208 * (pre-decrement, like s[--i]),
209 * or return U_SENTINEL if there is none (index is at the start).
211 * @param iter the UCharIterator structure ("this pointer")
212 * @return the previous code unit (after pre-decrementing the current index)
/* Function type for the UCharIterator.previous slot: decrements the index and returns
   the code unit found there, or returns U_SENTINEL at the start. */
217 typedef UChar32 U_CALLCONV
218 UCharIteratorPrevious(UCharIterator
*iter
);
221 * Function type declaration for UCharIterator.reservedFn().
222 * Reserved for future use.
224 * @param iter the UCharIterator structure ("this pointer")
225 * @param something some integer argument
226 * @return some integer
/* Function type for the UCharIterator.reservedFn slot. Reserved for future use; the
   meaning of the argument and of the return value is not defined here. */
231 typedef int32_t U_CALLCONV
232 UCharIteratorReserved(UCharIterator
*iter
, int32_t something
);
235 * Function type declaration for UCharIterator.getState().
237 * Get the "state" of the iterator in the form of a single 32-bit word.
238 * It is recommended that the state value be calculated to be as small as
239 * is feasible. For strings with limited lengths, fewer than 32 bits may be sufficient.
242 * This is used together with setState()/UCharIteratorSetState
243 * to save and restore the iterator position more efficiently than with getIndex()/move().
246 * The iterator state is defined as a uint32_t value because it is designed
247 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
248 * of the character iterator.
250 * With some UCharIterator implementations (e.g., UTF-8),
251 * getting and setting the UTF-16 index with existing functions
252 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
253 * relatively slow because the iterator has to "walk" from a known index
254 * to the requested one.
255 * This takes more time the farther it needs to go.
257 * An opaque state value allows an iterator implementation to provide
258 * an internal index (UTF-8: the source byte array index) for
259 * fast, constant-time restoration.
261 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
262 * the UTF-16 index may not be restored as well, but the iterator can deliver
263 * the correct text contents and move relative to the current position
264 * without performance degradation.
266 * Some UCharIterator implementations may not be able to return
267 * a valid state for each position, in which case they return UITER_NO_STATE instead.
268 * This will be clearly documented for each such iterator (none of the public ones here).
270 * @param iter the UCharIterator structure ("this pointer")
271 * @return the state word
274 * @see UCharIteratorSetState
275 * @see UITER_NO_STATE
/* Function type for the UCharIterator.getState slot: returns an opaque 32-bit state
   word for the current position, or UITER_NO_STATE when no valid state is available.
   The iterator is taken as const. */
278 typedef uint32_t U_CALLCONV
279 UCharIteratorGetState(const UCharIterator
*iter
);
282 * Function type declaration for UCharIterator.setState().
284 * Restore the "state" of the iterator using a state word from a getState() call.
285 * The iterator object need not be the same one as for which getState() was called,
286 * but it must be of the same type (set up using the same uiter_setXYZ function)
287 * and it must iterate over the same string
288 * (binary identical regardless of memory address).
289 * For more about the state word see UCharIteratorGetState.
291 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
292 * the UTF-16 index may not be restored as well, but the iterator can deliver
293 * the correct text contents and move relative to the current position
294 * without performance degradation.
296 * @param iter the UCharIterator structure ("this pointer")
297 * @param state the state word from a getState() call
298 * on a same-type, same-string iterator
299 * @param pErrorCode Must be a valid pointer to an error code value,
300 * which must not indicate a failure before the function call.
303 * @see UCharIteratorGetState
/* Function type for the UCharIterator.setState slot: restores the position from a
   state word obtained via getState() on a same-type, same-string iterator.
   pErrorCode must be a valid pointer and must not indicate a failure on entry. */
306 typedef void U_CALLCONV
307 UCharIteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
311 * C API for code unit iteration.
312 * This can be used as a C wrapper around
313 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
315 * There are two roles for using UCharIterator:
317 * A "provider" sets the necessary function pointers and controls the "protected"
318 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
319 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
321 * Implementations of such C APIs are "callers" of UCharIterator functions;
322 * they only use the "public" function pointers and never access the "protected" fields directly.
325 * The current() and next() functions only check the current index against the
326 * limit, and previous() only checks the current index against the start,
327 * to see if the iterator already reached the end of the iteration range.
329 * The assumption - in all iterators - is that the index is moved via the API,
330 * which means it won't go out of bounds, or the index is modified by
331 * user code that knows enough about the iterator implementation to set valid index values.
334 * UCharIterator functions return code unit values 0..0xffff,
335 * or U_SENTINEL if the iteration bounds are reached.
339 struct UCharIterator
{
341 * (protected) Pointer to string or wrapped object or similar.
342 * Not used by caller.
348 * (protected) Length of string or similar.
349 * Not used by caller.
355 * (protected) Start index or similar.
356 * Not used by caller.
362 * (protected) Current index or similar.
363 * Not used by caller.
369 * (protected) Limit index or similar.
370 * Not used by caller.
376 * (protected) Used by UTF-8 iterators and possibly others.
379 int32_t reservedField
;
382 * (public) Returns the current position or the
383 * start or limit index of the iteration range.
385 * @see UCharIteratorGetIndex
388 UCharIteratorGetIndex
*getIndex
;
391 * (public) Moves the current position relative to the start or limit of the
392 * iteration range, or relative to the current position itself.
393 * The movement is expressed in numbers of code units forward
394 * or backward by specifying a positive or negative delta.
396 * @see UCharIteratorMove
399 UCharIteratorMove
*move
;
402 * (public) Check if current() and next() can still
403 * return another code unit.
405 * @see UCharIteratorHasNext
408 UCharIteratorHasNext
*hasNext
;
411 * (public) Check if previous() can still return another code unit.
413 * @see UCharIteratorHasPrevious
416 UCharIteratorHasPrevious
*hasPrevious
;
419 * (public) Return the code unit at the current position,
420 * or U_SENTINEL if there is none (index is at the limit).
422 * @see UCharIteratorCurrent
425 UCharIteratorCurrent
*current
;
428 * (public) Return the code unit at the current index and increment
429 * the index (post-increment, like s[i++]),
430 * or return U_SENTINEL if there is none (index is at the limit).
432 * @see UCharIteratorNext
435 UCharIteratorNext
*next
;
438 * (public) Decrement the index and return the code unit from there
439 * (pre-decrement, like s[--i]),
440 * or return U_SENTINEL if there is none (index is at the start).
442 * @see UCharIteratorPrevious
445 UCharIteratorPrevious
*previous
;
448 * (public) Reserved for future use. Currently NULL.
450 * @see UCharIteratorReserved
453 UCharIteratorReserved
*reservedFn
;
456 * (public) Return the state of the iterator, to be restored later with setState().
457 * This function pointer is NULL if the iterator does not implement it.
459 * @see UCharIteratorGetState
462 UCharIteratorGetState
*getState
;
465 * (public) Restore the iterator state from the state word from a call to getState().
467 * This function pointer is NULL if the iterator does not implement it.
469 * @see UCharIteratorSetState
472 UCharIteratorSetState
*setState
;
476 * Helper function for UCharIterator to get the code point
477 * at the current index.
479 * Return the code point that includes the code unit at the current position,
480 * or U_SENTINEL if there is none (index is at the limit).
481 * If the current code unit is a lead or trail surrogate,
482 * then the following or preceding surrogate is used to form
483 * the code point value.
485 * @param iter the UCharIterator structure ("this pointer")
486 * @return the current code point
490 * @see UnicodeString::char32At()
/* Returns the code point that includes the code unit at the current position, using
   the adjacent surrogate when that unit is a lead or trail surrogate; returns
   U_SENTINEL when the index is at the limit. */
493 U_STABLE UChar32 U_EXPORT2
494 uiter_current32(UCharIterator
*iter
);
497 * Helper function for UCharIterator to get the next code point.
499 * Return the code point at the current index and increment
500 * the index (post-increment, like s[i++]),
501 * or return U_SENTINEL if there is none (index is at the limit).
503 * @param iter the UCharIterator structure ("this pointer")
504 * @return the current code point (and post-increment the current index)
/* Returns the code point at the current index and post-increments the index, or
   returns U_SENTINEL when the index is at the limit. */
510 U_STABLE UChar32 U_EXPORT2
511 uiter_next32(UCharIterator
*iter
);
514 * Helper function for UCharIterator to get the previous code point.
516 * Decrement the index and return the code point from there
517 * (pre-decrement, like s[--i]),
518 * or return U_SENTINEL if there is none (index is at the start).
520 * @param iter the UCharIterator structure ("this pointer")
521 * @return the previous code point (after pre-decrementing the current index)
/* Pre-decrements the index and returns the code point found there, or returns
   U_SENTINEL when the index is at the start. */
527 U_STABLE UChar32 U_EXPORT2
528 uiter_previous32(UCharIterator
*iter
);
531 * Get the "state" of the iterator in the form of a single 32-bit word.
532 * This is a convenience function that calls iter->getState(iter)
533 * if iter->getState is not NULL;
534 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
536 * Some UCharIterator implementations may not be able to return
537 * a valid state for each position, in which case they return UITER_NO_STATE instead.
538 * This will be clearly documented for each such iterator (none of the public ones here).
540 * @param iter the UCharIterator structure ("this pointer")
541 * @return the state word
544 * @see UCharIteratorGetState
545 * @see UITER_NO_STATE
/* Convenience wrapper: calls iter->getState(iter) when that pointer is not NULL;
   returns UITER_NO_STATE when it is NULL or any other error occurs. */
548 U_STABLE
uint32_t U_EXPORT2
549 uiter_getState(const UCharIterator
*iter
);
552 * Restore the "state" of the iterator using a state word from a getState() call.
553 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
554 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
556 * @param iter the UCharIterator structure ("this pointer")
557 * @param state the state word from a getState() call
558 * on a same-type, same-string iterator
559 * @param pErrorCode Must be a valid pointer to an error code value,
560 * which must not indicate a failure before the function call.
563 * @see UCharIteratorSetState
/* Convenience wrapper: calls iter->setState(iter, state, pErrorCode) when that pointer
   is not NULL; otherwise sets U_UNSUPPORTED_ERROR. pErrorCode must be a valid pointer
   and must not indicate a failure on entry. */
566 U_STABLE
void U_EXPORT2
567 uiter_setState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
570 * Set up a UCharIterator to iterate over a string.
572 * Sets the UCharIterator function pointers for iteration over the string s
573 * with iteration boundaries start=index=0 and length=limit=string length.
574 * The "provider" may set the start, index, and limit values at any time
575 * within the range 0..length.
576 * The length field will be ignored.
578 * The string pointer s is set into UCharIterator.context without copying
579 * or reallocating the string contents.
581 * getState() simply returns the current index.
582 * move() will always return the final index.
584 * @param iter UCharIterator structure to be set for iteration
585 * @param s String to iterate over
586 * @param length Length of s, or -1 if NUL-terminated
/* Configures iter to iterate over the UChar string s. The pointer s is stored in the
   iterator's context without copying the text. length is the length of s, or -1 if
   s is NUL-terminated. */
591 U_STABLE
void U_EXPORT2
592 uiter_setString(UCharIterator
*iter
, const UChar
*s
, int32_t length
);
595 * Set up a UCharIterator to iterate over a UTF-16BE string
596 * (byte vector with a big-endian pair of bytes per UChar).
598 * Everything works just like with a normal UChar iterator (uiter_setString),
599 * except that UChars are assembled from byte pairs,
600 * and that the length argument here indicates an even number of bytes.
602 * getState() simply returns the current index.
603 * move() will always return the final index.
605 * @param iter UCharIterator structure to be set for iteration
606 * @param s UTF-16BE string to iterate over
607 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
608 * (NUL means pair of 0 bytes at even index from s)
611 * @see uiter_setString
/* Configures iter to iterate over a UTF-16BE byte vector, assembling each UChar from
   a big-endian byte pair. length is an even number of bytes, or -1 if the text is
   terminated by a pair of 0 bytes at an even index. */
614 U_STABLE
void U_EXPORT2
615 uiter_setUTF16BE(UCharIterator
*iter
, const char *s
, int32_t length
);
618 * Set up a UCharIterator to iterate over a UTF-8 string.
620 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
621 * with UTF-8 iteration boundaries 0 and length.
622 * The implementation counts the UTF-16 index on the fly and
623 * lazily evaluates the UTF-16 length of the text.
625 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
626 * When the reservedField is not 0, then it contains a supplementary code point
627 * and the UTF-16 index is between the two corresponding surrogates.
628 * At that point, the UTF-8 index is behind that code point.
630 * The UTF-8 string pointer s is set into UCharIterator.context without copying
631 * or reallocating the string contents.
633 * getState() returns a state value consisting of
634 * - the current UTF-8 source byte index (bits 31..1)
635 * - a flag (bit 0) that indicates whether the UChar position is in the middle
636 * of a surrogate pair
637 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
639 * getState() cannot also encode the UTF-16 index in the state value.
640 * move(relative to limit or length), or
641 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
643 * @param iter UCharIterator structure to be set for iteration
644 * @param s UTF-8 string to iterate over
645 * @param length Length of s in bytes, or -1 if NUL-terminated
/* Configures iter to iterate over the UTF-8 string s. The pointer s is stored in the
   iterator's context without copying the text. length is in bytes, or -1 if s is
   NUL-terminated. With this iterator, move() relative to limit or length, or relative
   to current after setState(), may return UITER_UNKNOWN_INDEX. */
650 U_STABLE
void U_EXPORT2
651 uiter_setUTF8(UCharIterator
*iter
, const char *s
, int32_t length
);
656 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
658 * Sets the UCharIterator function pointers for iteration using the
659 * CharacterIterator charIter.
661 * The CharacterIterator pointer charIter is set into UCharIterator.context
662 * without copying or cloning the CharacterIterator object.
663 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
664 * The iteration index and boundaries are controlled by the CharacterIterator.
666 * getState() simply returns the current index.
667 * move() will always return the final index.
669 * @param iter UCharIterator structure to be set for iteration
670 * @param charIter CharacterIterator to wrap
/* Configures iter to wrap the C++ CharacterIterator charIter. The pointer is stored in
   the iterator's context without copying or cloning the object; index and boundaries
   are controlled by the CharacterIterator. */
675 U_STABLE
void U_EXPORT2
676 uiter_setCharacterIterator(UCharIterator
*iter
, CharacterIterator
*charIter
);
679 * Set up a UCharIterator to iterate over a C++ Replaceable.
681 * Sets the UCharIterator function pointers for iteration over the
682 * Replaceable rep with iteration boundaries start=index=0 and
683 * length=limit=rep->length().
684 * The "provider" may set the start, index, and limit values at any time
685 * within the range 0..length=rep->length().
686 * The length field will be ignored.
688 * The Replaceable pointer rep is set into UCharIterator.context without copying
689 * or cloning/reallocating the Replaceable object.
691 * getState() simply returns the current index.
692 * move() will always return the final index.
694 * @param iter UCharIterator structure to be set for iteration
695 * @param rep Replaceable to iterate over
/* Configures iter to iterate over the C++ Replaceable rep. The pointer is stored in the
   iterator's context without copying or cloning the object.
   NOTE(review): no declaration of Replaceable is visible in this part of the header
   (only CharacterIterator is forward-declared) - confirm one exists. */
700 U_STABLE
void U_EXPORT2
701 uiter_setReplaceable(UCharIterator
*iter
, const Replaceable
*rep
);