2 *******************************************************************************
4 * Copyright (C) 2002-2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jan18
14 * created by: Markus W. Scherer
22 * \brief C API: Unicode Character Iteration
27 #include "unicode/utypes.h"
/*
 * Forward declaration only: this header refers to CharacterIterator solely
 * through a pointer parameter (see uiter_setCharacterIterator()), so the
 * complete class definition is not required here.
 */
class CharacterIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 *
 * The enumerators select which index a call refers to: the zero index,
 * the start, the current position, the limit, or the length.
 *
 * @see UCharIteratorGetIndex
 * @see UCharIteratorMove
 */
typedef enum UCharIteratorOrigin {
    UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH
} UCharIteratorOrigin;
53 /** Constants for UCharIterator. @draft ICU 2.6 */
56 * Constant value that may be returned by UCharIteratorMove
57 * indicating that the final UTF-16 index is not known, but that the move succeeded.
58 * This can occur when moving relative to limit or length, or
59 * when moving relative to the current index after a setState()
60 * when the current UTF-16 index is not known.
62 * It would be very inefficient to have to count from the beginning of the text
63 * just to get the current/limit/length index after moving relative to it.
64 * The actual index can be determined with getIndex(UITER_CURRENT)
65 * which will count the UChars if necessary.
69 UITER_UNKNOWN_INDEX
=-2
/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
86 * Function type declaration for UCharIterator.getIndex().
88 * Gets the current position, or the start or limit of the
91 * This function may perform slowly for UITER_CURRENT after setState() was called,
92 * or for UITER_LENGTH, because an iterator implementation may have to count
93 * UChars if the underlying storage is not UTF-16.
95 * @param iter the UCharIterator structure ("this pointer")
96 * @param origin get the 0, start, limit, length, or current index
97 * @return the requested index, or U_SENTINEL in an error condition
99 * @see UCharIteratorOrigin
103 typedef int32_t U_CALLCONV
104 UCharIteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
);
107 * Function type declaration for UCharIterator.move().
109 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
111 * Moves the current position relative to the start or limit of the
112 * iteration range, or relative to the current position itself.
113 * The movement is expressed in numbers of code units forward
114 * or backward by specifying a positive or negative delta.
115 * Out of bounds movement will be pinned to the start or limit.
117 * This function may perform slowly for moving relative to UITER_LENGTH
118 * because an iterator implementation may have to count the rest of the
119 * UChars if the native storage is not UTF-16.
121 * When moving relative to the limit or length, or
122 * relative to the current position after setState() was called,
123 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
124 * determination of the actual UTF-16 index.
125 * The actual index can be determined with getIndex(UITER_CURRENT)
126 * which will count the UChars if necessary.
127 * See UITER_UNKNOWN_INDEX for details.
129 * @param iter the UCharIterator structure ("this pointer")
130 * @param delta can be positive, zero, or negative
131 * @param origin move relative to the 0, start, limit, length, or current index
132 * @return the new index, or U_SENTINEL on an error condition,
133 * or UITER_UNKNOWN_INDEX when the index is not known.
135 * @see UCharIteratorOrigin
137 * @see UITER_UNKNOWN_INDEX
140 typedef int32_t U_CALLCONV
141 UCharIteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
);
144 * Function type declaration for UCharIterator.hasNext().
146 * Check if current() and next() can still
147 * return another code unit.
149 * @param iter the UCharIterator structure ("this pointer")
150 * @return boolean value for whether current() and next() can still return another code unit
155 typedef UBool U_CALLCONV
156 UCharIteratorHasNext(UCharIterator
*iter
);
159 * Function type declaration for UCharIterator.hasPrevious().
161 * Check if previous() can still return another code unit.
163 * @param iter the UCharIterator structure ("this pointer")
164 * @return boolean value for whether previous() can still return another code unit
169 typedef UBool U_CALLCONV
170 UCharIteratorHasPrevious(UCharIterator
*iter
);
173 * Function type declaration for UCharIterator.current().
175 * Return the code unit at the current position,
176 * or U_SENTINEL if there is none (index is at the limit).
178 * @param iter the UCharIterator structure ("this pointer")
179 * @return the current code unit
184 typedef UChar32 U_CALLCONV
185 UCharIteratorCurrent(UCharIterator
*iter
);
188 * Function type declaration for UCharIterator.next().
190 * Return the code unit at the current index and increment
191 * the index (post-increment, like s[i++]),
192 * or return U_SENTINEL if there is none (index is at the limit).
194 * @param iter the UCharIterator structure ("this pointer")
195 * @return the current code unit (and post-increment the current index)
200 typedef UChar32 U_CALLCONV
201 UCharIteratorNext(UCharIterator
*iter
);
204 * Function type declaration for UCharIterator.previous().
206 * Decrement the index and return the code unit from there
207 * (pre-decrement, like s[--i]),
208 * or return U_SENTINEL if there is none (index is at the start).
210 * @param iter the UCharIterator structure ("this pointer")
211 * @return the previous code unit (after pre-decrementing the current index)
216 typedef UChar32 U_CALLCONV
217 UCharIteratorPrevious(UCharIterator
*iter
);
220 * Function type declaration for UCharIterator.reservedFn().
221 * Reserved for future use.
223 * @param iter the UCharIterator structure ("this pointer")
224 * @param something some integer argument
225 * @return some integer
230 typedef int32_t U_CALLCONV
231 UCharIteratorReserved(UCharIterator
*iter
, int32_t something
);
234 * Function type declaration for UCharIterator.getState().
236 * Get the "state" of the iterator in the form of a single 32-bit word.
237 * It is recommended that the state value be calculated to be as small as
238 * is feasible. For strings with limited lengths, fewer than 32 bits may
241 * This is used together with setState()/UCharIteratorSetState
242 * to save and restore the iterator position more efficiently than with
245 * With some UCharIterator implementations (e.g., UTF-8),
246 * getting and setting the UTF-16 index with existing functions
247 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
248 * relatively slow because the iterator has to "walk" from a known index
249 * to the requested one.
250 * This takes more time the farther it needs to go.
252 * An opaque state value allows an iterator implementation to provide
253 * an internal index (UTF-8: the source byte array index) for
254 * fast, constant-time restoration.
256 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
257 * the UTF-16 index may not be restored as well, but the iterator can deliver
258 * the correct text contents and move relative to the current position
259 * without performance degradation.
261 * Some UCharIterator implementations may not be able to return
262 * a valid state for each position, in which case they return UITER_NO_STATE instead.
263 * This will be clearly documented for each such iterator (none of the public ones here).
265 * @param iter the UCharIterator structure ("this pointer")
266 * @return the state word
269 * @see UCharIteratorSetState
270 * @see UITER_NO_STATE
273 typedef uint32_t U_CALLCONV
274 UCharIteratorGetState(const UCharIterator
*iter
);
277 * Function type declaration for UCharIterator.setState().
279 * Restore the "state" of the iterator using a state word from a getState() call.
280 * The iterator object need not be the same one as for which getState() was called,
281 * but it must be of the same type (set up using the same uiter_setXYZ function)
282 * and it must iterate over the same string
283 * (binary identical regardless of memory address).
284 * For more about the state word see UCharIteratorGetState.
286 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
287 * the UTF-16 index may not be restored as well, but the iterator can deliver
288 * the correct text contents and move relative to the current position
289 * without performance degradation.
291 * @param iter the UCharIterator structure ("this pointer")
292 * @param state the state word from a getState() call
293 * on a same-type, same-string iterator
294 * @param pErrorCode Must be a valid pointer to an error code value,
295 * which must not indicate a failure before the function call.
298 * @see UCharIteratorGetState
301 typedef void U_CALLCONV
302 UCharIteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
306 * C API for code unit iteration.
307 * This can be used as a C wrapper around
308 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
310 * There are two roles for using UCharIterator:
312 * A "provider" sets the necessary function pointers and controls the "protected"
313 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
314 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
316 * Implementations of such C APIs are "callers" of UCharIterator functions;
317 * they only use the "public" function pointers and never access the "protected" fields directly.
320 * UCharIterator functions return code unit values 0..0xffff,
321 * or U_SENTINEL if the iteration bounds are reached.
325 struct UCharIterator
{
327 * (protected) Pointer to string or wrapped object or similar.
328 * Not used by caller.
334 * (protected) Length of string or similar.
335 * Not used by caller.
341 * (protected) Start index or similar.
342 * Not used by caller.
348 * (protected) Current index or similar.
349 * Not used by caller.
355 * (protected) Limit index or similar.
356 * Not used by caller.
362 * (protected) Used by UTF-8 iterators and possibly others.
365 int32_t reservedField
;
368 * (public) Returns the current position or the
369 * start or limit index of the iteration range.
371 * @see UCharIteratorGetIndex
374 UCharIteratorGetIndex
*getIndex
;
377 * (public) Moves the current position relative to the start or limit of the
378 * iteration range, or relative to the current position itself.
379 * The movement is expressed in numbers of code units forward
380 * or backward by specifying a positive or negative delta.
382 * @see UCharIteratorMove
385 UCharIteratorMove
*move
;
388 * (public) Check if current() and next() can still
389 * return another code unit.
391 * @see UCharIteratorHasNext
394 UCharIteratorHasNext
*hasNext
;
397 * (public) Check if previous() can still return another code unit.
399 * @see UCharIteratorHasPrevious
402 UCharIteratorHasPrevious
*hasPrevious
;
405 * (public) Return the code unit at the current position,
406 * or U_SENTINEL if there is none (index is at the limit).
408 * @see UCharIteratorCurrent
411 UCharIteratorCurrent
*current
;
414 * (public) Return the code unit at the current index and increment
415 * the index (post-increment, like s[i++]),
416 * or return U_SENTINEL if there is none (index is at the limit).
418 * @see UCharIteratorNext
421 UCharIteratorNext
*next
;
424 * (public) Decrement the index and return the code unit from there
425 * (pre-decrement, like s[--i]),
426 * or return U_SENTINEL if there is none (index is at the start).
428 * @see UCharIteratorPrevious
431 UCharIteratorPrevious
*previous
;
434 * (public) Reserved for future use. Currently NULL.
436 * @see UCharIteratorReserved
439 UCharIteratorReserved
*reservedFn
;
442 * (public) Return the state of the iterator, to be restored later with setState().
443 * This function pointer is NULL if the iterator does not implement it.
445 * @see UCharIteratorGetState
448 UCharIteratorGetState
*getState
;
451 * (public) Restore the iterator state from the state word from a call to getState().
453 * This function pointer is NULL if the iterator does not implement it.
455 * @see UCharIteratorSetState
458 UCharIteratorSetState
*setState
;
462 * Helper function for UCharIterator to get the code point
463 * at the current index.
465 * Return the code point that includes the code unit at the current position,
466 * or U_SENTINEL if there is none (index is at the limit).
467 * If the current code unit is a lead or trail surrogate,
468 * then the following or preceding surrogate is used to form
469 * the code point value.
471 * @param iter the UCharIterator structure ("this pointer")
472 * @return the current code point
476 * @see UnicodeString::char32At()
479 U_CAPI UChar32 U_EXPORT2
480 uiter_current32(UCharIterator
*iter
);
483 * Helper function for UCharIterator to get the next code point.
485 * Return the code point at the current index and increment
486 * the index (post-increment, like s[i++]),
487 * or return U_SENTINEL if there is none (index is at the limit).
489 * @param iter the UCharIterator structure ("this pointer")
490 * @return the current code point (and post-increment the current index)
496 U_CAPI UChar32 U_EXPORT2
497 uiter_next32(UCharIterator
*iter
);
500 * Helper function for UCharIterator to get the previous code point.
502 * Decrement the index and return the code point from there
503 * (pre-decrement, like s[--i]),
504 * or return U_SENTINEL if there is none (index is at the start).
506 * @param iter the UCharIterator structure ("this pointer")
507 * @return the previous code point (after pre-decrementing the current index)
513 U_CAPI UChar32 U_EXPORT2
514 uiter_previous32(UCharIterator
*iter
);
517 * Get the "state" of the iterator in the form of a single 32-bit word.
518 * This is a convenience function that calls iter->getState(iter)
519 * if iter->getState is not NULL;
520 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
522 * Some UCharIterator implementations may not be able to return
523 * a valid state for each position, in which case they return UITER_NO_STATE instead.
524 * This will be clearly documented for each such iterator (none of the public ones here).
526 * @param iter the UCharIterator structure ("this pointer")
527 * @return the state word
530 * @see UCharIteratorGetState
531 * @see UITER_NO_STATE
534 U_CAPI
uint32_t U_EXPORT2
535 uiter_getState(const UCharIterator
*iter
);
538 * Restore the "state" of the iterator using a state word from a getState() call.
539 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
540 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
542 * @param iter the UCharIterator structure ("this pointer")
543 * @param state the state word from a getState() call
544 * on a same-type, same-string iterator
545 * @param pErrorCode Must be a valid pointer to an error code value,
546 * which must not indicate a failure before the function call.
549 * @see UCharIteratorSetState
552 U_CAPI
void U_EXPORT2
553 uiter_setState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
556 * Set up a UCharIterator to iterate over a string.
558 * Sets the UCharIterator function pointers for iteration over the string s
559 * with iteration boundaries start=index=0 and length=limit=string length.
560 * The "provider" may set the start, index, and limit values at any time
561 * within the range 0..length.
562 * The length field will be ignored.
564 * The string pointer s is set into UCharIterator.context without copying
565 * or reallocating the string contents.
567 * getState() simply returns the current index.
568 * move() will always return the final index.
570 * @param iter UCharIterator structure to be set for iteration
571 * @param s String to iterate over
572 * @param length Length of s, or -1 if NUL-terminated
577 U_CAPI
void U_EXPORT2
578 uiter_setString(UCharIterator
*iter
, const UChar
*s
, int32_t length
);
581 * Set up a UCharIterator to iterate over a UTF-16BE string
582 * (byte vector with a big-endian pair of bytes per UChar).
584 * Everything works just like with a normal UChar iterator (uiter_setString),
585 * except that UChars are assembled from byte pairs,
586 * and that the length argument here indicates an even number of bytes.
588 * getState() simply returns the current index.
589 * move() will always return the final index.
591 * @param iter UCharIterator structure to be set for iteration
592 * @param s UTF-16BE string to iterate over
593 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
594 * (NUL means pair of 0 bytes at even index from s)
597 * @see uiter_setString
600 U_CAPI
void U_EXPORT2
601 uiter_setUTF16BE(UCharIterator
*iter
, const char *s
, int32_t length
);
604 * Set up a UCharIterator to iterate over a UTF-8 string.
606 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
607 * with UTF-8 iteration boundaries 0 and length.
608 * The implementation counts the UTF-16 index on the fly and
609 * lazily evaluates the UTF-16 length of the text.
611 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
612 * When the reservedField is not 0, then it contains a supplementary code point
613 * and the UTF-16 index is between the two corresponding surrogates.
614 * At that point, the UTF-8 index is behind that code point.
616 * The UTF-8 string pointer s is set into UCharIterator.context without copying
617 * or reallocating the string contents.
619 * getState() returns a state value consisting of
620 * - the current UTF-8 source byte index (bits 31..1)
621 * - a flag (bit 0) that indicates whether the UChar position is in the middle
622 * of a surrogate pair
623 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
625 * getState() cannot also encode the UTF-16 index in the state value.
626 * move(relative to limit or length), or
627 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
629 * @param iter UCharIterator structure to be set for iteration
630 * @param s UTF-8 string to iterate over
631 * @param length Length of s in bytes, or -1 if NUL-terminated
636 U_CAPI
void U_EXPORT2
637 uiter_setUTF8(UCharIterator
*iter
, const char *s
, int32_t length
);
642 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
644 * Sets the UCharIterator function pointers for iteration using the
645 * CharacterIterator charIter.
647 * The CharacterIterator pointer charIter is set into UCharIterator.context
648 * without copying or cloning the CharacterIterator object.
649 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
650 * The iteration index and boundaries are controlled by the CharacterIterator.
652 * getState() simply returns the current index.
653 * move() will always return the final index.
655 * @param iter UCharIterator structure to be set for iteration
656 * @param charIter CharacterIterator to wrap
661 U_CAPI
void U_EXPORT2
662 uiter_setCharacterIterator(UCharIterator
*iter
, CharacterIterator
*charIter
);
665 * Set up a UCharIterator to iterate over a C++ Replaceable.
667 * Sets the UCharIterator function pointers for iteration over the
668 * Replaceable rep with iteration boundaries start=index=0 and
669 * length=limit=rep->length().
670 * The "provider" may set the start, index, and limit values at any time
671 * within the range 0..length=rep->length().
672 * The length field will be ignored.
674 * The Replaceable pointer rep is set into UCharIterator.context without copying
675 * or cloning/reallocating the Replaceable object.
677 * getState() simply returns the current index.
678 * move() will always return the final index.
680 * @param iter UCharIterator structure to be set for iteration
681 * @param rep Replaceable to iterate over
686 U_CAPI
void U_EXPORT2
687 uiter_setReplaceable(UCharIterator
*iter
, const Replaceable
*rep
);