1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2002-2011 International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
12 * tab size: 8 (not used)
15 * created on: 2002jan18
16 * created by: Markus W. Scherer
24 * \brief C API: Unicode Character Iteration
29 #include "unicode/utypes.h"
31 #if U_SHOW_CPLUSPLUS_API
/*
 * Forward declaration of the C++ CharacterIterator class, which is only
 * referenced through a pointer by uiter_setCharacterIterator().
 * NOTE(review): uiter_setCharacterIterator() spells the type as
 * icu::CharacterIterator, so this declaration is expected to sit inside the
 * ICU namespace; the enclosing namespace lines are not visible here - confirm.
 */
class CharacterIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 * The origin selects which index is requested by getIndex(), or which index
 * a move() delta is applied relative to: the start, the current position,
 * the limit, zero, or the length.
 *
 * @see UCharIteratorGetIndex
 * @see UCharIteratorMove
 */
typedef enum UCharIteratorOrigin {
    UITER_START,    /**< The start index. */
    UITER_CURRENT,  /**< The current index. */
    UITER_LIMIT,    /**< The limit index. */
    UITER_ZERO,     /**< The 0 index. */
    UITER_LENGTH    /**< The length index. */
} UCharIteratorOrigin;
/** Constants for UCharIterator. @stable ICU 2.6 */
enum {
    /**
     * Constant value that may be returned by UCharIteratorMove
     * indicating that the final UTF-16 index is not known, but that the move succeeded.
     * This can occur when moving relative to limit or length, or
     * when moving relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * It would be very inefficient to have to count from the beginning of the text
     * just to get the current/limit/length index after moving relative to it.
     * The actual index can be determined with getIndex(UITER_CURRENT)
     * which will count the UChars if necessary.
     */
    UITER_UNKNOWN_INDEX=-2
};
/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
89 * Function type declaration for UCharIterator.getIndex().
91 * Gets the current position, or the start or limit of the
94 * This function may perform slowly for UITER_CURRENT after setState() was called,
95 * or for UITER_LENGTH, because an iterator implementation may have to count
96 * UChars if the underlying storage is not UTF-16.
98 * @param iter the UCharIterator structure ("this pointer")
99 * @param origin get the 0, start, limit, length, or current index
100 * @return the requested index, or U_SENTINEL in an error condition
102 * @see UCharIteratorOrigin
106 typedef int32_t U_CALLCONV
107 UCharIteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
);
110 * Function type declaration for UCharIterator.move().
112 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
114 * Moves the current position relative to the start or limit of the
115 * iteration range, or relative to the current position itself.
116 * The movement is expressed in numbers of code units forward
117 * or backward by specifying a positive or negative delta.
118 * Out of bounds movement will be pinned to the start or limit.
120 * This function may perform slowly for moving relative to UITER_LENGTH
121 * because an iterator implementation may have to count the rest of the
122 * UChars if the native storage is not UTF-16.
124 * When moving relative to the limit or length, or
125 * relative to the current position after setState() was called,
126 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
127 * determination of the actual UTF-16 index.
128 * The actual index can be determined with getIndex(UITER_CURRENT)
129 * which will count the UChars if necessary.
130 * See UITER_UNKNOWN_INDEX for details.
132 * @param iter the UCharIterator structure ("this pointer")
133 * @param delta can be positive, zero, or negative
134 * @param origin move relative to the 0, start, limit, length, or current index
135 * @return the new index, or U_SENTINEL on an error condition,
136 * or UITER_UNKNOWN_INDEX when the index is not known.
138 * @see UCharIteratorOrigin
140 * @see UITER_UNKNOWN_INDEX
143 typedef int32_t U_CALLCONV
144 UCharIteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
);
147 * Function type declaration for UCharIterator.hasNext().
149 * Check if current() and next() can still
150 * return another code unit.
152 * @param iter the UCharIterator structure ("this pointer")
153 * @return boolean value for whether current() and next() can still return another code unit
158 typedef UBool U_CALLCONV
159 UCharIteratorHasNext(UCharIterator
*iter
);
162 * Function type declaration for UCharIterator.hasPrevious().
164 * Check if previous() can still return another code unit.
166 * @param iter the UCharIterator structure ("this pointer")
167 * @return boolean value for whether previous() can still return another code unit
172 typedef UBool U_CALLCONV
173 UCharIteratorHasPrevious(UCharIterator
*iter
);
176 * Function type declaration for UCharIterator.current().
178 * Return the code unit at the current position,
179 * or U_SENTINEL if there is none (index is at the limit).
181 * @param iter the UCharIterator structure ("this pointer")
182 * @return the current code unit
187 typedef UChar32 U_CALLCONV
188 UCharIteratorCurrent(UCharIterator
*iter
);
191 * Function type declaration for UCharIterator.next().
193 * Return the code unit at the current index and increment
194 * the index (post-increment, like s[i++]),
195 * or return U_SENTINEL if there is none (index is at the limit).
197 * @param iter the UCharIterator structure ("this pointer")
198 * @return the current code unit (and post-increment the current index)
203 typedef UChar32 U_CALLCONV
204 UCharIteratorNext(UCharIterator
*iter
);
207 * Function type declaration for UCharIterator.previous().
209 * Decrement the index and return the code unit from there
210 * (pre-decrement, like s[--i]),
211 * or return U_SENTINEL if there is none (index is at the start).
213 * @param iter the UCharIterator structure ("this pointer")
214 * @return the previous code unit (after pre-decrementing the current index)
219 typedef UChar32 U_CALLCONV
220 UCharIteratorPrevious(UCharIterator
*iter
);
223 * Function type declaration for UCharIterator.reservedFn().
224 * Reserved for future use.
226 * @param iter the UCharIterator structure ("this pointer")
227 * @param something some integer argument
228 * @return some integer
233 typedef int32_t U_CALLCONV
234 UCharIteratorReserved(UCharIterator
*iter
, int32_t something
);
237 * Function type declaration for UCharIterator.getState().
239 * Get the "state" of the iterator in the form of a single 32-bit word.
240 * It is recommended that the state value be calculated to be as small as
241 * is feasible. For strings with limited lengths, fewer than 32 bits may
244 * This is used together with setState()/UCharIteratorSetState
245 * to save and restore the iterator position more efficiently than with
248 * The iterator state is defined as a uint32_t value because it is designed
249 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
250 * of the character iterator.
252 * With some UCharIterator implementations (e.g., UTF-8),
253 * getting and setting the UTF-16 index with existing functions
254 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
255 * relatively slow because the iterator has to "walk" from a known index
256 * to the requested one.
257 * This takes more time the farther it needs to go.
259 * An opaque state value allows an iterator implementation to provide
260 * an internal index (UTF-8: the source byte array index) for
261 * fast, constant-time restoration.
263 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
264 * the UTF-16 index may not be restored as well, but the iterator can deliver
265 * the correct text contents and move relative to the current position
266 * without performance degradation.
268 * Some UCharIterator implementations may not be able to return
269 * a valid state for each position, in which case they return UITER_NO_STATE instead.
270 * This will be clearly documented for each such iterator (none of the public ones here).
272 * @param iter the UCharIterator structure ("this pointer")
273 * @return the state word
276 * @see UCharIteratorSetState
277 * @see UITER_NO_STATE
280 typedef uint32_t U_CALLCONV
281 UCharIteratorGetState(const UCharIterator
*iter
);
284 * Function type declaration for UCharIterator.setState().
286 * Restore the "state" of the iterator using a state word from a getState() call.
287 * The iterator object need not be the same one as for which getState() was called,
288 * but it must be of the same type (set up using the same uiter_setXYZ function)
289 * and it must iterate over the same string
290 * (binary identical regardless of memory address).
291 * For more about the state word see UCharIteratorGetState.
293 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
294 * the UTF-16 index may not be restored as well, but the iterator can deliver
295 * the correct text contents and move relative to the current position
296 * without performance degradation.
298 * @param iter the UCharIterator structure ("this pointer")
299 * @param state the state word from a getState() call
300 * on a same-type, same-string iterator
301 * @param pErrorCode Must be a valid pointer to an error code value,
302 * which must not indicate a failure before the function call.
305 * @see UCharIteratorGetState
308 typedef void U_CALLCONV
309 UCharIteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
313 * C API for code unit iteration.
314 * This can be used as a C wrapper around
315 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
317 * There are two roles for using UCharIterator:
319 * A "provider" sets the necessary function pointers and controls the "protected"
320 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
321 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
323 * Implementations of such C APIs are "callers" of UCharIterator functions;
324 * they only use the "public" function pointers and never access the "protected" fields directly.
327 * The current() and next() functions only check the current index against the
328 * limit, and previous() only checks the current index against the start,
329 * to see if the iterator already reached the end of the iteration range.
331 * The assumption - in all iterators - is that the index is moved via the API,
332 * which means it won't go out of bounds, or the index is modified by
333 * user code that knows enough about the iterator implementation to set valid index values.
336 * UCharIterator functions return code unit values 0..0xffff,
337 * or U_SENTINEL if the iteration bounds are reached.
341 struct UCharIterator
{
343 * (protected) Pointer to string or wrapped object or similar.
344 * Not used by caller.
350 * (protected) Length of string or similar.
351 * Not used by caller.
357 * (protected) Start index or similar.
358 * Not used by caller.
364 * (protected) Current index or similar.
365 * Not used by caller.
371 * (protected) Limit index or similar.
372 * Not used by caller.
378 * (protected) Used by UTF-8 iterators and possibly others.
381 int32_t reservedField
;
384 * (public) Returns the current position or the
385 * start or limit index of the iteration range.
387 * @see UCharIteratorGetIndex
390 UCharIteratorGetIndex
*getIndex
;
393 * (public) Moves the current position relative to the start or limit of the
394 * iteration range, or relative to the current position itself.
395 * The movement is expressed in numbers of code units forward
396 * or backward by specifying a positive or negative delta.
398 * @see UCharIteratorMove
401 UCharIteratorMove
*move
;
404 * (public) Check if current() and next() can still
405 * return another code unit.
407 * @see UCharIteratorHasNext
410 UCharIteratorHasNext
*hasNext
;
413 * (public) Check if previous() can still return another code unit.
415 * @see UCharIteratorHasPrevious
418 UCharIteratorHasPrevious
*hasPrevious
;
421 * (public) Return the code unit at the current position,
422 * or U_SENTINEL if there is none (index is at the limit).
424 * @see UCharIteratorCurrent
427 UCharIteratorCurrent
*current
;
430 * (public) Return the code unit at the current index and increment
431 * the index (post-increment, like s[i++]),
432 * or return U_SENTINEL if there is none (index is at the limit).
434 * @see UCharIteratorNext
437 UCharIteratorNext
*next
;
440 * (public) Decrement the index and return the code unit from there
441 * (pre-decrement, like s[--i]),
442 * or return U_SENTINEL if there is none (index is at the start).
444 * @see UCharIteratorPrevious
447 UCharIteratorPrevious
*previous
;
450 * (public) Reserved for future use. Currently NULL.
452 * @see UCharIteratorReserved
455 UCharIteratorReserved
*reservedFn
;
458 * (public) Return the state of the iterator, to be restored later with setState().
459 * This function pointer is NULL if the iterator does not implement it.
461 * @see UCharIteratorGetState
464 UCharIteratorGetState
*getState
;
467 * (public) Restore the iterator state from the state word from a call to getState().
469 * This function pointer is NULL if the iterator does not implement it.
471 * @see UCharIteratorSetState
474 UCharIteratorSetState
*setState
;
478 * Helper function for UCharIterator to get the code point
479 * at the current index.
481 * Return the code point that includes the code unit at the current position,
482 * or U_SENTINEL if there is none (index is at the limit).
483 * If the current code unit is a lead or trail surrogate,
484 * then the following or preceding surrogate is used to form
485 * the code point value.
487 * @param iter the UCharIterator structure ("this pointer")
488 * @return the current code point
492 * @see UnicodeString::char32At()
495 U_STABLE UChar32 U_EXPORT2
496 uiter_current32(UCharIterator
*iter
);
499 * Helper function for UCharIterator to get the next code point.
501 * Return the code point at the current index and increment
502 * the index (post-increment, like s[i++]),
503 * or return U_SENTINEL if there is none (index is at the limit).
505 * @param iter the UCharIterator structure ("this pointer")
506 * @return the current code point (and post-increment the current index)
512 U_STABLE UChar32 U_EXPORT2
513 uiter_next32(UCharIterator
*iter
);
516 * Helper function for UCharIterator to get the previous code point.
518 * Decrement the index and return the code point from there
519 * (pre-decrement, like s[--i]),
520 * or return U_SENTINEL if there is none (index is at the start).
522 * @param iter the UCharIterator structure ("this pointer")
523 * @return the previous code point (after pre-decrementing the current index)
529 U_STABLE UChar32 U_EXPORT2
530 uiter_previous32(UCharIterator
*iter
);
533 * Get the "state" of the iterator in the form of a single 32-bit word.
534 * This is a convenience function that calls iter->getState(iter)
535 * if iter->getState is not NULL;
536 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
538 * Some UCharIterator implementations may not be able to return
539 * a valid state for each position, in which case they return UITER_NO_STATE instead.
540 * This will be clearly documented for each such iterator (none of the public ones here).
542 * @param iter the UCharIterator structure ("this pointer")
543 * @return the state word
546 * @see UCharIteratorGetState
547 * @see UITER_NO_STATE
550 U_STABLE
uint32_t U_EXPORT2
551 uiter_getState(const UCharIterator
*iter
);
554 * Restore the "state" of the iterator using a state word from a getState() call.
555 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
556 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
558 * @param iter the UCharIterator structure ("this pointer")
559 * @param state the state word from a getState() call
560 * on a same-type, same-string iterator
561 * @param pErrorCode Must be a valid pointer to an error code value,
562 * which must not indicate a failure before the function call.
565 * @see UCharIteratorSetState
568 U_STABLE
void U_EXPORT2
569 uiter_setState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
);
572 * Set up a UCharIterator to iterate over a string.
574 * Sets the UCharIterator function pointers for iteration over the string s
575 * with iteration boundaries start=index=0 and length=limit=string length.
576 * The "provider" may set the start, index, and limit values at any time
577 * within the range 0..length.
578 * The length field will be ignored.
580 * The string pointer s is set into UCharIterator.context without copying
581 * or reallocating the string contents.
583 * getState() simply returns the current index.
584 * move() will always return the final index.
586 * @param iter UCharIterator structure to be set for iteration
587 * @param s String to iterate over
588 * @param length Length of s, or -1 if NUL-terminated
593 U_STABLE
void U_EXPORT2
594 uiter_setString(UCharIterator
*iter
, const UChar
*s
, int32_t length
);
597 * Set up a UCharIterator to iterate over a UTF-16BE string
598 * (byte vector with a big-endian pair of bytes per UChar).
600 * Everything works just like with a normal UChar iterator (uiter_setString),
601 * except that UChars are assembled from byte pairs,
602 * and that the length argument here indicates an even number of bytes.
604 * getState() simply returns the current index.
605 * move() will always return the final index.
607 * @param iter UCharIterator structure to be set for iteration
608 * @param s UTF-16BE string to iterate over
609 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
610 * (NUL means pair of 0 bytes at even index from s)
613 * @see uiter_setString
616 U_STABLE
void U_EXPORT2
617 uiter_setUTF16BE(UCharIterator
*iter
, const char *s
, int32_t length
);
620 * Set up a UCharIterator to iterate over a UTF-8 string.
622 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
623 * with UTF-8 iteration boundaries 0 and length.
624 * The implementation counts the UTF-16 index on the fly and
625 * lazily evaluates the UTF-16 length of the text.
627 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
628 * When the reservedField is not 0, then it contains a supplementary code point
629 * and the UTF-16 index is between the two corresponding surrogates.
630 * At that point, the UTF-8 index is behind that code point.
632 * The UTF-8 string pointer s is set into UCharIterator.context without copying
633 * or reallocating the string contents.
635 * getState() returns a state value consisting of
636 * - the current UTF-8 source byte index (bits 31..1)
637 * - a flag (bit 0) that indicates whether the UChar position is in the middle
638 * of a surrogate pair
639 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
641 * getState() cannot also encode the UTF-16 index in the state value.
642 * move(relative to limit or length), or
643 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
645 * @param iter UCharIterator structure to be set for iteration
646 * @param s UTF-8 string to iterate over
647 * @param length Length of s in bytes, or -1 if NUL-terminated
652 U_STABLE
void U_EXPORT2
653 uiter_setUTF8(UCharIterator
*iter
, const char *s
, int32_t length
);
655 #if U_SHOW_CPLUSPLUS_API
658 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
660 * Sets the UCharIterator function pointers for iteration using the
661 * CharacterIterator charIter.
663 * The CharacterIterator pointer charIter is set into UCharIterator.context
664 * without copying or cloning the CharacterIterator object.
665 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
666 * The iteration index and boundaries are controlled by the CharacterIterator.
668 * getState() simply returns the current index.
669 * move() will always return the final index.
671 * @param iter UCharIterator structure to be set for iteration
672 * @param charIter CharacterIterator to wrap
677 U_STABLE
void U_EXPORT2
678 uiter_setCharacterIterator(UCharIterator
*iter
, icu::CharacterIterator
*charIter
);
681 * Set up a UCharIterator to iterate over a C++ Replaceable.
683 * Sets the UCharIterator function pointers for iteration over the
684 * Replaceable rep with iteration boundaries start=index=0 and
685 * length=limit=rep->length().
686 * The "provider" may set the start, index, and limit values at any time
687 * within the range 0..length=rep->length().
688 * The length field will be ignored.
690 * The Replaceable pointer rep is set into UCharIterator.context without copying
691 * or cloning/reallocating the Replaceable object.
693 * getState() simply returns the current index.
694 * move() will always return the final index.
696 * @param iter UCharIterator structure to be set for iteration
697 * @param rep Replaceable to iterate over
702 U_STABLE
void U_EXPORT2
703 uiter_setReplaceable(UCharIterator
*iter
, const icu::Replaceable
*rep
);