]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/uiter.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / unicode / uiter.h
/*
*******************************************************************************
*
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uiter.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan18
*   created by: Markus W. Scherer
*/
16
17 #ifndef __UITER_H__
18 #define __UITER_H__
19
20 /**
21 * \file
22 * \brief C API: Unicode Character Iteration
23 *
24 * @see UCharIterator
25 */
26
27 #include "unicode/utypes.h"
28
#ifdef XP_CPLUSPLUS
U_NAMESPACE_BEGIN

/*
 * Forward declarations only: this header just needs pointers to these
 * C++ classes for the uiter_setCharacterIterator() and
 * uiter_setReplaceable() prototypes.
 */
class CharacterIterator;
class Replaceable;

U_NAMESPACE_END
#endif
37
38 U_CDECL_BEGIN
39
/* Forward declaration; the structure itself is defined further down in this header. */
struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
42
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 *
 * For getIndex() the constant selects which index is returned;
 * for move() it selects the position that the delta is relative to.
 *
 * @see UCharIteratorGetIndex
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef enum UCharIteratorOrigin {
    /** The start index of the iteration range. */
    UITER_START,
    /** The current index. */
    UITER_CURRENT,
    /** The limit index of the iteration range. */
    UITER_LIMIT,
    /** Index 0. */
    UITER_ZERO,
    /** The length index. */
    UITER_LENGTH
} UCharIteratorOrigin;
52
/** Constants for UCharIterator. @draft ICU 2.6 */
enum {
    /**
     * Constant value that may be returned by UCharIteratorMove
     * indicating that the final UTF-16 index is not known, but that the move succeeded.
     * This can occur when moving relative to limit or length, or
     * when moving relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * It would be very inefficient to have to count from the beginning of the text
     * just to get the current/limit/length index after moving relative to it.
     * The actual index can be determined with getIndex(UITER_CURRENT)
     * which will count the UChars if necessary.
     *
     * @see UCharIteratorMove
     * @draft ICU 2.6
     */
    UITER_UNKNOWN_INDEX=-2
};
71
/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 *
 * The value is all 32 bits set, typed as uint32_t to match the
 * return type of getState().
 *
 * @see UCharIteratorGetState
 * @draft ICU 2.6
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
84
85 /**
86 * Function type declaration for UCharIterator.getIndex().
87 *
88 * Gets the current position, or the start or limit of the
89 * iteration range.
90 *
91 * This function may perform slowly for UITER_CURRENT after setState() was called,
92 * or for UITER_LENGTH, because an iterator implementation may have to count
93 * UChars if the underlying storage is not UTF-16.
94 *
95 * @param iter the UCharIterator structure ("this pointer")
96 * @param origin get the 0, start, limit, length, or current index
97 * @return the requested index, or U_SENTINEL in an error condition
98 *
99 * @see UCharIteratorOrigin
100 * @see UCharIterator
101 * @stable ICU 2.1
102 */
103 typedef int32_t U_CALLCONV
104 UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
105
106 /**
107 * Function type declaration for UCharIterator.move().
108 *
109 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
110 *
111 * Moves the current position relative to the start or limit of the
112 * iteration range, or relative to the current position itself.
113 * The movement is expressed in numbers of code units forward
114 * or backward by specifying a positive or negative delta.
115 * Out of bounds movement will be pinned to the start or limit.
116 *
117 * This function may perform slowly for moving relative to UITER_LENGTH
118 * because an iterator implementation may have to count the rest of the
119 * UChars if the native storage is not UTF-16.
120 *
121 * When moving relative to the limit or length, or
122 * relative to the current position after setState() was called,
123 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
124 * determination of the actual UTF-16 index.
125 * The actual index can be determined with getIndex(UITER_CURRENT)
126 * which will count the UChars if necessary.
127 * See UITER_UNKNOWN_INDEX for details.
128 *
129 * @param iter the UCharIterator structure ("this pointer")
130 * @param delta can be positive, zero, or negative
131 * @param origin move relative to the 0, start, limit, length, or current index
132 * @return the new index, or U_SENTINEL on an error condition,
133 * or UITER_UNKNOWN_INDEX when the index is not known.
134 *
135 * @see UCharIteratorOrigin
136 * @see UCharIterator
137 * @see UITER_UNKNOWN_INDEX
138 * @stable ICU 2.1
139 */
140 typedef int32_t U_CALLCONV
141 UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
142
143 /**
144 * Function type declaration for UCharIterator.hasNext().
145 *
146 * Check if current() and next() can still
147 * return another code unit.
148 *
149 * @param iter the UCharIterator structure ("this pointer")
150 * @return boolean value for whether current() and next() can still return another code unit
151 *
152 * @see UCharIterator
153 * @stable ICU 2.1
154 */
155 typedef UBool U_CALLCONV
156 UCharIteratorHasNext(UCharIterator *iter);
157
158 /**
159 * Function type declaration for UCharIterator.hasPrevious().
160 *
161 * Check if previous() can still return another code unit.
162 *
163 * @param iter the UCharIterator structure ("this pointer")
164 * @return boolean value for whether previous() can still return another code unit
165 *
166 * @see UCharIterator
167 * @stable ICU 2.1
168 */
169 typedef UBool U_CALLCONV
170 UCharIteratorHasPrevious(UCharIterator *iter);
171
172 /**
173 * Function type declaration for UCharIterator.current().
174 *
175 * Return the code unit at the current position,
176 * or U_SENTINEL if there is none (index is at the limit).
177 *
178 * @param iter the UCharIterator structure ("this pointer")
179 * @return the current code unit
180 *
181 * @see UCharIterator
182 * @stable ICU 2.1
183 */
184 typedef UChar32 U_CALLCONV
185 UCharIteratorCurrent(UCharIterator *iter);
186
187 /**
188 * Function type declaration for UCharIterator.next().
189 *
190 * Return the code unit at the current index and increment
191 * the index (post-increment, like s[i++]),
192 * or return U_SENTINEL if there is none (index is at the limit).
193 *
194 * @param iter the UCharIterator structure ("this pointer")
195 * @return the current code unit (and post-increment the current index)
196 *
197 * @see UCharIterator
198 * @stable ICU 2.1
199 */
200 typedef UChar32 U_CALLCONV
201 UCharIteratorNext(UCharIterator *iter);
202
203 /**
204 * Function type declaration for UCharIterator.previous().
205 *
206 * Decrement the index and return the code unit from there
207 * (pre-decrement, like s[--i]),
208 * or return U_SENTINEL if there is none (index is at the start).
209 *
210 * @param iter the UCharIterator structure ("this pointer")
211 * @return the previous code unit (after pre-decrementing the current index)
212 *
213 * @see UCharIterator
214 * @stable ICU 2.1
215 */
216 typedef UChar32 U_CALLCONV
217 UCharIteratorPrevious(UCharIterator *iter);
218
219 /**
220 * Function type declaration for UCharIterator.reservedFn().
221 * Reserved for future use.
222 *
223 * @param iter the UCharIterator structure ("this pointer")
224 * @param something some integer argument
225 * @return some integer
226 *
227 * @see UCharIterator
228 * @stable ICU 2.1
229 */
230 typedef int32_t U_CALLCONV
231 UCharIteratorReserved(UCharIterator *iter, int32_t something);
232
233 /**
234 * Function type declaration for UCharIterator.getState().
235 *
236 * Get the "state" of the iterator in the form of a single 32-bit word.
237 * It is recommended that the state value be calculated to be as small as
238 * is feasible. For strings with limited lengths, fewer than 32 bits may
239 * be sufficient.
240 *
241 * This is used together with setState()/UCharIteratorSetState
242 * to save and restore the iterator position more efficiently than with
243 * getIndex()/move().
244 *
245 * With some UCharIterator implementations (e.g., UTF-8),
246 * getting and setting the UTF-16 index with existing functions
247 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
248 * relatively slow because the iterator has to "walk" from a known index
249 * to the requested one.
250 * This takes more time the farther it needs to go.
251 *
252 * An opaque state value allows an iterator implementation to provide
253 * an internal index (UTF-8: the source byte array index) for
254 * fast, constant-time restoration.
255 *
256 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
257 * the UTF-16 index may not be restored as well, but the iterator can deliver
258 * the correct text contents and move relative to the current position
259 * without performance degradation.
260 *
261 * Some UCharIterator implementations may not be able to return
262 * a valid state for each position, in which case they return UITER_NO_STATE instead.
263 * This will be clearly documented for each such iterator (none of the public ones here).
264 *
265 * @param iter the UCharIterator structure ("this pointer")
266 * @return the state word
267 *
268 * @see UCharIterator
269 * @see UCharIteratorSetState
270 * @see UITER_NO_STATE
271 * @draft ICU 2.6
272 */
273 typedef uint32_t U_CALLCONV
274 UCharIteratorGetState(const UCharIterator *iter);
275
276 /**
277 * Function type declaration for UCharIterator.setState().
278 *
279 * Restore the "state" of the iterator using a state word from a getState() call.
280 * The iterator object need not be the same one as for which getState() was called,
281 * but it must be of the same type (set up using the same uiter_setXYZ function)
282 * and it must iterate over the same string
283 * (binary identical regardless of memory address).
284 * For more about the state word see UCharIteratorGetState.
285 *
286 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
287 * the UTF-16 index may not be restored as well, but the iterator can deliver
288 * the correct text contents and move relative to the current position
289 * without performance degradation.
290 *
291 * @param iter the UCharIterator structure ("this pointer")
292 * @param state the state word from a getState() call
293 * on a same-type, same-string iterator
294 * @param pErrorCode Must be a valid pointer to an error code value,
295 * which must not indicate a failure before the function call.
296 *
297 * @see UCharIterator
298 * @see UCharIteratorGetState
299 * @draft ICU 2.6
300 */
301 typedef void U_CALLCONV
302 UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
303
304
305 /**
306 * C API for code unit iteration.
307 * This can be used as a C wrapper around
308 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
309 *
310 * There are two roles for using UCharIterator:
311 *
312 * A "provider" sets the necessary function pointers and controls the "protected"
313 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
314 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
315 *
316 * Implementations of such C APIs are "callers" of UCharIterator functions;
317 * they only use the "public" function pointers and never access the "protected"
318 * fields directly.
319 *
320 * UCharIterator functions return code unit values 0..0xffff,
321 * or U_SENTINEL if the iteration bounds are reached.
322 *
323 * @stable ICU 2.1
324 */
325 struct UCharIterator {
326 /**
327 * (protected) Pointer to string or wrapped object or similar.
328 * Not used by caller.
329 * @stable ICU 2.1
330 */
331 const void *context;
332
333 /**
334 * (protected) Length of string or similar.
335 * Not used by caller.
336 * @stable ICU 2.1
337 */
338 int32_t length;
339
340 /**
341 * (protected) Start index or similar.
342 * Not used by caller.
343 * @stable ICU 2.1
344 */
345 int32_t start;
346
347 /**
348 * (protected) Current index or similar.
349 * Not used by caller.
350 * @stable ICU 2.1
351 */
352 int32_t index;
353
354 /**
355 * (protected) Limit index or similar.
356 * Not used by caller.
357 * @stable ICU 2.1
358 */
359 int32_t limit;
360
361 /**
362 * (protected) Used by UTF-8 iterators and possibly others.
363 * @stable ICU 2.1
364 */
365 int32_t reservedField;
366
367 /**
368 * (public) Returns the current position or the
369 * start or limit index of the iteration range.
370 *
371 * @see UCharIteratorGetIndex
372 * @stable ICU 2.1
373 */
374 UCharIteratorGetIndex *getIndex;
375
376 /**
377 * (public) Moves the current position relative to the start or limit of the
378 * iteration range, or relative to the current position itself.
379 * The movement is expressed in numbers of code units forward
380 * or backward by specifying a positive or negative delta.
381 *
382 * @see UCharIteratorMove
383 * @stable ICU 2.1
384 */
385 UCharIteratorMove *move;
386
387 /**
388 * (public) Check if current() and next() can still
389 * return another code unit.
390 *
391 * @see UCharIteratorHasNext
392 * @stable ICU 2.1
393 */
394 UCharIteratorHasNext *hasNext;
395
396 /**
397 * (public) Check if previous() can still return another code unit.
398 *
399 * @see UCharIteratorHasPrevious
400 * @stable ICU 2.1
401 */
402 UCharIteratorHasPrevious *hasPrevious;
403
404 /**
405 * (public) Return the code unit at the current position,
406 * or U_SENTINEL if there is none (index is at the limit).
407 *
408 * @see UCharIteratorCurrent
409 * @stable ICU 2.1
410 */
411 UCharIteratorCurrent *current;
412
413 /**
414 * (public) Return the code unit at the current index and increment
415 * the index (post-increment, like s[i++]),
416 * or return U_SENTINEL if there is none (index is at the limit).
417 *
418 * @see UCharIteratorNext
419 * @stable ICU 2.1
420 */
421 UCharIteratorNext *next;
422
423 /**
424 * (public) Decrement the index and return the code unit from there
425 * (pre-decrement, like s[--i]),
426 * or return U_SENTINEL if there is none (index is at the start).
427 *
428 * @see UCharIteratorPrevious
429 * @stable ICU 2.1
430 */
431 UCharIteratorPrevious *previous;
432
433 /**
434 * (public) Reserved for future use. Currently NULL.
435 *
436 * @see UCharIteratorReserved
437 * @stable ICU 2.1
438 */
439 UCharIteratorReserved *reservedFn;
440
441 /**
442 * (public) Return the state of the iterator, to be restored later with setState().
443 * This function pointer is NULL if the iterator does not implement it.
444 *
445 * @see UCharIteratorGet
446 * @draft ICU 2.6
447 */
448 UCharIteratorGetState *getState;
449
450 /**
451 * (public) Restore the iterator state from the state word from a call
452 * to getState().
453 * This function pointer is NULL if the iterator does not implement it.
454 *
455 * @see UCharIteratorSet
456 * @draft ICU 2.6
457 */
458 UCharIteratorSetState *setState;
459 };
460
461 /**
462 * Helper function for UCharIterator to get the code point
463 * at the current index.
464 *
465 * Return the code point that includes the code unit at the current position,
466 * or U_SENTINEL if there is none (index is at the limit).
467 * If the current code unit is a lead or trail surrogate,
468 * then the following or preceding surrogate is used to form
469 * the code point value.
470 *
471 * @param iter the UCharIterator structure ("this pointer")
472 * @return the current code point
473 *
474 * @see UCharIterator
475 * @see U16_GET
476 * @see UnicodeString::char32At()
477 * @stable ICU 2.1
478 */
479 U_CAPI UChar32 U_EXPORT2
480 uiter_current32(UCharIterator *iter);
481
482 /**
483 * Helper function for UCharIterator to get the next code point.
484 *
485 * Return the code point at the current index and increment
486 * the index (post-increment, like s[i++]),
487 * or return U_SENTINEL if there is none (index is at the limit).
488 *
489 * @param iter the UCharIterator structure ("this pointer")
490 * @return the current code point (and post-increment the current index)
491 *
492 * @see UCharIterator
493 * @see U16_NEXT
494 * @stable ICU 2.1
495 */
496 U_CAPI UChar32 U_EXPORT2
497 uiter_next32(UCharIterator *iter);
498
499 /**
500 * Helper function for UCharIterator to get the previous code point.
501 *
502 * Decrement the index and return the code point from there
503 * (pre-decrement, like s[--i]),
504 * or return U_SENTINEL if there is none (index is at the start).
505 *
506 * @param iter the UCharIterator structure ("this pointer")
507 * @return the previous code point (after pre-decrementing the current index)
508 *
509 * @see UCharIterator
510 * @see U16_PREV
511 * @stable ICU 2.1
512 */
513 U_CAPI UChar32 U_EXPORT2
514 uiter_previous32(UCharIterator *iter);
515
516 /**
517 * Get the "state" of the iterator in the form of a single 32-bit word.
518 * This is a convenience function that calls iter->getState(iter)
519 * if iter->getState is not NULL;
520 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
521 *
522 * Some UCharIterator implementations may not be able to return
523 * a valid state for each position, in which case they return UITER_NO_STATE instead.
524 * This will be clearly documented for each such iterator (none of the public ones here).
525 *
526 * @param iter the UCharIterator structure ("this pointer")
527 * @return the state word
528 *
529 * @see UCharIterator
530 * @see UCharIteratorGetState
531 * @see UITER_NO_STATE
532 * @draft ICU 2.6
533 */
534 U_CAPI uint32_t U_EXPORT2
535 uiter_getState(const UCharIterator *iter);
536
537 /**
538 * Restore the "state" of the iterator using a state word from a getState() call.
539 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
540 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
541 *
542 * @param iter the UCharIterator structure ("this pointer")
543 * @param state the state word from a getState() call
544 * on a same-type, same-string iterator
545 * @param pErrorCode Must be a valid pointer to an error code value,
546 * which must not indicate a failure before the function call.
547 *
548 * @see UCharIterator
549 * @see UCharIteratorSetState
550 * @draft ICU 2.6
551 */
552 U_CAPI void U_EXPORT2
553 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
554
555 /**
556 * Set up a UCharIterator to iterate over a string.
557 *
558 * Sets the UCharIterator function pointers for iteration over the string s
559 * with iteration boundaries start=index=0 and length=limit=string length.
560 * The "provider" may set the start, index, and limit values at any time
561 * within the range 0..length.
562 * The length field will be ignored.
563 *
564 * The string pointer s is set into UCharIterator.context without copying
565 * or reallocating the string contents.
566 *
567 * getState() simply returns the current index.
568 * move() will always return the final index.
569 *
570 * @param iter UCharIterator structure to be set for iteration
571 * @param s String to iterate over
572 * @param length Length of s, or -1 if NUL-terminated
573 *
574 * @see UCharIterator
575 * @stable ICU 2.1
576 */
577 U_CAPI void U_EXPORT2
578 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
579
580 /**
581 * Set up a UCharIterator to iterate over a UTF-16BE string
582 * (byte vector with a big-endian pair of bytes per UChar).
583 *
584 * Everything works just like with a normal UChar iterator (uiter_setString),
585 * except that UChars are assembled from byte pairs,
586 * and that the length argument here indicates an even number of bytes.
587 *
588 * getState() simply returns the current index.
589 * move() will always return the final index.
590 *
591 * @param iter UCharIterator structure to be set for iteration
592 * @param s UTF-16BE string to iterate over
593 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
594 * (NUL means pair of 0 bytes at even index from s)
595 *
596 * @see UCharIterator
597 * @see uiter_setString
598 * @draft ICU 2.6
599 */
600 U_CAPI void U_EXPORT2
601 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
602
603 /**
604 * Set up a UCharIterator to iterate over a UTF-8 string.
605 *
606 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
607 * with UTF-8 iteration boundaries 0 and length.
608 * The implementation counts the UTF-16 index on the fly and
609 * lazily evaluates the UTF-16 length of the text.
610 *
611 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
612 * When the reservedField is not 0, then it contains a supplementary code point
613 * and the UTF-16 index is between the two corresponding surrogates.
614 * At that point, the UTF-8 index is behind that code point.
615 *
616 * The UTF-8 string pointer s is set into UCharIterator.context without copying
617 * or reallocating the string contents.
618 *
619 * getState() returns a state value consisting of
620 * - the current UTF-8 source byte index (bits 31..1)
621 * - a flag (bit 0) that indicates whether the UChar position is in the middle
622 * of a surrogate pair
623 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
624 *
625 * getState() cannot also encode the UTF-16 index in the state value.
626 * move(relative to limit or length), or
627 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
628 *
629 * @param iter UCharIterator structure to be set for iteration
630 * @param s UTF-8 string to iterate over
631 * @param length Length of s in bytes, or -1 if NUL-terminated
632 *
633 * @see UCharIterator
634 * @draft ICU 2.6
635 */
636 U_CAPI void U_EXPORT2
637 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
638
639 #ifdef XP_CPLUSPLUS
640
641 /**
642 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
643 *
644 * Sets the UCharIterator function pointers for iteration using the
645 * CharacterIterator charIter.
646 *
647 * The CharacterIterator pointer charIter is set into UCharIterator.context
648 * without copying or cloning the CharacterIterator object.
649 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
650 * The iteration index and boundaries are controlled by the CharacterIterator.
651 *
652 * getState() simply returns the current index.
653 * move() will always return the final index.
654 *
655 * @param iter UCharIterator structure to be set for iteration
656 * @param charIter CharacterIterator to wrap
657 *
658 * @see UCharIterator
659 * @stable ICU 2.1
660 */
661 U_CAPI void U_EXPORT2
662 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter);
663
664 /**
665 * Set up a UCharIterator to iterate over a C++ Replaceable.
666 *
667 * Sets the UCharIterator function pointers for iteration over the
668 * Replaceable rep with iteration boundaries start=index=0 and
669 * length=limit=rep->length().
670 * The "provider" may set the start, index, and limit values at any time
671 * within the range 0..length=rep->length().
672 * The length field will be ignored.
673 *
674 * The Replaceable pointer rep is set into UCharIterator.context without copying
675 * or cloning/reallocating the Replaceable object.
676 *
677 * getState() simply returns the current index.
678 * move() will always return the final index.
679 *
680 * @param iter UCharIterator structure to be set for iteration
681 * @param rep Replaceable to iterate over
682 *
683 * @see UCharIterator
684 * @stable ICU 2.1
685 */
686 U_CAPI void U_EXPORT2
687 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep);
688
689 #endif
690
691 U_CDECL_END
692
693 #endif