]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * | |
6 | * Copyright (C) 2004-2012, International Business Machines | |
7 | * Corporation and others. All Rights Reserved. | |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: utext.h | |
11 | * encoding: UTF-8 | |
12 | * tab size: 8 (not used) | |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2004oct06 | |
16 | * created by: Markus W. Scherer | |
17 | */ | |
18 | ||
19 | #ifndef __UTEXT_H__ | |
20 | #define __UTEXT_H__ | |
21 | ||
22 | /** | |
23 | * \file | |
24 | * \brief C API: Abstract Unicode Text API | |
25 | * | |
26 | * The Text Access API provides a means to allow text that is stored in alternative | |
27 | * formats to work with ICU services. ICU normally operates on text that is | |
28 | * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type | |
29 | * UnicodeString for C++ APIs. | |
30 | * | |
31 | * ICU Text Access allows other formats, such as UTF-8 or non-contiguous | |
32 | * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. | |
33 | * | |
34 | * There are three general classes of usage for UText: | |
35 | * | |
36 | * Application Level Use. This is the simplest usage - applications would | |
37 | * use one of the utext_open() functions on their input text, and pass | |
38 | * the resulting UText to the desired ICU service. | |
39 | * | |
40 | * Second is usage in ICU Services, such as break iteration, that will need to | |
41 | * operate on input presented to them as a UText. These implementations | |
42 | * will need to use the iteration and related UText functions to gain | |
43 | * access to the actual text. | |
44 | * | |
45 | * The third class of UText users are "text providers." These are the | |
46 | * UText implementations for the various text storage formats. An application | |
47 | * or system with a unique text storage format can implement a set of | |
48 | * UText provider functions for that format, which will then allow | |
49 | * ICU services to operate on that format. | |
50 | * | |
51 | * | |
52 | * <em>Iterating over text</em> | |
53 | * | |
54 | * Here is sample code for a forward iteration over the contents of a UText | |
55 | * | |
56 | * \code | |
57 | * UChar32 c; | |
58 | * UText *ut = whatever(); | |
59 | * | |
60 | * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { | |
61 | * // do whatever with the codepoint c here. | |
62 | * } | |
63 | * \endcode | |
64 | * | |
65 | * And here is similar code to iterate in the reverse direction, from the end | |
66 | * of the text towards the beginning. | |
67 | * | |
68 | * \code | |
69 | * UChar32 c; | |
70 | * UText *ut = whatever(); | |
71 | * int64_t textLength = utext_nativeLength(ut); | |
72 | * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { | |
73 | * // do whatever with the codepoint c here. | |
74 | * } | |
75 | * \endcode | |
76 | * | |
77 | * <em>Characters and Indexing</em> | |
78 | * | |
79 | * Indexing into text by UText functions is nearly always in terms of the native | |
80 | * indexing of the underlying text storage. The storage format could be UTF-8 | |
81 | * or UTF-32, for example. When coding to the UText access API, no assumptions | |
82 | * can be made regarding the size of characters, or how far an index | |
83 | * may move when iterating between characters. | |
84 | * | |
85 | * All indices supplied to UText functions are pinned to the length of the | |
86 | * text. An out-of-bounds index is not considered to be an error, but is | |
87 | * adjusted to be in the range 0 <= index <= length of input text. | |
88 | * | |
89 | * | |
90 | * When an index position is returned from a UText function, it will be | |
91 | * a native index to the underlying text. In the case of multi-unit characters, | |
92 | * it will always refer to the first position of the character, | |
93 | * never to the interior. This is essentially the same thing as saying that | |
94 | * a returned index will always point to a boundary between characters. | |
95 | * | |
96 | * When a native index is supplied to a UText function, all indices that | |
97 | * refer to any part of a multi-unit character representation are considered | |
98 | * to be equivalent. In the case of multi-unit characters, an incoming index | |
99 | * will be logically normalized to refer to the start of the character. | |
100 | * | |
101 | * It is possible to test whether a native index is on a code point boundary | |
102 | * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). | |
103 | * If the index is returned unchanged, it was on a code point boundary. If | |
104 | * an adjusted index is returned, the original index referred to the | |
105 | * interior of a character. | |
106 | * | |
107 | * <em>Conventions for calling UText functions</em> | |
108 | * | |
109 | * Most UText access functions have as their first parameter a (UText *) pointer, | |
110 | * which specifies the UText to be used. Unless otherwise noted, the | |
111 | * pointer must refer to a valid, open UText. Attempting to | |
112 | * use a closed UText or passing a NULL pointer is a programming error and | |
113 | * will produce undefined results or NULL pointer exceptions. | |
114 | * | |
115 | * The utext_open family of functions can either open an existing (closed) | |
116 | * UText, or heap allocate a new UText. Here is sample code for creating | |
117 | * a stack-allocated UText. | |
118 | * | |
119 | * \code | |
120 | * char *s = whatever(); // A utf-8 string | |
121 | * UErrorCode status = U_ZERO_ERROR; | |
122 | * UText ut = UTEXT_INITIALIZER; | |
123 | * utext_openUTF8(&ut, s, -1, &status); | |
124 | * if (U_FAILURE(status)) { | |
125 | * // error handling | |
126 | * } else { | |
127 | * // work with the UText | |
128 | * } | |
129 | * \endcode | |
130 | * | |
131 | * Any existing UText passed to an open function _must_ have been initialized, | |
132 | * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated | |
133 | * by an open function. Passing NULL will cause the open function to | |
134 | * heap-allocate and fully initialize a new UText. | |
135 | * | |
136 | */ | |
137 | ||
138 | ||
139 | ||
140 | #include "unicode/utypes.h" | |
141 | #include "unicode/uchar.h" | |
142 | #if U_SHOW_CPLUSPLUS_API | |
143 | #include "unicode/localpointer.h" | |
144 | #include "unicode/rep.h" | |
145 | #include "unicode/unistr.h" | |
146 | #include "unicode/chariter.h" | |
147 | #endif | |
148 | ||
149 | ||
150 | U_CDECL_BEGIN | |
151 | ||
152 | struct UText; | |
153 | typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ | |
154 | ||
155 | ||
156 | /*************************************************************************************** | |
157 | * | |
158 | * C Functions for creating UText wrappers around various kinds of text strings. | |
159 | * | |
160 | ****************************************************************************************/ | |
161 | ||
162 | ||
163 | /** | |
164 | * Close function for UText instances. | |
165 | * Cleans up, releases any resources being held by an open UText. | |
166 | * <p> | |
167 | * If the UText was originally allocated by one of the utext_open functions, | |
168 | * the storage associated with the utext will also be freed. | |
169 | * If the UText storage originated with the application, as it would with | |
170 | * a local or static instance, the storage will not be deleted. | |
171 | * | |
172 | * An open UText can be reset to refer to a new string by using one of the utext_open() | |
173 | * functions without first closing the UText. | |
174 | * | |
175 | * @param ut The UText to be closed. | |
176 | * @return NULL if the UText struct was deleted by the close. If the UText struct | |
177 | * was originally provided by the caller to the open function, it is | |
178 | * returned by this function, and may be safely used again in | |
179 | * a subsequent utext_open. | |
180 | * | |
181 | * @stable ICU 3.4 | |
182 | */ | |
183 | U_STABLE UText * U_EXPORT2 | |
184 | utext_close(UText *ut); | |
185 | ||
186 | #if U_SHOW_CPLUSPLUS_API | |
187 | ||
188 | U_NAMESPACE_BEGIN | |
189 | ||
190 | /** | |
191 | * \class LocalUTextPointer | |
192 | * "Smart pointer" class, closes a UText via utext_close(). | |
193 | * For most methods see the LocalPointerBase base class. | |
194 | * | |
195 | * @see LocalPointerBase | |
196 | * @see LocalPointer | |
197 | * @stable ICU 4.4 | |
198 | */ | |
199 | U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); | |
200 | ||
201 | U_NAMESPACE_END | |
202 | ||
203 | #endif | |
204 | ||
205 | /** | |
206 | * Open a read-only UText implementation for UTF-8 strings. | |
207 | * | |
208 | * \htmlonly | |
209 | * Any invalid UTF-8 in the input will be handled in this way: | |
210 | * a sequence of bytes that has the form of a truncated, but otherwise valid, | |
211 | * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. | |
212 | * Any other illegal bytes will each be replaced by a \uFFFD. | |
213 | * \endhtmlonly | |
214 | * | |
215 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
216 | * If non-NULL, must refer to an initialized UText struct, which will then | |
217 | * be reset to reference the specified UTF-8 string. | |
218 | * @param s A UTF-8 string. Must not be NULL. | |
219 | * @param length The length of the UTF-8 string in bytes, or -1 if the string is | |
220 | * zero terminated. | |
221 | * @param status Errors are returned here. | |
222 | * @return A pointer to the UText. If a pre-allocated UText was provided, it | |
223 | * will always be used and returned. | |
224 | * @stable ICU 3.4 | |
225 | */ | |
226 | U_STABLE UText * U_EXPORT2 | |
227 | utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); | |
228 | ||
229 | ||
230 | /** | |
231 | * Open a read-only UText for UChar * string. | |
232 | * | |
233 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
234 | * If non-NULL, must refer to an initialized UText struct, which will then | |
235 | * be reset to reference the specified UChar string. | |
236 | * @param s A UChar (UTF-16) string | |
237 | * @param length The number of UChars in the input string, or -1 if the string is | |
238 | * zero terminated. | |
239 | * @param status Errors are returned here. | |
240 | * @return A pointer to the UText. If a pre-allocated UText was provided, it | |
241 | * will always be used and returned. | |
242 | * @stable ICU 3.4 | |
243 | */ | |
244 | U_STABLE UText * U_EXPORT2 | |
245 | utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); | |
246 | ||
247 | ||
248 | #if U_SHOW_CPLUSPLUS_API | |
249 | /** | |
250 | * Open a writable UText for a non-const UnicodeString. | |
251 | * | |
252 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
253 | * If non-NULL, must refer to an initialized UText struct, which will then | |
254 | * be reset to reference the specified input string. | |
255 | * @param s A UnicodeString. | |
256 | * @param status Errors are returned here. | |
257 | * @return Pointer to the UText. If a UText was supplied as input, this | |
258 | * will always be used and returned. | |
259 | * @stable ICU 3.4 | |
260 | */ | |
261 | U_STABLE UText * U_EXPORT2 | |
262 | utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status); | |
263 | ||
264 | ||
265 | /** | |
266 | * Open a UText for a const UnicodeString. The resulting UText will not be writable. | |
267 | * | |
268 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
269 | * If non-NULL, must refer to an initialized UText struct, which will then | |
270 | * be reset to reference the specified input string. | |
271 | * @param s A const UnicodeString to be wrapped. | |
272 | * @param status Errors are returned here. | |
273 | * @return Pointer to the UText. If a UText was supplied as input, this | |
274 | * will always be used and returned. | |
275 | * @stable ICU 3.4 | |
276 | */ | |
277 | U_STABLE UText * U_EXPORT2 | |
278 | utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status); | |
279 | ||
280 | ||
281 | /** | |
282 | * Open a writable UText implementation for an ICU Replaceable object. | |
283 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
284 | * If non-NULL, must refer to an already existing UText, which will then | |
285 | * be reset to reference the specified replaceable text. | |
286 | * @param rep A Replaceable text object. | |
287 | * @param status Errors are returned here. | |
288 | * @return Pointer to the UText. If a UText was supplied as input, this | |
289 | * will always be used and returned. | |
290 | * @see Replaceable | |
291 | * @stable ICU 3.4 | |
292 | */ | |
293 | U_STABLE UText * U_EXPORT2 | |
294 | utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status); | |
295 | ||
296 | /** | |
297 | * Open a UText implementation over an ICU CharacterIterator. | |
298 | * @param ut Pointer to a UText struct. If NULL, a new UText will be created. | |
299 | * If non-NULL, must refer to an already existing UText, which will then | |
300 | * be reset to reference the specified character iterator. | |
301 | * @param ci A Character Iterator. | |
302 | * @param status Errors are returned here. | |
303 | * @return Pointer to the UText. If a UText was supplied as input, this | |
304 | * will always be used and returned. | |
305 | * @see CharacterIterator | |
306 | * @stable ICU 3.4 | |
307 | */ | |
308 | U_STABLE UText * U_EXPORT2 | |
309 | utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status); | |
310 | ||
311 | #endif | |
312 | ||
313 | ||
314 | /** | |
315 | * Clone a UText. This is much like opening a UText where the source text is itself | |
316 | * another UText. | |
317 | * | |
318 | * A deep clone will copy both the UText data structures and the underlying text. | |
319 | * The original and cloned UText will operate completely independently; modifications | |
320 | * made to the text in one will not affect the other. Text providers are not | |
321 | * required to support deep clones. The user of clone() must check the status return | |
322 | * and be prepared to handle failures. | |
323 | * | |
324 | * The standard UText implementations for UTF8, UChar *, UnicodeString and | |
325 | * Replaceable all support deep cloning. | |
326 | * | |
327 | * The UText returned from a deep clone will be writable, assuming that the text | |
328 | * provider is able to support writing, even if the source UText had been made | |
329 | * non-writable by means of utext_freeze(). | |
330 | * | |
331 | * A shallow clone replicates only the UText data structures; it does not make | |
332 | * a copy of the underlying text. Shallow clones can be used as an efficient way to | |
333 | * have multiple iterators active in a single text string that is not being | |
334 | * modified. | |
335 | * | |
336 | * A shallow clone operation will not fail, barring truly exceptional conditions such | |
337 | * as memory allocation failures. | |
338 | * | |
339 | * Shallow UText clones should be avoided if the UText functions that modify the | |
340 | * text are expected to be used, either on the original or the cloned UText. | |
341 | * Any such modifications can cause unpredictable behavior. Read Only | |
342 | * shallow clones provide some protection against errors of this type by | |
343 | * disabling text modification via the cloned UText. | |
344 | * | |
345 | * A shallow clone made with the readOnly parameter == FALSE will preserve the | |
346 | * utext_isWritable() state of the source object. Note, however, that | |
347 | * write operations must be avoided while more than one UText exists that refer | |
348 | * to the same underlying text. | |
349 | * | |
350 | * A UText and its clone may be safely concurrently accessed by separate threads. | |
351 | * This is true for read access only with shallow clones, and for both read and | |
352 | * write access with deep clones. | |
353 | * It is the responsibility of the Text Provider to ensure that this thread safety | |
354 | * constraint is met. | |
355 | * | |
356 | * @param dest A UText struct to be filled in with the result of the clone operation, | |
357 | * or NULL if the clone function should heap-allocate a new UText struct. | |
358 | * If non-NULL, must refer to an already existing UText, which will then | |
359 | * be reset to become the clone. | |
360 | * @param src The UText to be cloned. | |
361 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. | |
362 | * @param readOnly TRUE to request that the cloned UText have read only access to the | |
363 | * underlying text. | |
364 | ||
365 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR | |
366 | * will be returned if the text provider is unable to clone the | |
367 | * original text. | |
368 | * @return The newly created clone, or NULL if the clone operation failed. | |
369 | * @stable ICU 3.4 | |
370 | */ | |
371 | U_STABLE UText * U_EXPORT2 | |
372 | utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); | |
373 | ||
374 | ||
375 | /** | |
376 | * Compare two UText objects for equality. | |
377 | * UTexts are equal if they are iterating over the same text, and | |
378 | * have the same iteration position within the text. | |
379 | * If either or both of the parameters are NULL, the comparison is FALSE. | |
380 | * | |
381 | * @param a The first of the two UTexts to compare. | |
382 | * @param b The other UText to be compared. | |
383 | * @return TRUE if the two UTexts are equal. | |
384 | * @stable ICU 3.6 | |
385 | */ | |
386 | U_STABLE UBool U_EXPORT2 | |
387 | utext_equals(const UText *a, const UText *b); | |
388 | ||
389 | ||
390 | /***************************************************************************** | |
391 | * | |
392 | * Functions to work with the text represented by a UText wrapper | |
393 | * | |
394 | *****************************************************************************/ | |
395 | ||
396 | /** | |
397 | * Get the length of the text. Depending on the characteristics | |
398 | * of the underlying text representation, this may be expensive. | |
399 | * @see utext_isLengthExpensive() | |
400 | * | |
401 | * | |
402 | * @param ut the text to be accessed. | |
403 | * @return the length of the text, expressed in native units. | |
404 | * | |
405 | * @stable ICU 3.4 | |
406 | */ | |
407 | U_STABLE int64_t U_EXPORT2 | |
408 | utext_nativeLength(UText *ut); | |
409 | ||
410 | /** | |
411 | * Return TRUE if calculating the length of the text could be expensive. | |
412 | * Finding the length of NUL terminated strings is considered to be expensive. | |
413 | * | |
414 | * Note that the value of this function may change | |
415 | * as the result of other operations on a UText. | |
416 | * Once the length of a string has been discovered, it will no longer | |
417 | * be expensive to report it. | |
418 | * | |
419 | * @param ut the text to be accessed. | |
420 | * @return TRUE if determining the length of the text could be time consuming. | |
421 | * @stable ICU 3.4 | |
422 | */ | |
423 | U_STABLE UBool U_EXPORT2 | |
424 | utext_isLengthExpensive(const UText *ut); | |
425 | ||
426 | /** | |
427 | * Returns the code point at the requested index, | |
428 | * or U_SENTINEL (-1) if it is out of bounds. | |
429 | * | |
430 | * If the specified index points to the interior of a multi-unit | |
431 | * character - one of the trail bytes of a UTF-8 sequence, for example - | |
432 | * the complete code point will be returned. | |
433 | * | |
434 | * The iteration position will be set to the start of the returned code point. | |
435 | * | |
436 | * This function is roughly equivalent to the sequence | |
437 | * utext_setNativeIndex(index); | |
438 | * utext_current32(); | |
439 | * (There is a subtle difference if the index is out of bounds by being less than zero - | |
440 | * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() | |
441 | * will return the char at zero. utext_char32At(negative index), on the other hand, will | |
442 | * return the U_SENTINEL value of -1.) | |
443 | * | |
444 | * @param ut the text to be accessed | |
445 | * @param nativeIndex the native index of the character to be accessed. If the index points | |
446 | * to other than the first unit of a multi-unit character, it will be adjusted | |
447 | * to the start of the character. | |
448 | * @return the code point at the specified index. | |
449 | * @stable ICU 3.4 | |
450 | */ | |
451 | U_STABLE UChar32 U_EXPORT2 | |
452 | utext_char32At(UText *ut, int64_t nativeIndex); | |
453 | ||
454 | ||
455 | /** | |
456 | * | |
457 | * Get the code point at the current iteration position, | |
458 | * or U_SENTINEL (-1) if the iteration has reached the end of | |
459 | * the input text. | |
460 | * | |
461 | * @param ut the text to be accessed. | |
462 | * @return the Unicode code point at the current iterator position. | |
463 | * @stable ICU 3.4 | |
464 | */ | |
465 | U_STABLE UChar32 U_EXPORT2 | |
466 | utext_current32(UText *ut); | |
467 | ||
468 | ||
469 | /** | |
470 | * Get the code point at the current iteration position of the UText, and | |
471 | * advance the position to the first index following the character. | |
472 | * | |
473 | * If the position is at the end of the text (the index following | |
474 | * the last character, which is also the length of the text), | |
475 | * return U_SENTINEL (-1) and do not advance the index. | |
476 | * | |
477 | * This is a post-increment operation. | |
478 | * | |
479 | * An inline macro version of this function, UTEXT_NEXT32(), | |
480 | * is available for performance critical use. | |
481 | * | |
482 | * @param ut the text to be accessed. | |
483 | * @return the Unicode code point at the iteration position. | |
484 | * @see UTEXT_NEXT32 | |
485 | * @stable ICU 3.4 | |
486 | */ | |
487 | U_STABLE UChar32 U_EXPORT2 | |
488 | utext_next32(UText *ut); | |
489 | ||
490 | ||
491 | /** | |
492 | * Move the iterator position to the character (code point) whose | |
493 | * index precedes the current position, and return that character. | |
494 | * This is a pre-decrement operation. | |
495 | * | |
496 | * If the initial position is at the start of the text (index of 0) | |
497 | * return U_SENTINEL (-1), and leave the position unchanged. | |
498 | * | |
499 | * An inline macro version of this function, UTEXT_PREVIOUS32(), | |
500 | * is available for performance critical use. | |
501 | * | |
502 | * @param ut the text to be accessed. | |
503 | * @return the previous UChar32 code point, or U_SENTINEL (-1) | |
504 | * if the iteration has reached the start of the text. | |
505 | * @see UTEXT_PREVIOUS32 | |
506 | * @stable ICU 3.4 | |
507 | */ | |
508 | U_STABLE UChar32 U_EXPORT2 | |
509 | utext_previous32(UText *ut); | |
510 | ||
511 | ||
512 | /** | |
513 | * Set the iteration index and return the code point at that index. | |
514 | * Leave the iteration index at the start of the following code point. | |
515 | * | |
516 | * This function is the most efficient and convenient way to | |
517 | * begin a forward iteration. The results are identical to those | |
518 | * from the sequence | |
519 | * \code | |
520 | * utext_setNativeIndex(); | |
521 | * utext_next32(); | |
522 | * \endcode | |
523 | * | |
524 | * @param ut the text to be accessed. | |
525 | * @param nativeIndex Iteration index, in the native units of the text provider. | |
526 | * @return Code point which starts at or before index, | |
527 | * or U_SENTINEL (-1) if it is out of bounds. | |
528 | * @stable ICU 3.4 | |
529 | */ | |
530 | U_STABLE UChar32 U_EXPORT2 | |
531 | utext_next32From(UText *ut, int64_t nativeIndex); | |
532 | ||
533 | ||
534 | ||
535 | /** | |
536 | * Set the iteration index, and return the code point preceding the | |
537 | * one specified by the initial index. Leave the iteration position | |
538 | * at the start of the returned code point. | |
539 | * | |
540 | * This function is the most efficient and convenient way to | |
541 | * begin a backwards iteration. | |
542 | * | |
543 | * @param ut the text to be accessed. | |
544 | * @param nativeIndex Iteration index in the native units of the text provider. | |
545 | * @return Code point preceding the one at the initial index, | |
546 | * or U_SENTINEL (-1) if it is out of bounds. | |
547 | * | |
548 | * @stable ICU 3.4 | |
549 | */ | |
550 | U_STABLE UChar32 U_EXPORT2 | |
551 | utext_previous32From(UText *ut, int64_t nativeIndex); | |
552 | ||
553 | /** | |
554 | * Get the current iterator position, which can range from 0 to | |
555 | * the length of the text. | |
556 | * The position is a native index into the input text, in whatever format it | |
557 | * may have (possibly UTF-8 for example), and may not always be the same as | |
558 | * the corresponding UChar (UTF-16) index. | |
559 | * The returned position will always be aligned to a code point boundary. | |
560 | * | |
561 | * @param ut the text to be accessed. | |
562 | * @return the current index position, in the native units of the text provider. | |
563 | * @stable ICU 3.4 | |
564 | */ | |
565 | U_STABLE int64_t U_EXPORT2 | |
566 | utext_getNativeIndex(const UText *ut); | |
567 | ||
568 | /** | |
569 | * Set the current iteration position to the nearest code point | |
570 | * boundary at or preceding the specified index. | |
571 | * The index is in the native units of the original input text. | |
572 | * If the index is out of range, it will be pinned to be within | |
573 | * the range of the input text. | |
574 | * <p> | |
575 | * It will usually be more efficient to begin an iteration | |
576 | * using the functions utext_next32From() or utext_previous32From() | |
577 | * rather than utext_setNativeIndex(). | |
578 | * <p> | |
579 | * Moving the index position to an adjacent character is best done | |
580 | * with utext_next32(), utext_previous32() or utext_moveIndex32(). | |
581 | * Attempting to do direct arithmetic on the index position is | |
582 | * complicated by the fact that the size (in native units) of a | |
583 | * character depends on the underlying representation of the character | |
584 | * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not | |
585 | * easily knowable. | |
586 | * | |
587 | * @param ut the text to be accessed. | |
588 | * @param nativeIndex the native unit index of the new iteration position. | |
589 | * @stable ICU 3.4 | |
590 | */ | |
591 | U_STABLE void U_EXPORT2 | |
592 | utext_setNativeIndex(UText *ut, int64_t nativeIndex); | |
593 | ||
594 | /** | |
595 | * Move the iterator position by delta code points. The number of code points | |
596 | * is a signed number; a negative delta will move the iterator backwards, | |
597 | * towards the start of the text. | |
598 | * <p> | |
599 | * The index is moved by <code>delta</code> code points | |
600 | * forward or backward, but no further backward than to 0 and | |
601 | * no further forward than to utext_nativeLength(). | |
602 | * The resulting index value will be in between 0 and length, inclusive. | |
603 | * | |
604 | * @param ut the text to be accessed. | |
605 | * @param delta the signed number of code points to move the iteration position. | |
606 | * @return TRUE if the position could be moved the requested number of positions while | |
607 | * staying within the range [0 - text length]. | |
608 | * @stable ICU 3.4 | |
609 | */ | |
610 | U_STABLE UBool U_EXPORT2 | |
611 | utext_moveIndex32(UText *ut, int32_t delta); | |
612 | ||
613 | /** | |
614 | * Get the native index of the character preceding the current position. | |
615 | * If the iteration position is already at the start of the text, zero | |
616 | * is returned. | |
617 | * The value returned is the same as that obtained from the following sequence, | |
618 | * but without the side effect of changing the iteration position. | |
619 | * | |
620 | * \code | |
621 | * UText *ut = whatever; | |
622 | * ... | |
623 | * utext_previous32(ut); | |
624 | * utext_getNativeIndex(ut); | |
625 | * \endcode | |
626 | * | |
627 | * This function is most useful during forwards iteration, where it will get the | |
628 | * native index of the character most recently returned from utext_next32(). | |
629 | * | |
630 | * @param ut the text to be accessed | |
631 | * @return the native index of the character preceding the current index position, | |
632 | * or zero if the current position is at the start of the text. | |
633 | * @stable ICU 3.6 | |
634 | */ | |
635 | U_STABLE int64_t U_EXPORT2 | |
636 | utext_getPreviousNativeIndex(UText *ut); | |
637 | ||
638 | ||
639 | /** | |
640 | * | |
641 | * Extract text from a UText into a UChar buffer. The range of text to be extracted | |
642 | * is specified in the native indices of the UText provider. These may not necessarily | |
643 | * be UTF-16 indices. | |
644 | * <p> | |
645 | * The size (number of 16 bit UChars) of the data to be extracted is returned. The | |
646 | * full number of UChars is returned, even when the extracted text is truncated | |
647 | * because the specified buffer size is too small. | |
648 | * <p> | |
649 | * The extracted string will (if you are a user) / must (if you are a text provider) | |
650 | * be NUL-terminated if there is sufficient space in the destination buffer. This | |
651 | * terminating NUL is not included in the returned length. | |
652 | * <p> | |
653 | * The iteration index is left at the position following the last extracted character. | |
654 | * | |
655 | * @param ut the UText from which to extract data. | |
656 | * @param nativeStart the native index of the first character to extract. | |
657 | * If the specified index is out of range, | |
658 | * it will be pinned to be within 0 <= index <= textLength | |
659 | * @param nativeLimit the native string index of the position following the last | |
660 | * character to extract. If the specified index is out of range, | |
661 | * it will be pinned to be within 0 <= index <= textLength. | |
662 | * nativeLimit must be >= nativeStart. | |
663 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed | |
664 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero | |
665 | * for precomputing the required size. | |
666 | * @param status receives any error status. | |
667 | * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the | |
668 | * buffer was too small. Returns number of UChars for preflighting. | |
669 | * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. | |
670 | * | |
671 | * @stable ICU 3.4 | |
672 | */ | |
673 | U_STABLE int32_t U_EXPORT2 | |
674 | utext_extract(UText *ut, | |
675 | int64_t nativeStart, int64_t nativeLimit, | |
676 | UChar *dest, int32_t destCapacity, | |
677 | UErrorCode *status); | |
678 | ||
679 | ||
680 | ||
681 | /************************************************************************************ | |
682 | * | |
683 | * #define inline versions of selected performance-critical text access functions | |
684 | * Caution: do not use auto increment++ or decrement-- expressions | |
685 | * as parameters to these macros. | |
686 | * | |
687 | * For most use, where there is no extreme performance constraint, the | |
688 | * normal, non-inline functions are a better choice. The resulting code | |
689 | * will be smaller, and, if the need ever arises, easier to debug. | |
690 | * | |
691 | * These are implemented as #defines rather than real functions | |
692 | * because there is no fully portable way to do inline functions in plain C. | |
693 | * | |
694 | ************************************************************************************/ | |
695 | ||
696 | #ifndef U_HIDE_INTERNAL_API | |
697 | /** | |
698 | * Macro counterpart of utext_current32(), intended for performance-critical code. | |
699 | * | |
700 | * Yields the code point at the current iteration position of the UText. | |
701 | * U_SENTINEL (-1) is returned when the position is at the end of the text. | |
702 | * The macro argument is evaluated more than once. | |
703 | * | |
704 | * @internal ICU 4.4 technology preview | |
705 | */ | |
706 | #define UTEXT_CURRENT32(ut) \ | |
707 | (((ut)->chunkOffset >= (ut)->chunkLength || (ut)->chunkContents[(ut)->chunkOffset] >= 0xd800) ? \ | |
708 | utext_current32(ut) : (ut)->chunkContents[(ut)->chunkOffset]) | |
709 | #endif /* U_HIDE_INTERNAL_API */ | |
710 | ||
711 | /** | |
712 | * Macro counterpart of utext_next32(), intended for performance-critical code. | |
713 | * | |
714 | * Yields the code point at the current iteration position of the UText and | |
715 | * then moves the position to the first index following that character; | |
716 | * in other words, this is a post-increment operation. | |
717 | * U_SENTINEL (-1) is returned when the position is at the end of the text. | |
718 | * The macro argument is evaluated more than once. | |
719 | * | |
720 | * @stable ICU 3.4 | |
721 | */ | |
722 | #define UTEXT_NEXT32(ut) \ | |
723 | (((ut)->chunkOffset >= (ut)->chunkLength || (ut)->chunkContents[(ut)->chunkOffset] >= 0xd800) ? \ | |
724 | utext_next32(ut) : (ut)->chunkContents[(ut)->chunkOffset++]) | |
725 | ||
726 | /** | |
727 | * Macro counterpart of utext_previous32(), intended for performance-critical code. | |
728 | * | |
729 | * Steps the iteration position back to the code point whose index precedes | |
730 | * the current position and yields that code point (a pre-decrement operation). | |
731 | * U_SENTINEL (-1) is returned when the position is at the start of the text. | |
732 | * The macro argument is evaluated more than once. | |
733 | * | |
734 | * @stable ICU 3.4 | |
735 | */ | |
736 | #define UTEXT_PREVIOUS32(ut) \ | |
737 | (((ut)->chunkOffset <= 0 || \ | |
738 | (ut)->chunkContents[(ut)->chunkOffset - 1] >= 0xd800) ? \ | |
739 | utext_previous32(ut) : (ut)->chunkContents[--(ut)->chunkOffset]) | |
740 | ||
741 | /** | |
742 | * Macro counterpart of utext_getNativeIndex(), intended for performance-critical code. | |
743 | * | |
744 | * Yields the current iterator position, a value between 0 and the length of | |
745 | * the text. | |
746 | * The position is expressed as a native index into the input text, whatever | |
747 | * its format (possibly UTF-8, for example), so it is not necessarily equal to | |
748 | * the corresponding UChar (UTF-16) index. | |
749 | * The returned position is always aligned to a code point boundary. | |
750 | * | |
751 | * @stable ICU 3.6 | |
752 | */ | |
753 | #define UTEXT_GETNATIVEINDEX(ut) \ | |
754 | ((ut)->chunkOffset > (ut)->nativeIndexingLimit ? \ | |
755 | (ut)->pFuncs->mapOffsetToNative(ut) : \ | |
756 | (ut)->chunkNativeStart + (ut)->chunkOffset) | |
757 | ||
758 | /** | |
759 | * inline version of utext_setNativeIndex(), for performance-critical situations. | |
760 | * | |
761 | * Set the current iteration position to the nearest code point | |
762 | * boundary at or preceding the specified index. | |
763 | * The index is in the native units of the original input text. | |
764 | * If the index is out of range, it will be pinned to be within | |
765 | * the range of the input text. | |
766 | * Caution: (ut) and (ix) are evaluated more than once; avoid arguments with side effects. | |
767 | * @stable ICU 3.8 | |
768 | */ | |
769 | #if LOG_UTEXT_SETNATIVEINDEX | |
770 | /* Add logging for <rdar://problem/44884660>: an in-range offset with a NULL chunkContents is logged and the position is left unchanged. */ | |
771 | #define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \ | |
772 | int64_t __offset = (ix) - (ut)->chunkNativeStart; \ | |
773 | if ((ut)->chunkContents!=0 && __offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \ | |
774 | (ut)->chunkOffset=(int32_t)__offset; \ | |
775 | } else if ((ut)->chunkContents==0 && __offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit) { \ | |
776 | os_log(OS_LOG_DEFAULT, "# UTEXT_SETNATIVEINDEX (ut) %p, (ut)->chunkContents 0, __offset %lld", (ut), __offset); \ | |
777 | } else { \ | |
778 | utext_setNativeIndex((ut), (ix)); \ | |
779 | } \ | |
780 | } UPRV_BLOCK_MACRO_END | |
781 | #else | |
782 | #define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \ | |
783 | int64_t __offset = (ix) - (ut)->chunkNativeStart; \ | |
784 | if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \ | |
785 | (ut)->chunkOffset=(int32_t)__offset; \ | |
786 | } else { \ | |
787 | utext_setNativeIndex((ut), (ix)); \ | |
788 | } \ | |
789 | } UPRV_BLOCK_MACRO_END | |
790 | #endif | |
791 | ||
792 | ||
793 | ||
794 | /************************************************************************************ | |
795 | * | |
796 | * Functions related to writing or modifying the text. | |
797 | * These will work only with modifiable UTexts. Attempting to | |
798 | * modify a read-only UText will return an error status. | |
799 | * | |
800 | ************************************************************************************/ | |
801 | ||
802 | ||
803 | /** | |
804 | * Return TRUE if the text can be written (modified) with utext_replace() or | |
805 | * utext_copy(). For the text to be writable, the text provider must | |
806 | * be of a type that supports writing and the UText must not be frozen. | |
807 | * | |
808 | * Attempting to modify text when utext_isWritable() is FALSE will fail - | |
809 | * the text will not be modified, and an error will be returned from the function | |
810 | * that attempted the modification. | |
811 | * | |
812 | * @param ut the UText to be tested. | |
813 | * @return TRUE if the text is modifiable, FALSE otherwise. | |
814 | * | |
815 | * @see utext_freeze() | |
816 | * @see utext_replace() | |
817 | * @see utext_copy() | |
818 | * @stable ICU 3.4 | |
819 | * | |
820 | */ | |
821 | U_STABLE UBool U_EXPORT2 | |
822 | utext_isWritable(const UText *ut); | |
823 | ||
824 | ||
825 | /** | |
826 | * Test whether there is meta data associated with the text. | |
827 | * @see Replaceable::hasMetaData() | |
828 | * | |
829 | * @param ut The UText to be tested. | |
830 | * @return TRUE if the underlying text includes meta data, FALSE otherwise. | |
831 | * @stable ICU 3.4 | |
832 | */ | |
833 | U_STABLE UBool U_EXPORT2 | |
834 | utext_hasMetaData(const UText *ut); | |
835 | ||
836 | ||
837 | /** | |
838 | * Replace a range of the original text with a replacement text. | |
839 | * | |
840 | * Leaves the current iteration position at the position following the | |
841 | * newly inserted replacement text. | |
842 | * | |
843 | * This function is only available on UText types that support writing, | |
844 | * that is, ones where utext_isWritable() returns TRUE. | |
845 | * | |
846 | * When using this function, there should be only a single UText opened onto the | |
847 | * underlying native text string. Behavior after a replace operation | |
848 | * on a UText is undefined for any other additional UTexts that refer to the | |
849 | * modified string. | |
850 | * | |
851 | * @param ut the UText representing the text to be operated on. | |
852 | * @param nativeStart the native index of the start of the region to be replaced. | |
853 | * @param nativeLimit the native index of the character following the region to be replaced. | |
854 | * @param replacementText pointer to the replacement text (a UChar string). | |
855 | * @param replacementLength length of the replacement text in UChars, or -1 if the text is NUL-terminated. | |
856 | * @param status receives any error status. Possible errors include | |
857 | * U_NO_WRITE_PERMISSION. | |
858 | * | |
859 | * @return The signed number of (native) storage units by which | |
860 | * the length of the text expanded or contracted. | |
861 | * | |
862 | * @stable ICU 3.4 | |
863 | */ | |
864 | U_STABLE int32_t U_EXPORT2 | |
865 | utext_replace(UText *ut, | |
866 | int64_t nativeStart, int64_t nativeLimit, | |
867 | const UChar *replacementText, int32_t replacementLength, | |
868 | UErrorCode *status); | |
869 | ||
870 | ||
871 | ||
872 | /** | |
873 | * | |
874 | * Copy or move a substring from one position to another within the text, | |
875 | * while retaining any metadata associated with the text. | |
876 | * This function is used to duplicate or reorder substrings. | |
877 | * The destination index must not overlap the source range. | |
878 | * | |
879 | * The text to be copied or moved is inserted at destIndex; | |
880 | * it does not replace or overwrite any existing text. | |
881 | * | |
882 | * The iteration position is left following the newly inserted text | |
883 | * at the destination position. | |
884 | * | |
885 | * This function is only available on UText types that support writing, | |
886 | * that is, ones where utext_isWritable() returns TRUE. | |
887 | * | |
888 | * When using this function, there should be only a single UText opened onto the | |
889 | * underlying native text string. Behavior after a copy operation | |
890 | * on a UText is undefined for any other additional UTexts that refer to the | |
891 | * modified string. | |
892 | * | |
893 | * @param ut The UText representing the text to be operated on. | |
894 | * @param nativeStart The native index of the start of the region to be copied or moved. | |
895 | * @param nativeLimit The native index of the character position following the region | |
896 | * to be copied or moved. | |
897 | * @param destIndex The native destination index to which the source substring is | |
898 | * copied or moved. | |
899 | * @param move If TRUE, then the substring is moved, not copied/duplicated. | |
900 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION. | |
901 | * | |
902 | * @stable ICU 3.4 | |
903 | */ | |
904 | U_STABLE void U_EXPORT2 | |
905 | utext_copy(UText *ut, | |
906 | int64_t nativeStart, int64_t nativeLimit, | |
907 | int64_t destIndex, | |
908 | UBool move, | |
909 | UErrorCode *status); | |
910 | ||
911 | ||
912 | /** | |
913 | * <p> | |
914 | * Freeze a UText. This prevents any modification to the underlying text itself | |
915 | * by means of functions operating on this UText. | |
916 | * </p> | |
917 | * <p> | |
918 | * Once frozen, a UText cannot be unfrozen. The intent is to ensure | |
919 | * that the text underlying a frozen UText wrapper cannot be modified via that UText. | |
920 | * </p> | |
921 | * <p> | |
922 | * Caution: freezing a UText will disable changes made via the specific | |
923 | * frozen UText wrapper only; it will not have any effect on the ability to | |
924 | * directly modify the text by bypassing the UText. Any such backdoor modifications | |
925 | * are always an error while UText access is occurring because the underlying | |
926 | * text can get out of sync with UText's buffering. | |
927 | * </p> | |
928 | * | |
929 | * @param ut The UText to be frozen. | |
930 | * @see utext_isWritable() | |
931 | * @stable ICU 3.6 | |
932 | */ | |
933 | U_STABLE void U_EXPORT2 | |
934 | utext_freeze(UText *ut); | |
935 | ||
936 | ||
937 | /** | |
938 | * UText provider properties (bit field indexes): each enumerator is a bit position, not a mask value. | |
939 | * | |
940 | * @see UText | |
941 | * @stable ICU 3.4 | |
942 | */ | |
943 | enum { | |
944 | /** | |
945 | * It is potentially time-consuming for the provider to determine the length of the text. | |
946 | * @stable ICU 3.4 | |
947 | */ | |
948 | UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, | |
949 | /** | |
950 | * Text chunks remain valid and usable until the text object is modified or | |
951 | * deleted, not just until the next time the access() function is called | |
952 | * (which is the default). | |
953 | * @stable ICU 3.4 | |
954 | */ | |
955 | UTEXT_PROVIDER_STABLE_CHUNKS = 2, | |
956 | /** | |
957 | * The provider supports modifying the text via the replace() and copy() | |
958 | * functions. | |
959 | * @see Replaceable | |
960 | * @stable ICU 3.4 | |
961 | */ | |
962 | UTEXT_PROVIDER_WRITABLE = 3, | |
963 | /** | |
964 | * There is meta data associated with the text. | |
965 | * @see Replaceable::hasMetaData() | |
966 | * @stable ICU 3.4 | |
967 | */ | |
968 | UTEXT_PROVIDER_HAS_META_DATA = 4, | |
969 | /** | |
970 | * Text provider owns the text storage. | |
971 | * Generally occurs as the result of a deep clone of the UText. | |
972 | * When closing the UText, the associated text must | |
973 | * also be closed, deleted or freed, whichever is appropriate. | |
974 | * @stable ICU 3.6 | |
975 | */ | |
976 | UTEXT_PROVIDER_OWNS_TEXT = 5 | |
977 | }; | |
978 | ||
979 | /** | |
980 | * Function type declaration for UText.clone(). | |
981 | * | |
982 | * Clone a UText. Much like opening a UText where the source text is itself | |
983 | * another UText. | |
984 | * | |
985 | * A deep clone will copy both the UText data structures and the underlying text. | |
986 | * The original and cloned UText will operate completely independently; modifications | |
987 | * made to the text in one will not affect the other. Text providers are not | |
988 | * required to support deep clones. The user of clone() must check the status return | |
989 | * and be prepared to handle failures. | |
990 | * | |
991 | * A shallow clone replicates only the UText data structures; it does not make | |
992 | * a copy of the underlying text. Shallow clones can be used as an efficient way to | |
993 | * have multiple iterators active in a single text string that is not being | |
994 | * modified. | |
995 | * | |
996 | * A shallow clone operation must not fail except for truly exceptional conditions such | |
997 | * as memory allocation failures. | |
998 | * | |
999 | * A UText and its clone may be safely concurrently accessed by separate threads. | |
1000 | * This is true for both shallow and deep clones. | |
1001 | * It is the responsibility of the Text Provider to ensure that this thread safety | |
1002 | * constraint is met. | |
1003 | * | |
1004 | * | |
1005 | * @param dest A UText struct to be filled in with the result of the clone operation, | |
1006 | * or NULL if the clone function should heap-allocate a new UText struct. | |
1007 | * @param src The UText to be cloned. | |
1008 | * @param deep TRUE to request a deep clone, FALSE for a shallow clone. | |
1009 | * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR | |
1010 | * should be returned if the text provider is unable to clone the | |
1011 | * original text. | |
1012 | * @return The newly created clone, or NULL if the clone operation failed. | |
1013 | * | |
1014 | * @stable ICU 3.4 | |
1015 | */ | |
1016 | typedef UText * U_CALLCONV | |
1017 | UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); | |
1018 | ||
1019 | ||
1020 | /** | |
1021 | * Function type declaration for UText.nativeLength(). May be expensive to compute (see UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE). | |
1022 | * | |
1023 | * @param ut the UText to get the length of. | |
1024 | * @return the length, in the native units of the original text string. | |
1025 | * @see UText | |
1026 | * @stable ICU 3.4 | |
1027 | */ | |
1028 | typedef int64_t U_CALLCONV | |
1029 | UTextNativeLength(UText *ut); | |
1030 | ||
1031 | /** | |
1032 | * Function type declaration for UText.access(). Get the description of the text chunk | |
1033 | * containing the text at a requested native index. The UText's iteration | |
1034 | * position will be left at the requested index. If the index is out | |
1035 | * of bounds, the iteration position will be left at the start or end | |
1036 | * of the string, as appropriate. | |
1037 | * | |
1038 | * Chunks must begin and end on code point boundaries. A single code point | |
1039 | * composed of multiple storage units must never span a chunk boundary. | |
1040 | * | |
1041 | * | |
1042 | * @param ut the UText being accessed. | |
1043 | * @param nativeIndex Requested native index of the text to be accessed. | |
1044 | * @param forward If TRUE, then the returned chunk must contain text | |
1045 | * starting from the index, so that start<=index<limit. | |
1046 | * If FALSE, then the returned chunk must contain text | |
1047 | * before the index, so that start<index<=limit. | |
1048 | * @return TRUE if the requested index could be accessed. The chunk | |
1049 | * will contain the requested text. | |
1050 | * FALSE if a chunk cannot be accessed | |
1051 | * (the requested index is out of bounds). | |
1052 | * | |
1053 | * @see UText | |
1054 | * @stable ICU 3.4 | |
1055 | */ | |
1056 | typedef UBool U_CALLCONV | |
1057 | UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); | |
1058 | ||
1059 | /** | |
1060 | * Function type declaration for UText.extract(). | |
1061 | * | |
1062 | * Extract text from a UText into a UChar buffer. The range of text to be extracted | |
1063 | * is specified in the native indices of the UText provider. These may not necessarily | |
1064 | * be UTF-16 indices. | |
1065 | * <p> | |
1066 | * The size (number of 16-bit UChars) of the data to be extracted is returned. The | |
1067 | * full amount is returned, even when the specified buffer size is smaller. | |
1068 | * <p> | |
1069 | * The extracted string will (if you are a user) / must (if you are a text provider) | |
1070 | * be NUL-terminated if there is sufficient space in the destination buffer. | |
1071 | * | |
1072 | * @param ut the UText from which to extract data. | |
1073 | * @param nativeStart the native index of the first character to extract. | |
1074 | * @param nativeLimit the native string index of the position following the last | |
1075 | * character to extract. | |
1076 | * @param dest the UChar (UTF-16) buffer into which the extracted text is placed. | |
1077 | * @param destCapacity The size, in UChars, of the destination buffer. May be zero | |
1078 | * for precomputing the required size. | |
1079 | * @param status receives any error status. | |
1080 | * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for | |
1081 | * preflighting. | |
1082 | * @return Number of UChars in the data. Does not include a trailing NUL. | |
1083 | * | |
1084 | * @stable ICU 3.4 | |
1085 | */ | |
1086 | typedef int32_t U_CALLCONV | |
1087 | UTextExtract(UText *ut, | |
1088 | int64_t nativeStart, int64_t nativeLimit, | |
1089 | UChar *dest, int32_t destCapacity, | |
1090 | UErrorCode *status); | |
1091 | ||
1092 | /** | |
1093 | * Function type declaration for UText.replace(). | |
1094 | * | |
1095 | * Replace a range of the original text with a replacement text. | |
1096 | * | |
1097 | * Leaves the current iteration position at the position following the | |
1098 | * newly inserted replacement text. | |
1099 | * | |
1100 | * This function need only be implemented on UText types that support writing. | |
1101 | * | |
1102 | * When using this function, there should be only a single UText opened onto the | |
1103 | * underlying native text string. The function is responsible for updating the | |
1104 | * text chunk within the UText to reflect the updated iteration position, | |
1105 | * taking into account any changes to the underlying string's structure caused | |
1106 | * by the replace operation. | |
1107 | * | |
1108 | * @param ut the UText representing the text to be operated on. | |
1109 | * @param nativeStart the native index of the start of the region to be replaced. | |
1110 | * @param nativeLimit the native index of the character following the region to be replaced. | |
1111 | * @param replacementText pointer to the replacement text. | |
1112 | * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL-terminated. | |
1113 | * @param status receives any error status. Possible errors include | |
1114 | * U_NO_WRITE_PERMISSION. | |
1115 | * | |
1116 | * @return The signed number of (native) storage units by which | |
1117 | * the length of the text expanded or contracted. | |
1118 | * | |
1119 | * @stable ICU 3.4 | |
1120 | */ | |
1121 | typedef int32_t U_CALLCONV | |
1122 | UTextReplace(UText *ut, | |
1123 | int64_t nativeStart, int64_t nativeLimit, | |
1124 | const UChar *replacementText, int32_t replacmentLength, | |
1125 | UErrorCode *status); | |
1126 | ||
1127 | /** | |
1128 | * Function type declaration for UText.copy(). | |
1129 | * | |
1130 | * Copy or move a substring from one position to another within the text, | |
1131 | * while retaining any metadata associated with the text. | |
1132 | * This function is used to duplicate or reorder substrings. | |
1133 | * The destination index must not overlap the source range. | |
1134 | * | |
1135 | * The text to be copied or moved is inserted at nativeDest; | |
1136 | * it does not replace or overwrite any existing text. | |
1137 | * | |
1138 | * This function need only be implemented for UText types that support writing. | |
1139 | * | |
1140 | * When using this function, there should be only a single UText opened onto the | |
1141 | * underlying native text string. The function is responsible for updating the | |
1142 | * text chunk within the UText to reflect the updated iteration position, | |
1143 | * taking into account any changes to the underlying string's structure caused | |
1144 | * by the copy or move operation. | |
1145 | * | |
1146 | * @param ut The UText representing the text to be operated on. | |
1147 | * @param nativeStart The index of the start of the region to be copied or moved. | |
1148 | * @param nativeLimit The index of the character following the region to be copied or moved. | |
1149 | * @param nativeDest The destination index to which the source substring is copied or moved. | |
1150 | * @param move If TRUE, then the substring is moved, not copied/duplicated. | |
1151 | * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION. | |
1152 | * | |
1153 | * @stable ICU 3.4 | |
1154 | */ | |
1155 | typedef void U_CALLCONV | |
1156 | UTextCopy(UText *ut, | |
1157 | int64_t nativeStart, int64_t nativeLimit, | |
1158 | int64_t nativeDest, | |
1159 | UBool move, | |
1160 | UErrorCode *status); | |
1161 | ||
1162 | /** | |
1163 | * Function type declaration for UText.mapOffsetToNative(). | |
1164 | * Map from the current UChar offset within the current text chunk to | |
1165 | * the corresponding native index in the original source text. | |
1166 | * | |
1167 | * This function is required only for text providers that do not use native UTF-16 indexes. | |
1168 | * | |
1169 | * @param ut the UText whose current chunk offset is to be mapped. | |
1170 | * @return Absolute (native) index corresponding to chunkOffset in the current chunk. | |
1171 | * The returned native index should always be on a code point boundary. | |
1172 | * | |
1173 | * @stable ICU 3.4 | |
1174 | */ | |
1175 | typedef int64_t U_CALLCONV | |
1176 | UTextMapOffsetToNative(const UText *ut); | |
1177 | ||
1178 | /** | |
1179 | * Function type declaration for UText.mapNativeIndexToUTF16(). | |
1180 | * Map from a native index to a UChar offset within a text chunk. | |
1181 | * Behavior is undefined if the native index does not fall within the | |
1182 | * current chunk. | |
1183 | * | |
1184 | * This function is required only for text providers that do not use native UTF-16 indexes. | |
1185 | * | |
1186 | * @param ut The UText containing the text chunk. | |
1187 | * @param nativeIndex Absolute (native) text index, chunkNativeStart<=nativeIndex<=chunkNativeLimit. | |
1188 | * @return Chunk-relative UTF-16 offset corresponding to the specified native | |
1189 | * index. | |
1190 | * | |
1191 | * @stable ICU 3.4 | |
1192 | */ | |
1193 | typedef int32_t U_CALLCONV | |
1194 | UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); | |
1195 | ||
1196 | ||
1197 | /** | |
1198 | * Function type declaration for UText.close(). | |
1199 | * | |
1200 | * A Text Provider close function is only required for provider types that make | |
1201 | * allocations in their open function (or other functions) that must be | |
1202 | * cleaned up when the UText is closed. | |
1203 | * | |
1204 | * The allocation of the UText struct itself and any "extra" storage | |
1205 | * associated with the UText is handled by the common UText implementation | |
1206 | * and does not require provider-specific cleanup in a close function. | |
1207 | * | |
1208 | * Most UText provider implementations do not need to implement this function. | |
1209 | * | |
1210 | * @param ut A UText object to be closed. | |
1211 | * | |
1212 | * @stable ICU 3.4 | |
1213 | */ | |
1214 | typedef void U_CALLCONV | |
1215 | UTextClose(UText *ut); | |
1216 | ||
1217 | ||
1218 | /** | |
1219 | * (public) Function dispatch table for UText. | |
1220 | * Conceptually very much like a C++ Virtual Function Table. | |
1221 | * This struct defines the organization of the table. | |
1222 | * Each text provider implementation must provide an | |
1223 | * actual table that is initialized with the appropriate functions | |
1224 | * for the type of text being handled. | |
1225 | * @stable ICU 3.6 | |
1226 | */ | |
1227 | struct UTextFuncs { | |
1228 | /** | |
1229 | * (public) Function table size, sizeof(UTextFuncs). | |
1230 | * Intended for use should the table grow to accommodate added | |
1231 | * functions in the future, to allow tests for older format | |
1232 | * function tables that do not contain the extensions. | |
1233 | * | |
1234 | * Fields are placed for optimal alignment on | |
1235 | * 32/64/128-bit-pointer machines, by normally grouping together | |
1236 | * 4 32-bit fields, | |
1237 | * 4 pointers, | |
1238 | * 2 64-bit fields | |
1239 | * in sequence. | |
1240 | * @stable ICU 3.6 | |
1241 | */ | |
1242 | int32_t tableSize; | |
1243 | |
1244 | /** | |
1245 | * (private) Alignment padding. | |
1246 | * Do not use; reserved for use by the UText framework only. | |
1247 | * @internal | |
1248 | */ | |
1249 | int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; | |
1250 | |
1251 | |
1252 | /** | |
1253 | * (public) Function pointer for UTextClone. | |
1254 | * | |
1255 | * @see UTextClone | |
1256 | * @stable ICU 3.6 | |
1257 | */ | |
1258 | UTextClone *clone; | |
1259 | |
1260 | /** | |
1261 | * (public) Function pointer for UTextNativeLength. | |
1262 | * May be expensive to compute! | |
1263 | * | |
1264 | * @see UTextNativeLength | |
1265 | * @stable ICU 3.6 | |
1266 | */ | |
1267 | UTextNativeLength *nativeLength; | |
1268 | |
1269 | /** | |
1270 | * (public) Function pointer for UTextAccess. | |
1271 | * | |
1272 | * @see UTextAccess | |
1273 | * @stable ICU 3.6 | |
1274 | */ | |
1275 | UTextAccess *access; | |
1276 | |
1277 | /** | |
1278 | * (public) Function pointer for UTextExtract. | |
1279 | * | |
1280 | * @see UTextExtract | |
1281 | * @stable ICU 3.6 | |
1282 | */ | |
1283 | UTextExtract *extract; | |
1284 | |
1285 | /** | |
1286 | * (public) Function pointer for UTextReplace. | |
1287 | * | |
1288 | * @see UTextReplace | |
1289 | * @stable ICU 3.6 | |
1290 | */ | |
1291 | UTextReplace *replace; | |
1292 | |
1293 | /** | |
1294 | * (public) Function pointer for UTextCopy. | |
1295 | * | |
1296 | * @see UTextCopy | |
1297 | * @stable ICU 3.6 | |
1298 | */ | |
1299 | UTextCopy *copy; | |
1300 | |
1301 | /** | |
1302 | * (public) Function pointer for UTextMapOffsetToNative. | |
1303 | * | |
1304 | * @see UTextMapOffsetToNative | |
1305 | * @stable ICU 3.6 | |
1306 | */ | |
1307 | UTextMapOffsetToNative *mapOffsetToNative; | |
1308 | |
1309 | /** | |
1310 | * (public) Function pointer for UTextMapNativeIndexToUTF16. | |
1311 | * | |
1312 | * @see UTextMapNativeIndexToUTF16 | |
1313 | * @stable ICU 3.6 | |
1314 | */ | |
1315 | UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; | |
1316 | |
1317 | /** | |
1318 | * (public) Function pointer for UTextClose. | |
1319 | * | |
1320 | * @see UTextClose | |
1321 | * @stable ICU 3.6 | |
1322 | */ | |
1323 | UTextClose *close; | |
1324 | |
1325 | /** | |
1326 | * (private) Spare function pointer. | |
1327 | * @internal | |
1328 | */ | |
1329 | UTextClose *spare1; | |
1330 | |
1331 | /** | |
1332 | * (private) Spare function pointer. | |
1333 | * @internal | |
1334 | */ | |
1335 | UTextClose *spare2; | |
1336 | |
1337 | /** | |
1338 | * (private) Spare function pointer. | |
1339 | * @internal | |
1340 | */ | |
1341 | UTextClose *spare3; | |
1342 | |
1343 | }; | |
1344 | /** | |
1345 | * Function dispatch table for UText. | |
1346 | * @see UTextFuncs | |
1347 | */ | |
1348 | typedef struct UTextFuncs UTextFuncs; | |
1349 | ||
1350 | /** | |
1351 | * UText struct. Provides the interface between the generic UText access code | |
1352 | * and the UText provider code that works on specific kinds of | |
1353 | * text (UTF-8, noncontiguous UTF-16, whatever.) | |
1354 | * | |
1355 | * Applications that are using predefined types of text providers | |
1356 | * to pass text data to ICU services will have no need to view the | |
1357 | * internals of the UText structs that they open. | |
1358 | * | |
1359 | * @stable ICU 3.6 | |
1360 | */ | |
1361 | struct UText { | |
1362 | /** | |
1363 | * (private) Magic. Used to help detect when UText functions are handed | |
1364 | * invalid or uninitialized UText structs. | |
1365 | * utext_openXYZ() functions take an initialized, | |
1366 | * but not necessarily open, UText struct as an | |
1367 | * optional fill-in parameter. This magic field | |
1368 | * is used to check for that initialization. | |
1369 | * Text provider close functions must NOT clear | |
1370 | * the magic field because that would prevent | |
1371 | * reuse of the UText struct. | |
1372 | * @internal | |
1373 | */ | |
1374 | uint32_t magic; | |
1375 | ||
1376 | ||
1377 | /** | |
1378 | * (private) Flags for managing the allocation and freeing of | |
1379 | * memory associated with this UText. | |
1380 | * @internal | |
1381 | */ | |
1382 | int32_t flags; | |
1383 | ||
1384 | ||
1385 | /** | |
1386 | * Text provider properties. This set of flags is maintained by the | |
1387 | * text provider implementation. | |
1388 | * @stable ICU 3.4 | |
1389 | */ | |
1390 | int32_t providerProperties; | |
1391 | ||
1392 | /** | |
1393 | * (public) sizeOfStruct=sizeof(UText) | |
1394 | * Allows possible backward compatible extension. | |
1395 | * | |
1396 | * @stable ICU 3.4 | |
1397 | */ | |
1398 | int32_t sizeOfStruct; | |
1399 | ||
1400 | /* ------ 16 byte alignment boundary ----------- */ | |
1401 | ||
1402 | ||
1403 | /** | |
1404 | * (protected) Native index of the first character position following | |
1405 | * the current chunk. | |
1406 | * @stable ICU 3.6 | |
1407 | */ | |
1408 | int64_t chunkNativeLimit; | |
1409 | ||
1410 | /** | |
1411 | * (protected) Size in bytes of the extra space (pExtra). | |
1412 | * @stable ICU 3.4 | |
1413 | */ | |
1414 | int32_t extraSize; | |
1415 | ||
1416 | /** | |
1417 | * (protected) The highest chunk offset where native indexing and | |
1418 | * chunk (UTF-16) indexing correspond. For UTF-16 sources, value | |
1419 | * will be equal to chunkLength. | |
1420 | * | |
1421 | * @stable ICU 3.6 | |
1422 | */ | |
1423 | int32_t nativeIndexingLimit; | |
1424 | ||
1425 | /* ---- 16 byte alignment boundary------ */ | |
1426 | ||
1427 | /** | |
1428 | * (protected) Native index of the first character in the text chunk. | |
1429 | * @stable ICU 3.6 | |
1430 | */ | |
1431 | int64_t chunkNativeStart; | |
1432 | ||
1433 | /** | |
1434 | * (protected) Current iteration position within the text chunk (UTF-16 buffer). | |
1435 | * This is the index to the character that will be returned by utext_next32(). | |
1436 | * @stable ICU 3.6 | |
1437 | */ | |
1438 | int32_t chunkOffset; | |
1439 | ||
1440 | /** | |
1441 | * (protected) Length of the text chunk (UTF-16 buffer), in UChars. | |
1442 | * @stable ICU 3.6 | |
1443 | */ | |
1444 | int32_t chunkLength; | |
1445 | ||
1446 | /* ---- 16 byte alignment boundary-- */ | |
1447 | ||
1448 | ||
1449 | /** | |
1450 | * (protected) pointer to a chunk of text in UTF-16 format. | |
1451 | * May refer either to original storage of the source of the text, or | |
1452 | * if conversion was required, to a buffer owned by the UText. | |
1453 | * @stable ICU 3.6 | |
1454 | */ | |
1455 | const UChar *chunkContents; | |
1456 | ||
1457 | /** | |
1458 | * (public) Pointer to Dispatch table for accessing functions for this UText. | |
1459 | * @stable ICU 3.6 | |
1460 | */ | |
1461 | const UTextFuncs *pFuncs; | |
1462 | ||
1463 | /** | |
1464 | * (protected) Pointer to additional space requested by the | |
1465 | * text provider during the utext_open operation. | |
1466 | * @stable ICU 3.4 | |
1467 | */ | |
1468 | void *pExtra; | |
1469 | ||
1470 | /** | |
1471 | * (protected) Pointer to string or text-containing object or similar. | |
1472 | * This is the source of the text that this UText is wrapping, in a format | |
1473 | * that is known to the text provider functions. | |
1474 | * @stable ICU 3.4 | |
1475 | */ | |
1476 | const void *context; | |
1477 | ||
1478 | /* --- 16 byte alignment boundary--- */ | |
1479 | ||
1480 | /** | |
1481 | * (protected) Pointer fields available for use by the text provider. | |
1482 | * Not used by UText common code. | |
1483 | * @stable ICU 3.6 | |
1484 | */ | |
1485 | const void *p; | |
1486 | /** | |
1487 | * (protected) Pointer fields available for use by the text provider. | |
1488 | * Not used by UText common code. | |
1489 | * @stable ICU 3.6 | |
1490 | */ | |
1491 | const void *q; | |
1492 | /** | |
1493 | * (protected) Pointer fields available for use by the text provider. | |
1494 | * Not used by UText common code. | |
1495 | * @stable ICU 3.6 | |
1496 | */ | |
1497 | const void *r; | |
1498 | ||
1499 | /** | |
1500 | * Private field reserved for future use by the UText framework | |
1501 | * itself. This is not to be touched by the text providers. | |
1502 | * @internal ICU 3.4 | |
1503 | */ | |
1504 | void *privP; | |
1505 | ||
1506 | ||
1507 | /* --- 16 byte alignment boundary--- */ | |
1508 | ||
1509 | ||
1510 | /** | |
1511 | * (protected) Integer field reserved for use by the text provider. | |
1512 | * Not used by the UText framework, or by the client (user) of the UText. | |
1513 | * @stable ICU 3.4 | |
1514 | */ | |
1515 | int64_t a; | |
1516 | ||
1517 | /** | |
1518 | * (protected) Integer field reserved for use by the text provider. | |
1519 | * Not used by the UText framework, or by the client (user) of the UText. | |
1520 | * @stable ICU 3.4 | |
1521 | */ | |
1522 | int32_t b; | |
1523 | ||
1524 | /** | |
1525 | * (protected) Integer field reserved for use by the text provider. | |
1526 | * Not used by the UText framework, or by the client (user) of the UText. | |
1527 | * @stable ICU 3.4 | |
1528 | */ | |
1529 | int32_t c; | |
1530 | ||
1531 | /* ---- 16 byte alignment boundary---- */ | |
1532 | ||
1533 | ||
1534 | /** | |
1535 | * Private field reserved for future use by the UText framework | |
1536 | * itself. This is not to be touched by the text providers. | |
1537 | * @internal ICU 3.4 | |
1538 | */ | |
1539 | int64_t privA; | |
1540 | /** | |
1541 | * Private field reserved for future use by the UText framework | |
1542 | * itself. This is not to be touched by the text providers. | |
1543 | * @internal ICU 3.4 | |
1544 | */ | |
1545 | int32_t privB; | |
1546 | /** | |
1547 | * Private field reserved for future use by the UText framework | |
1548 | * itself. This is not to be touched by the text providers. | |
1549 | * @internal ICU 3.4 | |
1550 | */ | |
1551 | int32_t privC; | |
1552 | }; | |
1553 | ||
1554 | ||
1555 | /** | |
1556 | * Common function for use by Text Provider implementations to allocate and/or initialize | |
1557 | * a new UText struct. To be called in the implementation of utext_open() functions. | |
1558 | * If the supplied UText parameter is NULL, a new UText struct will be allocated on the heap. | |
1559 | * If the supplied UText is already open, the provider's close function will be called | |
1560 | * so that the struct can be reused by the open that is in progress. | |
1561 | * | |
1562 | * @param ut Pointer to a UText struct to be re-used, or NULL if a new UText | |
1563 | * should be allocated. | |
1564 | * @param extraSpace The amount of additional space to be allocated as part | |
1565 | * of this UText, for use by types of providers that require | |
1566 | * additional storage (presumably in bytes, like extraSize - TODO confirm). | |
1567 | * @param status Errors are returned here. | |
1568 | * @return Pointer to the UText, allocated if necessary, with extra space set up if requested. | |
1569 | * @stable ICU 3.4 | |
1570 | */ | |
1571 | U_STABLE UText * U_EXPORT2 | |
1572 | utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); | |
1573 | ||
1574 | // Do not use #ifndef U_HIDE_INTERNAL_API around the following: UTEXT_INITIALIZER depends on it. | |
1575 | /** | |
1576 | * @internal | |
1577 | * Signature value used to help identify correctly initialized UText structs. | |
1578 | * Note: must be publicly visible because UTEXT_INITIALIZER uses it as its first (magic) value. | |
1579 | */ | |
1580 | enum { | |
1581 | UTEXT_MAGIC = 0x345ad82c | |
1582 | }; | |
1583 | ||
1584 | /** | |
1585 | * Initializer to be used with local (stack) instances of a UText | |
1586 | * struct. UText structs must be initialized before passing | |
1587 | * them to one of the utext_open functions. | |
1588 | * The values are positional: they must stay in UText member declaration order. | |
1589 | * @stable ICU 3.6 | |
1590 | */ | |
1591 | #define UTEXT_INITIALIZER { \ | |
1592 | UTEXT_MAGIC, /* magic */ \ | |
1593 | 0, /* flags */ \ | |
1594 | 0, /* providerProps */ \ | |
1595 | sizeof(UText), /* sizeOfStruct */ \ | |
1596 | 0, /* chunkNativeLimit */ \ | |
1597 | 0, /* extraSize */ \ | |
1598 | 0, /* nativeIndexingLimit */ \ | |
1599 | 0, /* chunkNativeStart */ \ | |
1600 | 0, /* chunkOffset */ \ | |
1601 | 0, /* chunkLength */ \ | |
1602 | NULL, /* chunkContents */ \ | |
1603 | NULL, /* pFuncs */ \ | |
1604 | NULL, /* pExtra */ \ | |
1605 | NULL, /* context */ \ | |
1606 | NULL, NULL, NULL, /* p, q, r */ \ | |
1607 | NULL, /* privP */ \ | |
1608 | 0, 0, 0, /* a, b, c */ \ | |
1609 | 0, 0, 0 /* privA, privB, privC */ \ | |
1610 | } | |
1611 | ||
1612 | ||
1613 | U_CDECL_END | |
1614 | ||
1615 | ||
1616 | ||
1617 | #endif |