1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2.h
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
24 * \brief C++ API: New API for Unicode Normalization.
27 #include "unicode/utypes.h"
29 #if !UCONFIG_NO_NORMALIZATION
31 #include "unicode/stringpiece.h"
32 #include "unicode/uniset.h"
33 #include "unicode/unistr.h"
34 #include "unicode/unorm2.h"
36 #if U_SHOW_CPLUSPLUS_API
42 * Unicode normalization functionality for standard Unicode normalization or
43 * for using custom mapping tables.
44 * All instances of this class are unmodifiable/immutable.
45 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
46 * The Normalizer2 class is not intended for public subclassing.
48 * The primary functions are to produce a normalized string and to detect whether
49 * a string is already normalized.
50 * The most commonly used normalization forms are those defined in
51 * Unicode Standard Annex #15, Unicode Normalization Forms: https://www.unicode.org/reports/tr15/
52 * However, this API supports additional normalization forms for specialized purposes.
53 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
54 * and can be used in implementations of UTS #46.
56 * Not only are the standard compose and decompose modes supplied,
57 * but additional modes are provided as documented in the Mode enum.
59 * Some of the functions in this class identify normalization boundaries.
60 * At a normalization boundary, the portions of the string
61 * before it and starting from it do not interact and can be handled independently.
63 * The spanQuickCheckYes() stops at a normalization boundary.
64 * When the goal is a normalized string, then the text before the boundary
65 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
68 * a character is guaranteed to be at a normalization boundary,
69 * regardless of context.
70 * This is used for moving from one normalization boundary to the next
71 * or preceding boundary, and for performing iterative normalization.
73 * Iterative normalization is useful when only a small portion of a
74 * longer string needs to be processed.
75 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
76 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
77 * (to process only the substring for which sort key bytes are computed).
79 * The set of normalization boundaries returned by these functions may not be
80 * complete: There may be more boundaries that could be returned.
81 * Different functions may return different boundaries.
84 class U_COMMON_API Normalizer2
: public UObject
{
93 * Returns a Normalizer2 instance for Unicode NFC normalization.
94 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
95 * Returns an unmodifiable singleton instance. Do not delete it.
96 * @param errorCode Standard ICU error code. Its input value must
97 * pass the U_SUCCESS() test, or else the function returns
98 * immediately. Check for U_FAILURE() on output or use with
99 * function chaining. (See User Guide for details.)
100 * @return the requested Normalizer2, if successful
103 static const Normalizer2
*
104 getNFCInstance(UErrorCode
&errorCode
);
107 * Returns a Normalizer2 instance for Unicode NFD normalization.
108 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
109 * Returns an unmodifiable singleton instance. Do not delete it.
110 * @param errorCode Standard ICU error code. Its input value must
111 * pass the U_SUCCESS() test, or else the function returns
112 * immediately. Check for U_FAILURE() on output or use with
113 * function chaining. (See User Guide for details.)
114 * @return the requested Normalizer2, if successful
117 static const Normalizer2
*
118 getNFDInstance(UErrorCode
&errorCode
);
121 * Returns a Normalizer2 instance for Unicode NFKC normalization.
122 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
123 * Returns an unmodifiable singleton instance. Do not delete it.
124 * @param errorCode Standard ICU error code. Its input value must
125 * pass the U_SUCCESS() test, or else the function returns
126 * immediately. Check for U_FAILURE() on output or use with
127 * function chaining. (See User Guide for details.)
128 * @return the requested Normalizer2, if successful
131 static const Normalizer2
*
132 getNFKCInstance(UErrorCode
&errorCode
);
135 * Returns a Normalizer2 instance for Unicode NFKD normalization.
136 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
137 * Returns an unmodifiable singleton instance. Do not delete it.
138 * @param errorCode Standard ICU error code. Its input value must
139 * pass the U_SUCCESS() test, or else the function returns
140 * immediately. Check for U_FAILURE() on output or use with
141 * function chaining. (See User Guide for details.)
142 * @return the requested Normalizer2, if successful
145 static const Normalizer2
*
146 getNFKDInstance(UErrorCode
&errorCode
);
149 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
150 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
151 * Returns an unmodifiable singleton instance. Do not delete it.
152 * @param errorCode Standard ICU error code. Its input value must
153 * pass the U_SUCCESS() test, or else the function returns
154 * immediately. Check for U_FAILURE() on output or use with
155 * function chaining. (See User Guide for details.)
156 * @return the requested Normalizer2, if successful
159 static const Normalizer2
*
160 getNFKCCasefoldInstance(UErrorCode
&errorCode
);
163 * Returns a Normalizer2 instance which uses the specified data file
164 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
165 * and which composes or decomposes text according to the specified mode.
166 * Returns an unmodifiable singleton instance. Do not delete it.
168 * Use packageName=NULL for data files that are part of ICU's own data.
169 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
170 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
171 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
173 * @param packageName NULL for ICU built-in data, otherwise application data package name
174 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
175 * @param mode normalization mode (compose or decompose etc.)
176 * @param errorCode Standard ICU error code. Its input value must
177 * pass the U_SUCCESS() test, or else the function returns
178 * immediately. Check for U_FAILURE() on output or use with
179 * function chaining. (See User Guide for details.)
180 * @return the requested Normalizer2, if successful
183 static const Normalizer2
*
184 getInstance(const char *packageName
,
186 UNormalization2Mode mode
,
187 UErrorCode
&errorCode
);
190 * Returns the normalized form of the source string.
191 * @param src source string
192 * @param errorCode Standard ICU error code. Its input value must
193 * pass the U_SUCCESS() test, or else the function returns
194 * immediately. Check for U_FAILURE() on output or use with
195 * function chaining. (See User Guide for details.)
196 * @return normalized src
200 normalize(const UnicodeString
&src
, UErrorCode
&errorCode
) const {
201 UnicodeString result
;
202 normalize(src
, result
, errorCode
);
206 * Writes the normalized form of the source string to the destination string
207 * (replacing its contents) and returns the destination string.
208 * The source and destination strings must be different objects.
209 * @param src source string
210 * @param dest destination string; its contents are replaced with normalized src
211 * @param errorCode Standard ICU error code. Its input value must
212 * pass the U_SUCCESS() test, or else the function returns
213 * immediately. Check for U_FAILURE() on output or use with
214 * function chaining. (See User Guide for details.)
218 virtual UnicodeString
&
219 normalize(const UnicodeString
&src
,
221 UErrorCode
&errorCode
) const = 0;
224 * Normalizes a UTF-8 string and optionally records how source substrings
225 * relate to changed and unchanged result substrings.
227 * Currently implemented completely only for "compose" modes,
228 * such as for NFC, NFKC, and NFKC_Casefold
229 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
230 * Otherwise currently converts to & from UTF-16 and does not support edits.
232 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
233 * @param src Source UTF-8 string.
234 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
235 * sink.Flush() is called at the end.
236 * @param edits Records edits for index mapping, working with styled text,
237 * and getting only changes (if any).
238 * The contents of the Edits object are undefined if any error occurs.
239 * This function calls edits->reset() first unless
240 * options includes U_EDITS_NO_RESET. edits can be nullptr.
241 * @param errorCode Standard ICU error code. Its input value must
242 * pass the U_SUCCESS() test, or else the function returns
243 * immediately. Check for U_FAILURE() on output or use with
244 * function chaining. (See User Guide for details.)
248 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
249 Edits
*edits
, UErrorCode
&errorCode
) const;
252 * Appends the normalized form of the second string to the first string
253 * (merging them at the boundary) and returns the first string.
254 * The result is normalized if the first string was normalized.
255 * The first and second strings must be different objects.
256 * @param first string, should be normalized
257 * @param second string, will be normalized
258 * @param errorCode Standard ICU error code. Its input value must
259 * pass the U_SUCCESS() test, or else the function returns
260 * immediately. Check for U_FAILURE() on output or use with
261 * function chaining. (See User Guide for details.)
265 virtual UnicodeString
&
266 normalizeSecondAndAppend(UnicodeString
&first
,
267 const UnicodeString
&second
,
268 UErrorCode
&errorCode
) const = 0;
270 * Appends the second string to the first string
271 * (merging them at the boundary) and returns the first string.
272 * The result is normalized if both the strings were normalized.
273 * The first and second strings must be different objects.
274 * @param first string, should be normalized
275 * @param second string, should be normalized
276 * @param errorCode Standard ICU error code. Its input value must
277 * pass the U_SUCCESS() test, or else the function returns
278 * immediately. Check for U_FAILURE() on output or use with
279 * function chaining. (See User Guide for details.)
283 virtual UnicodeString
&
284 append(UnicodeString
&first
,
285 const UnicodeString
&second
,
286 UErrorCode
&errorCode
) const = 0;
289 * Gets the decomposition mapping of c.
290 * Roughly equivalent to normalizing the String form of c
291 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
292 * Unlike normalization, this function returns FALSE and does not write a string
293 * if c does not have a decomposition mapping in this instance's data.
294 * This function is independent of the mode of the Normalizer2.
295 * @param c code point
296 * @param decomposition String object which will be set to c's
297 * decomposition mapping, if there is one.
298 * @return TRUE if c has a decomposition, otherwise FALSE
302 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const = 0;
305 * Gets the raw decomposition mapping of c.
307 * This is similar to the getDecomposition() method but returns the
308 * raw decomposition mapping as specified in UnicodeData.txt or
309 * (for custom data) in the mapping files processed by the gennorm2 tool.
310 * By contrast, getDecomposition() returns the processed,
311 * recursively-decomposed version of this mapping.
313 * When used on a standard NFKC Normalizer2 instance,
314 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
316 * When used on a standard NFC Normalizer2 instance,
317 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
318 * in this case, the result contains either one or two code points (=1..4 char16_ts).
320 * This function is independent of the mode of the Normalizer2.
321 * The default implementation returns FALSE.
322 * @param c code point
323 * @param decomposition String object which will be set to c's
324 * raw decomposition mapping, if there is one.
325 * @return TRUE if c has a decomposition, otherwise FALSE
329 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const;
332 * Performs pairwise composition of a & b and returns the composite if there is one.
334 * Returns a composite code point c only if c has a two-way mapping to a+b.
335 * In standard Unicode normalization, this means that
336 * c has a canonical decomposition to a+b
337 * and c does not have the Full_Composition_Exclusion property.
339 * This function is independent of the mode of the Normalizer2.
340 * The default implementation returns a negative value.
341 * @param a A (normalization starter) code point.
342 * @param b Another code point.
343 * @return The non-negative composite code point if there is one; otherwise a negative value.
347 composePair(UChar32 a
, UChar32 b
) const;
350 * Gets the combining class of c.
351 * The default implementation returns 0
352 * but all standard implementations return the Unicode Canonical_Combining_Class value.
353 * @param c code point
354 * @return c's combining class
358 getCombiningClass(UChar32 c
) const;
361 * Tests if the string is normalized.
362 * Internally, in cases where the quickCheck() method would return "maybe"
363 * (which is only possible for the two COMPOSE modes) this method
364 * resolves to "yes" or "no" to provide a definitive result,
365 * at the cost of doing more work in those cases.
366 * @param s input string
367 * @param errorCode Standard ICU error code. Its input value must
368 * pass the U_SUCCESS() test, or else the function returns
369 * immediately. Check for U_FAILURE() on output or use with
370 * function chaining. (See User Guide for details.)
371 * @return TRUE if s is normalized
375 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
377 * Tests if the UTF-8 string is normalized.
378 * Internally, in cases where the quickCheck() method would return "maybe"
379 * (which is only possible for the two COMPOSE modes) this method
380 * resolves to "yes" or "no" to provide a definitive result,
381 * at the cost of doing more work in those cases.
383 * This works for all normalization modes,
384 * but it is currently optimized for UTF-8 only for "compose" modes,
385 * such as for NFC, NFKC, and NFKC_Casefold
386 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
387 * For other modes it currently converts to UTF-16 and calls isNormalized().
389 * @param s UTF-8 input string
390 * @param errorCode Standard ICU error code. Its input value must
391 * pass the U_SUCCESS() test, or else the function returns
392 * immediately. Check for U_FAILURE() on output or use with
393 * function chaining. (See User Guide for details.)
394 * @return TRUE if s is normalized
398 isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const;
402 * Tests if the string is normalized.
403 * For the two COMPOSE modes, the result could be "maybe" in cases that
404 * would take a little more work to resolve definitively.
405 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
406 * combination of quick check + normalization, to avoid
407 * re-checking the "yes" prefix.
408 * @param s input string
409 * @param errorCode Standard ICU error code. Its input value must
410 * pass the U_SUCCESS() test, or else the function returns
411 * immediately. Check for U_FAILURE() on output or use with
412 * function chaining. (See User Guide for details.)
413 * @return UNormalizationCheckResult
416 virtual UNormalizationCheckResult
417 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
420 * Returns the end of the normalized substring of the input string.
421 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
422 * the substring <code>UnicodeString(s, 0, end)</code>
423 * will pass the quick check with a "yes" result.
425 * The returned end index is usually one or more characters before the
426 * "no" or "maybe" character: The end index is at a normalization boundary.
427 * (See the class documentation for more about normalization boundaries.)
429 * When the goal is a normalized string and most input strings are expected
430 * to be normalized already, then call this method,
431 * and if it returns a prefix shorter than the input string,
432 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
433 * @param s input string
434 * @param errorCode Standard ICU error code. Its input value must
435 * pass the U_SUCCESS() test, or else the function returns
436 * immediately. Check for U_FAILURE() on output or use with
437 * function chaining. (See User Guide for details.)
438 * @return "yes" span end index
442 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
445 * Tests if the character always has a normalization boundary before it,
446 * regardless of context.
447 * If true, then the character does not normalization-interact with
448 * preceding characters.
449 * In other words, a string containing this character can be normalized
450 * by processing portions before this character and starting from this
451 * character independently.
452 * This is used for iterative normalization. See the class documentation for details.
453 * @param c character to test
454 * @return TRUE if c has a normalization boundary before it
457 virtual UBool
hasBoundaryBefore(UChar32 c
) const = 0;
460 * Tests if the character always has a normalization boundary after it,
461 * regardless of context.
462 * If true, then the character does not normalization-interact with
463 * following characters.
464 * In other words, a string containing this character can be normalized
465 * by processing portions up to this character and after this
466 * character independently.
467 * This is used for iterative normalization. See the class documentation for details.
468 * Note that this operation may be significantly slower than hasBoundaryBefore().
469 * @param c character to test
470 * @return TRUE if c has a normalization boundary after it
473 virtual UBool
hasBoundaryAfter(UChar32 c
) const = 0;
476 * Tests if the character is normalization-inert.
477 * If true, then the character does not change, nor normalization-interact with
478 * preceding or following characters.
479 * In other words, a string containing this character can be normalized
480 * by processing portions before this character and after this
481 * character independently.
482 * This is used for iterative normalization. See the class documentation for details.
483 * Note that this operation may be significantly slower than hasBoundaryBefore().
484 * @param c character to test
485 * @return TRUE if c is normalization-inert
488 virtual UBool
isInert(UChar32 c
) const = 0;
492 * Normalization filtered by a UnicodeSet.
493 * Normalizes portions of the text contained in the filter set and leaves
494 * portions not contained in the filter set unchanged.
495 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
496 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
497 * This class implements all of (and only) the Normalizer2 API.
498 * An instance of this class is unmodifiable/immutable but is constructed and
499 * must be destructed by the owner.
502 class U_COMMON_API FilteredNormalizer2
: public Normalizer2
{
505 * Constructs a filtered normalizer wrapping any Normalizer2 instance
507 * Both are aliased and must not be modified or deleted while this object
509 * The filter set should be frozen; otherwise the performance will suffer greatly.
510 * @param n2 wrapped Normalizer2 instance
511 * @param filterSet UnicodeSet which determines the characters to be normalized
514 FilteredNormalizer2(const Normalizer2
&n2
, const UnicodeSet
&filterSet
) :
515 norm2(n2
), set(filterSet
) {}
521 ~FilteredNormalizer2();
524 * Writes the normalized form of the source string to the destination string
525 * (replacing its contents) and returns the destination string.
526 * The source and destination strings must be different objects.
527 * @param src source string
528 * @param dest destination string; its contents are replaced with normalized src
529 * @param errorCode Standard ICU error code. Its input value must
530 * pass the U_SUCCESS() test, or else the function returns
531 * immediately. Check for U_FAILURE() on output or use with
532 * function chaining. (See User Guide for details.)
536 virtual UnicodeString
&
537 normalize(const UnicodeString
&src
,
539 UErrorCode
&errorCode
) const U_OVERRIDE
;
542 * Normalizes a UTF-8 string and optionally records how source substrings
543 * relate to changed and unchanged result substrings.
545 * Currently implemented completely only for "compose" modes,
546 * such as for NFC, NFKC, and NFKC_Casefold
547 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
548 * Otherwise currently converts to & from UTF-16 and does not support edits.
550 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
551 * @param src Source UTF-8 string.
552 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
553 * sink.Flush() is called at the end.
554 * @param edits Records edits for index mapping, working with styled text,
555 * and getting only changes (if any).
556 * The contents of the Edits object are undefined if any error occurs.
557 * This function calls edits->reset() first unless
558 * options includes U_EDITS_NO_RESET. edits can be nullptr.
559 * @param errorCode Standard ICU error code. Its input value must
560 * pass the U_SUCCESS() test, or else the function returns
561 * immediately. Check for U_FAILURE() on output or use with
562 * function chaining. (See User Guide for details.)
566 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
567 Edits
*edits
, UErrorCode
&errorCode
) const U_OVERRIDE
;
570 * Appends the normalized form of the second string to the first string
571 * (merging them at the boundary) and returns the first string.
572 * The result is normalized if the first string was normalized.
573 * The first and second strings must be different objects.
574 * @param first string, should be normalized
575 * @param second string, will be normalized
576 * @param errorCode Standard ICU error code. Its input value must
577 * pass the U_SUCCESS() test, or else the function returns
578 * immediately. Check for U_FAILURE() on output or use with
579 * function chaining. (See User Guide for details.)
583 virtual UnicodeString
&
584 normalizeSecondAndAppend(UnicodeString
&first
,
585 const UnicodeString
&second
,
586 UErrorCode
&errorCode
) const U_OVERRIDE
;
588 * Appends the second string to the first string
589 * (merging them at the boundary) and returns the first string.
590 * The result is normalized if both the strings were normalized.
591 * The first and second strings must be different objects.
592 * @param first string, should be normalized
593 * @param second string, should be normalized
594 * @param errorCode Standard ICU error code. Its input value must
595 * pass the U_SUCCESS() test, or else the function returns
596 * immediately. Check for U_FAILURE() on output or use with
597 * function chaining. (See User Guide for details.)
601 virtual UnicodeString
&
602 append(UnicodeString
&first
,
603 const UnicodeString
&second
,
604 UErrorCode
&errorCode
) const U_OVERRIDE
;
607 * Gets the decomposition mapping of c.
608 * For details see the base class documentation.
610 * This function is independent of the mode of the Normalizer2.
611 * @param c code point
612 * @param decomposition String object which will be set to c's
613 * decomposition mapping, if there is one.
614 * @return TRUE if c has a decomposition, otherwise FALSE
618 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const U_OVERRIDE
;
621 * Gets the raw decomposition mapping of c.
622 * For details see the base class documentation.
624 * This function is independent of the mode of the Normalizer2.
625 * @param c code point
626 * @param decomposition String object which will be set to c's
627 * raw decomposition mapping, if there is one.
628 * @return TRUE if c has a decomposition, otherwise FALSE
632 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const U_OVERRIDE
;
635 * Performs pairwise composition of a & b and returns the composite if there is one.
636 * For details see the base class documentation.
638 * This function is independent of the mode of the Normalizer2.
639 * @param a A (normalization starter) code point.
640 * @param b Another code point.
641 * @return The non-negative composite code point if there is one; otherwise a negative value.
645 composePair(UChar32 a
, UChar32 b
) const U_OVERRIDE
;
648 * Gets the combining class of c.
649 * The default implementation returns 0
650 * but all standard implementations return the Unicode Canonical_Combining_Class value.
651 * @param c code point
652 * @return c's combining class
656 getCombiningClass(UChar32 c
) const U_OVERRIDE
;
659 * Tests if the string is normalized.
660 * For details see the Normalizer2 base class documentation.
661 * @param s input string
662 * @param errorCode Standard ICU error code. Its input value must
663 * pass the U_SUCCESS() test, or else the function returns
664 * immediately. Check for U_FAILURE() on output or use with
665 * function chaining. (See User Guide for details.)
666 * @return TRUE if s is normalized
670 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
672 * Tests if the UTF-8 string is normalized.
673 * Internally, in cases where the quickCheck() method would return "maybe"
674 * (which is only possible for the two COMPOSE modes) this method
675 * resolves to "yes" or "no" to provide a definitive result,
676 * at the cost of doing more work in those cases.
678 * This works for all normalization modes,
679 * but it is currently optimized for UTF-8 only for "compose" modes,
680 * such as for NFC, NFKC, and NFKC_Casefold
681 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
682 * For other modes it currently converts to UTF-16 and calls isNormalized().
684 * @param s UTF-8 input string
685 * @param errorCode Standard ICU error code. Its input value must
686 * pass the U_SUCCESS() test, or else the function returns
687 * immediately. Check for U_FAILURE() on output or use with
688 * function chaining. (See User Guide for details.)
689 * @return TRUE if s is normalized
693 isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
695 * Tests if the string is normalized.
696 * For details see the Normalizer2 base class documentation.
697 * @param s input string
698 * @param errorCode Standard ICU error code. Its input value must
699 * pass the U_SUCCESS() test, or else the function returns
700 * immediately. Check for U_FAILURE() on output or use with
701 * function chaining. (See User Guide for details.)
702 * @return UNormalizationCheckResult
705 virtual UNormalizationCheckResult
706 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
708 * Returns the end of the normalized substring of the input string.
709 * For details see the Normalizer2 base class documentation.
710 * @param s input string
711 * @param errorCode Standard ICU error code. Its input value must
712 * pass the U_SUCCESS() test, or else the function returns
713 * immediately. Check for U_FAILURE() on output or use with
714 * function chaining. (See User Guide for details.)
715 * @return "yes" span end index
719 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
722 * Tests if the character always has a normalization boundary before it,
723 * regardless of context.
724 * For details see the Normalizer2 base class documentation.
725 * @param c character to test
726 * @return TRUE if c has a normalization boundary before it
729 virtual UBool
hasBoundaryBefore(UChar32 c
) const U_OVERRIDE
;
732 * Tests if the character always has a normalization boundary after it,
733 * regardless of context.
734 * For details see the Normalizer2 base class documentation.
735 * @param c character to test
736 * @return TRUE if c has a normalization boundary after it
739 virtual UBool
hasBoundaryAfter(UChar32 c
) const U_OVERRIDE
;
742 * Tests if the character is normalization-inert.
743 * For details see the Normalizer2 base class documentation.
744 * @param c character to test
745 * @return TRUE if c is normalization-inert
748 virtual UBool
isInert(UChar32 c
) const U_OVERRIDE
;
751 normalize(const UnicodeString
&src
,
753 USetSpanCondition spanCondition
,
754 UErrorCode
&errorCode
) const;
757 normalizeUTF8(uint32_t options
, const char *src
, int32_t length
,
758 ByteSink
&sink
, Edits
*edits
,
759 USetSpanCondition spanCondition
,
760 UErrorCode
&errorCode
) const;
763 normalizeSecondAndAppend(UnicodeString
&first
,
764 const UnicodeString
&second
,
766 UErrorCode
&errorCode
) const;
768 const Normalizer2
&norm2
;
769 const UnicodeSet
&set
;
773 #endif // U_SHOW_CPLUSPLUS_API
775 #endif // !UCONFIG_NO_NORMALIZATION
776 #endif // __NORMALIZER2_H__