1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2.h
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
24 * \brief C++ API: New API for Unicode Normalization.
27 #include "unicode/utypes.h"
29 #if U_SHOW_CPLUSPLUS_API
31 #if !UCONFIG_NO_NORMALIZATION
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
43 * Unicode normalization functionality for standard Unicode normalization or
44 * for using custom mapping tables.
45 * All instances of this class are unmodifiable/immutable.
46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47 * The Normalizer2 class is not intended for public subclassing.
49 * The primary functions are to produce a normalized string and to detect whether
50 * a string is already normalized.
51 * The most commonly used normalization forms are those defined in
52 * https://www.unicode.org/reports/tr15/
53 * However, this API supports additional normalization forms for specialized purposes.
54 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
55 * and can be used in implementations of UTS #46.
57 * In addition to the standard compose and decompose modes,
58 * further modes are provided as documented in the UNormalization2Mode enum.
60 * Some of the functions in this class identify normalization boundaries.
61 * At a normalization boundary, the portions of the string
62 * before it and starting from it do not interact and can be handled independently.
64 * The spanQuickCheckYes() stops at a normalization boundary.
65 * When the goal is a normalized string, then the text before the boundary
66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69 * a character is guaranteed to be at a normalization boundary,
70 * regardless of context.
71 * This is used for moving from one normalization boundary to the next
72 * or preceding boundary, and for performing iterative normalization.
74 * Iterative normalization is useful when only a small portion of a
75 * longer string needs to be processed.
76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78 * (to process only the substring for which sort key bytes are computed).
80 * The set of normalization boundaries returned by these functions may not be
81 * complete: There may be more boundaries that could be returned.
82 * Different functions may return different boundaries.
85 class U_COMMON_API Normalizer2
: public UObject
{
94 * Returns a Normalizer2 instance for Unicode NFC normalization.
95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
96 * Returns an unmodifiable singleton instance. Do not delete it.
97 * @param errorCode Standard ICU error code. Its input value must
98 * pass the U_SUCCESS() test, or else the function returns
99 * immediately. Check for U_FAILURE() on output or use with
100 * function chaining. (See User Guide for details.)
101 * @return the requested Normalizer2, if successful
104 static const Normalizer2
*
105 getNFCInstance(UErrorCode
&errorCode
);
108 * Returns a Normalizer2 instance for Unicode NFD normalization.
109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
110 * Returns an unmodifiable singleton instance. Do not delete it.
111 * @param errorCode Standard ICU error code. Its input value must
112 * pass the U_SUCCESS() test, or else the function returns
113 * immediately. Check for U_FAILURE() on output or use with
114 * function chaining. (See User Guide for details.)
115 * @return the requested Normalizer2, if successful
118 static const Normalizer2
*
119 getNFDInstance(UErrorCode
&errorCode
);
122 * Returns a Normalizer2 instance for Unicode NFKC normalization.
123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
124 * Returns an unmodifiable singleton instance. Do not delete it.
125 * @param errorCode Standard ICU error code. Its input value must
126 * pass the U_SUCCESS() test, or else the function returns
127 * immediately. Check for U_FAILURE() on output or use with
128 * function chaining. (See User Guide for details.)
129 * @return the requested Normalizer2, if successful
132 static const Normalizer2
*
133 getNFKCInstance(UErrorCode
&errorCode
);
136 * Returns a Normalizer2 instance for Unicode NFKD normalization.
137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
138 * Returns an unmodifiable singleton instance. Do not delete it.
139 * @param errorCode Standard ICU error code. Its input value must
140 * pass the U_SUCCESS() test, or else the function returns
141 * immediately. Check for U_FAILURE() on output or use with
142 * function chaining. (See User Guide for details.)
143 * @return the requested Normalizer2, if successful
146 static const Normalizer2
*
147 getNFKDInstance(UErrorCode
&errorCode
);
150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
152 * Returns an unmodifiable singleton instance. Do not delete it.
153 * @param errorCode Standard ICU error code. Its input value must
154 * pass the U_SUCCESS() test, or else the function returns
155 * immediately. Check for U_FAILURE() on output or use with
156 * function chaining. (See User Guide for details.)
157 * @return the requested Normalizer2, if successful
160 static const Normalizer2
*
161 getNFKCCasefoldInstance(UErrorCode
&errorCode
);
164 * Returns a Normalizer2 instance which uses the specified data file
165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
166 * and which composes or decomposes text according to the specified mode.
167 * Returns an unmodifiable singleton instance. Do not delete it.
169 * Use packageName=NULL for data files that are part of ICU's own data.
170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
174 * @param packageName NULL for ICU built-in data, otherwise application data package name
175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
176 * @param mode normalization mode (compose or decompose etc.)
177 * @param errorCode Standard ICU error code. Its input value must
178 * pass the U_SUCCESS() test, or else the function returns
179 * immediately. Check for U_FAILURE() on output or use with
180 * function chaining. (See User Guide for details.)
181 * @return the requested Normalizer2, if successful
184 static const Normalizer2
*
185 getInstance(const char *packageName
,
187 UNormalization2Mode mode
,
188 UErrorCode
&errorCode
);
191 * Returns the normalized form of the source string.
192 * @param src source string
193 * @param errorCode Standard ICU error code. Its input value must
194 * pass the U_SUCCESS() test, or else the function returns
195 * immediately. Check for U_FAILURE() on output or use with
196 * function chaining. (See User Guide for details.)
197 * @return normalized src
201 normalize(const UnicodeString
&src
, UErrorCode
&errorCode
) const {
202 UnicodeString result
;
203 normalize(src
, result
, errorCode
);
207 * Writes the normalized form of the source string to the destination string
208 * (replacing its contents) and returns the destination string.
209 * The source and destination strings must be different objects.
210 * @param src source string
211 * @param dest destination string; its contents are replaced with normalized src
212 * @param errorCode Standard ICU error code. Its input value must
213 * pass the U_SUCCESS() test, or else the function returns
214 * immediately. Check for U_FAILURE() on output or use with
215 * function chaining. (See User Guide for details.)
219 virtual UnicodeString
&
220 normalize(const UnicodeString
&src
,
222 UErrorCode
&errorCode
) const = 0;
225 * Normalizes a UTF-8 string and optionally records how source substrings
226 * relate to changed and unchanged result substrings.
228 * Currently implemented completely only for "compose" modes,
229 * such as for NFC, NFKC, and NFKC_Casefold
230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
231 * Otherwise currently converts to & from UTF-16 and does not support edits.
233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
234 * @param src Source UTF-8 string.
235 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
236 * sink.Flush() is called at the end.
237 * @param edits Records edits for index mapping, working with styled text,
238 * and getting only changes (if any).
239 * The contents of the Edits object are undefined if any error occurs.
240 * This function calls edits->reset() first unless
241 * options includes U_EDITS_NO_RESET. edits can be nullptr.
242 * @param errorCode Standard ICU error code. Its input value must
243 * pass the U_SUCCESS() test, or else the function returns
244 * immediately. Check for U_FAILURE() on output or use with
245 * function chaining. (See User Guide for details.)
249 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
250 Edits
*edits
, UErrorCode
&errorCode
) const;
253 * Appends the normalized form of the second string to the first string
254 * (merging them at the boundary) and returns the first string.
255 * The result is normalized if the first string was normalized.
256 * The first and second strings must be different objects.
257 * @param first string, should be normalized
258 * @param second string, will be normalized
259 * @param errorCode Standard ICU error code. Its input value must
260 * pass the U_SUCCESS() test, or else the function returns
261 * immediately. Check for U_FAILURE() on output or use with
262 * function chaining. (See User Guide for details.)
266 virtual UnicodeString
&
267 normalizeSecondAndAppend(UnicodeString
&first
,
268 const UnicodeString
&second
,
269 UErrorCode
&errorCode
) const = 0;
271 * Appends the second string to the first string
272 * (merging them at the boundary) and returns the first string.
273 * The result is normalized if both the strings were normalized.
274 * The first and second strings must be different objects.
275 * @param first string, should be normalized
276 * @param second string, should be normalized
277 * @param errorCode Standard ICU error code. Its input value must
278 * pass the U_SUCCESS() test, or else the function returns
279 * immediately. Check for U_FAILURE() on output or use with
280 * function chaining. (See User Guide for details.)
284 virtual UnicodeString
&
285 append(UnicodeString
&first
,
286 const UnicodeString
&second
,
287 UErrorCode
&errorCode
) const = 0;
290 * Gets the decomposition mapping of c.
291 * Roughly equivalent to normalizing the String form of c
292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalization,
293 * this function returns FALSE and does not write a string
294 * if c does not have a decomposition mapping in this instance's data.
295 * This function is independent of the mode of the Normalizer2.
296 * @param c code point
297 * @param decomposition String object which will be set to c's
298 * decomposition mapping, if there is one.
299 * @return TRUE if c has a decomposition, otherwise FALSE
303 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const = 0;
306 * Gets the raw decomposition mapping of c.
308 * This is similar to the getDecomposition() method but returns the
309 * raw decomposition mapping as specified in UnicodeData.txt or
310 * (for custom data) in the mapping files processed by the gennorm2 tool.
311 * By contrast, getDecomposition() returns the processed,
312 * recursively-decomposed version of this mapping.
314 * When used on a standard NFKC Normalizer2 instance,
315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
317 * When used on a standard NFC Normalizer2 instance,
318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
319 * in this case, the result contains either one or two code points (=1..4 char16_ts).
321 * This function is independent of the mode of the Normalizer2.
322 * The default implementation returns FALSE.
323 * @param c code point
324 * @param decomposition String object which will be set to c's
325 * raw decomposition mapping, if there is one.
326 * @return TRUE if c has a decomposition, otherwise FALSE
330 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const;
333 * Performs pairwise composition of a & b and returns the composite if there is one.
335 * Returns a composite code point c only if c has a two-way mapping to a+b.
336 * In standard Unicode normalization, this means that
337 * c has a canonical decomposition to a+b
338 * and c does not have the Full_Composition_Exclusion property.
340 * This function is independent of the mode of the Normalizer2.
341 * The default implementation returns a negative value.
342 * @param a A (normalization starter) code point.
343 * @param b Another code point.
344 * @return The non-negative composite code point if there is one; otherwise a negative value.
348 composePair(UChar32 a
, UChar32 b
) const;
351 * Gets the combining class of c.
352 * The default implementation returns 0
353 * but all standard implementations return the Unicode Canonical_Combining_Class value.
354 * @param c code point
355 * @return c's combining class
359 getCombiningClass(UChar32 c
) const;
362 * Tests if the string is normalized.
363 * Internally, in cases where the quickCheck() method would return "maybe"
364 * (which is only possible for the two COMPOSE modes) this method
365 * resolves to "yes" or "no" to provide a definitive result,
366 * at the cost of doing more work in those cases.
367 * @param s input string
368 * @param errorCode Standard ICU error code. Its input value must
369 * pass the U_SUCCESS() test, or else the function returns
370 * immediately. Check for U_FAILURE() on output or use with
371 * function chaining. (See User Guide for details.)
372 * @return TRUE if s is normalized
376 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
378 * Tests if the UTF-8 string is normalized.
379 * Internally, in cases where the quickCheck() method would return "maybe"
380 * (which is only possible for the two COMPOSE modes) this method
381 * resolves to "yes" or "no" to provide a definitive result,
382 * at the cost of doing more work in those cases.
384 * This works for all normalization modes,
385 * but it is currently optimized for UTF-8 only for "compose" modes,
386 * such as for NFC, NFKC, and NFKC_Casefold
387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
388 * For other modes it currently converts to UTF-16 and calls isNormalized().
390 * @param s UTF-8 input string
391 * @param errorCode Standard ICU error code. Its input value must
392 * pass the U_SUCCESS() test, or else the function returns
393 * immediately. Check for U_FAILURE() on output or use with
394 * function chaining. (See User Guide for details.)
395 * @return TRUE if s is normalized
399 isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const;
403 * Tests if the string is normalized.
404 * For the two COMPOSE modes, the result could be "maybe" in cases that
405 * would take a little more work to resolve definitively.
406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
407 * combination of quick check + normalization, to avoid
408 * re-checking the "yes" prefix.
409 * @param s input string
410 * @param errorCode Standard ICU error code. Its input value must
411 * pass the U_SUCCESS() test, or else the function returns
412 * immediately. Check for U_FAILURE() on output or use with
413 * function chaining. (See User Guide for details.)
414 * @return UNormalizationCheckResult
417 virtual UNormalizationCheckResult
418 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
421 * Returns the end of the normalized substring of the input string.
422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
423 * the substring <code>UnicodeString(s, 0, end)</code>
424 * will pass the quick check with a "yes" result.
426 * The returned end index is usually one or more characters before the
427 * "no" or "maybe" character: The end index is at a normalization boundary.
428 * (See the class documentation for more about normalization boundaries.)
430 * When the goal is a normalized string and most input strings are expected
431 * to be normalized already, then call this method,
432 * and if it returns a prefix shorter than the input string,
433 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
434 * @param s input string
435 * @param errorCode Standard ICU error code. Its input value must
436 * pass the U_SUCCESS() test, or else the function returns
437 * immediately. Check for U_FAILURE() on output or use with
438 * function chaining. (See User Guide for details.)
439 * @return "yes" span end index
443 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
446 * Tests if the character always has a normalization boundary before it,
447 * regardless of context.
448 * If true, then the character does not normalization-interact with
449 * preceding characters.
450 * In other words, a string containing this character can be normalized
451 * by processing portions before this character and starting from this
452 * character independently.
453 * This is used for iterative normalization. See the class documentation for details.
454 * @param c character to test
455 * @return TRUE if c has a normalization boundary before it
458 virtual UBool
hasBoundaryBefore(UChar32 c
) const = 0;
461 * Tests if the character always has a normalization boundary after it,
462 * regardless of context.
463 * If true, then the character does not normalization-interact with
464 * following characters.
465 * In other words, a string containing this character can be normalized
466 * by processing portions up to this character and after this
467 * character independently.
468 * This is used for iterative normalization. See the class documentation for details.
469 * Note that this operation may be significantly slower than hasBoundaryBefore().
470 * @param c character to test
471 * @return TRUE if c has a normalization boundary after it
474 virtual UBool
hasBoundaryAfter(UChar32 c
) const = 0;
477 * Tests if the character is normalization-inert.
478 * If true, then the character does not change, nor normalization-interact with
479 * preceding or following characters.
480 * In other words, a string containing this character can be normalized
481 * by processing portions before this character and after this
482 * character independently.
483 * This is used for iterative normalization. See the class documentation for details.
484 * Note that this operation may be significantly slower than hasBoundaryBefore().
485 * @param c character to test
486 * @return TRUE if c is normalization-inert
489 virtual UBool
isInert(UChar32 c
) const = 0;
493 * Normalization filtered by a UnicodeSet.
494 * Normalizes portions of the text contained in the filter set and leaves
495 * portions not contained in the filter set unchanged.
496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
498 * This class implements all of (and only) the Normalizer2 API.
499 * An instance of this class is unmodifiable/immutable but is constructed and
500 * must be destructed by the owner.
503 class U_COMMON_API FilteredNormalizer2
: public Normalizer2
{
506 * Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
508 * Both are aliased and must not be modified or deleted while this object is in use.
510 * The filter set should be frozen; otherwise the performance will suffer greatly.
511 * @param n2 wrapped Normalizer2 instance
512 * @param filterSet UnicodeSet which determines the characters to be normalized
// Binds the wrapped normalizer and the filter set to the reference members
// norm2 and set; neither argument is copied, so both must outlive this object.
515 FilteredNormalizer2(const Normalizer2
&n2
, const UnicodeSet
&filterSet
) :
516 norm2(n2
), set(filterSet
) {}
522 ~FilteredNormalizer2();
525 * Writes the normalized form of the source string to the destination string
526 * (replacing its contents) and returns the destination string.
527 * The source and destination strings must be different objects.
528 * @param src source string
529 * @param dest destination string; its contents are replaced with normalized src
530 * @param errorCode Standard ICU error code. Its input value must
531 * pass the U_SUCCESS() test, or else the function returns
532 * immediately. Check for U_FAILURE() on output or use with
533 * function chaining. (See User Guide for details.)
537 virtual UnicodeString
&
538 normalize(const UnicodeString
&src
,
540 UErrorCode
&errorCode
) const U_OVERRIDE
;
543 * Normalizes a UTF-8 string and optionally records how source substrings
544 * relate to changed and unchanged result substrings.
546 * Currently implemented completely only for "compose" modes,
547 * such as for NFC, NFKC, and NFKC_Casefold
548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
549 * Otherwise currently converts to & from UTF-16 and does not support edits.
551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
552 * @param src Source UTF-8 string.
553 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
554 * sink.Flush() is called at the end.
555 * @param edits Records edits for index mapping, working with styled text,
556 * and getting only changes (if any).
557 * The contents of the Edits object are undefined if any error occurs.
558 * This function calls edits->reset() first unless
559 * options includes U_EDITS_NO_RESET. edits can be nullptr.
560 * @param errorCode Standard ICU error code. Its input value must
561 * pass the U_SUCCESS() test, or else the function returns
562 * immediately. Check for U_FAILURE() on output or use with
563 * function chaining. (See User Guide for details.)
567 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
568 Edits
*edits
, UErrorCode
&errorCode
) const U_OVERRIDE
;
571 * Appends the normalized form of the second string to the first string
572 * (merging them at the boundary) and returns the first string.
573 * The result is normalized if the first string was normalized.
574 * The first and second strings must be different objects.
575 * @param first string, should be normalized
576 * @param second string, will be normalized
577 * @param errorCode Standard ICU error code. Its input value must
578 * pass the U_SUCCESS() test, or else the function returns
579 * immediately. Check for U_FAILURE() on output or use with
580 * function chaining. (See User Guide for details.)
584 virtual UnicodeString
&
585 normalizeSecondAndAppend(UnicodeString
&first
,
586 const UnicodeString
&second
,
587 UErrorCode
&errorCode
) const U_OVERRIDE
;
589 * Appends the second string to the first string
590 * (merging them at the boundary) and returns the first string.
591 * The result is normalized if both the strings were normalized.
592 * The first and second strings must be different objects.
593 * @param first string, should be normalized
594 * @param second string, should be normalized
595 * @param errorCode Standard ICU error code. Its input value must
596 * pass the U_SUCCESS() test, or else the function returns
597 * immediately. Check for U_FAILURE() on output or use with
598 * function chaining. (See User Guide for details.)
602 virtual UnicodeString
&
603 append(UnicodeString
&first
,
604 const UnicodeString
&second
,
605 UErrorCode
&errorCode
) const U_OVERRIDE
;
608 * Gets the decomposition mapping of c.
609 * For details see the base class documentation.
611 * This function is independent of the mode of the Normalizer2.
612 * @param c code point
613 * @param decomposition String object which will be set to c's
614 * decomposition mapping, if there is one.
615 * @return TRUE if c has a decomposition, otherwise FALSE
619 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const U_OVERRIDE
;
622 * Gets the raw decomposition mapping of c.
623 * For details see the base class documentation.
625 * This function is independent of the mode of the Normalizer2.
626 * @param c code point
627 * @param decomposition String object which will be set to c's
628 * raw decomposition mapping, if there is one.
629 * @return TRUE if c has a decomposition, otherwise FALSE
633 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const U_OVERRIDE
;
636 * Performs pairwise composition of a & b and returns the composite if there is one.
637 * For details see the base class documentation.
639 * This function is independent of the mode of the Normalizer2.
640 * @param a A (normalization starter) code point.
641 * @param b Another code point.
642 * @return The non-negative composite code point if there is one; otherwise a negative value.
646 composePair(UChar32 a
, UChar32 b
) const U_OVERRIDE
;
649 * Gets the combining class of c.
650 * The default implementation returns 0
651 * but all standard implementations return the Unicode Canonical_Combining_Class value.
652 * @param c code point
653 * @return c's combining class
657 getCombiningClass(UChar32 c
) const U_OVERRIDE
;
660 * Tests if the string is normalized.
661 * For details see the Normalizer2 base class documentation.
662 * @param s input string
663 * @param errorCode Standard ICU error code. Its input value must
664 * pass the U_SUCCESS() test, or else the function returns
665 * immediately. Check for U_FAILURE() on output or use with
666 * function chaining. (See User Guide for details.)
667 * @return TRUE if s is normalized
671 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
673 * Tests if the UTF-8 string is normalized.
674 * Internally, in cases where the quickCheck() method would return "maybe"
675 * (which is only possible for the two COMPOSE modes) this method
676 * resolves to "yes" or "no" to provide a definitive result,
677 * at the cost of doing more work in those cases.
679 * This works for all normalization modes,
680 * but it is currently optimized for UTF-8 only for "compose" modes,
681 * such as for NFC, NFKC, and NFKC_Casefold
682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
683 * For other modes it currently converts to UTF-16 and calls isNormalized().
685 * @param s UTF-8 input string
686 * @param errorCode Standard ICU error code. Its input value must
687 * pass the U_SUCCESS() test, or else the function returns
688 * immediately. Check for U_FAILURE() on output or use with
689 * function chaining. (See User Guide for details.)
690 * @return TRUE if s is normalized
694 isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
696 * Tests if the string is normalized.
697 * For details see the Normalizer2 base class documentation.
698 * @param s input string
699 * @param errorCode Standard ICU error code. Its input value must
700 * pass the U_SUCCESS() test, or else the function returns
701 * immediately. Check for U_FAILURE() on output or use with
702 * function chaining. (See User Guide for details.)
703 * @return UNormalizationCheckResult
706 virtual UNormalizationCheckResult
707 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
709 * Returns the end of the normalized substring of the input string.
710 * For details see the Normalizer2 base class documentation.
711 * @param s input string
712 * @param errorCode Standard ICU error code. Its input value must
713 * pass the U_SUCCESS() test, or else the function returns
714 * immediately. Check for U_FAILURE() on output or use with
715 * function chaining. (See User Guide for details.)
716 * @return "yes" span end index
720 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const U_OVERRIDE
;
723 * Tests if the character always has a normalization boundary before it,
724 * regardless of context.
725 * For details see the Normalizer2 base class documentation.
726 * @param c character to test
727 * @return TRUE if c has a normalization boundary before it
730 virtual UBool
hasBoundaryBefore(UChar32 c
) const U_OVERRIDE
;
733 * Tests if the character always has a normalization boundary after it,
734 * regardless of context.
735 * For details see the Normalizer2 base class documentation.
736 * @param c character to test
737 * @return TRUE if c has a normalization boundary after it
740 virtual UBool
hasBoundaryAfter(UChar32 c
) const U_OVERRIDE
;
743 * Tests if the character is normalization-inert.
744 * For details see the Normalizer2 base class documentation.
745 * @param c character to test
746 * @return TRUE if c is normalization-inert
749 virtual UBool
isInert(UChar32 c
) const U_OVERRIDE
;
752 normalize(const UnicodeString
&src
,
754 USetSpanCondition spanCondition
,
755 UErrorCode
&errorCode
) const;
758 normalizeUTF8(uint32_t options
, const char *src
, int32_t length
,
759 ByteSink
&sink
, Edits
*edits
,
760 USetSpanCondition spanCondition
,
761 UErrorCode
&errorCode
) const;
764 normalizeSecondAndAppend(UnicodeString
&first
,
765 const UnicodeString
&second
,
767 UErrorCode
&errorCode
) const;
769 const Normalizer2
&norm2
;
770 const UnicodeSet
&set
;
775 #endif // !UCONFIG_NO_NORMALIZATION
777 #endif /* U_SHOW_CPLUSPLUS_API */
779 #endif // __NORMALIZER2_H__