1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2.h
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
24 * \brief C++ API: New API for Unicode Normalization.
27 #include "unicode/utypes.h"
29 #if !UCONFIG_NO_NORMALIZATION
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/unorm2.h"
35 #if U_SHOW_CPLUSPLUS_API
39 * Unicode normalization functionality for standard Unicode normalization or
40 * for using custom mapping tables.
41 * All instances of this class are unmodifiable/immutable.
42 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
43 * The Normalizer2 class is not intended for public subclassing.
45 * The primary functions are to produce a normalized string and to detect whether
46 * a string is already normalized.
47 * The most commonly used normalization forms are those defined in
48 * http://www.unicode.org/unicode/reports/tr15/
49 * However, this API supports additional normalization forms for specialized purposes.
50 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
51 * and can be used in implementations of UTS #46.
53 * Not only are the standard compose and decompose modes supplied,
54 * but additional modes are provided as documented in the UNormalization2Mode enum.
56 * Some of the functions in this class identify normalization boundaries.
57 * At a normalization boundary, the portions of the string
58 * before it and starting from it do not interact and can be handled independently.
60 * The spanQuickCheckYes() function stops at a normalization boundary.
61 * When the goal is a normalized string, then the text before the boundary
62 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
64 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
65 * a character is guaranteed to be at a normalization boundary,
66 * regardless of context.
67 * This is used for moving from one normalization boundary to the next
68 * or preceding boundary, and for performing iterative normalization.
70 * Iterative normalization is useful when only a small portion of a
71 * longer string needs to be processed.
72 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
73 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
74 * (to process only the substring for which sort key bytes are computed).
76 * The set of normalization boundaries returned by these functions may not be
77 * complete: There may be more boundaries that could be returned.
78 * Different functions may return different boundaries.
81 class U_COMMON_API Normalizer2
: public UObject
{
90 * Returns a Normalizer2 instance for Unicode NFC normalization.
91 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
92 * Returns an unmodifiable singleton instance. Do not delete it.
93 * @param errorCode Standard ICU error code. Its input value must
94 * pass the U_SUCCESS() test, or else the function returns
95 * immediately. Check for U_FAILURE() on output or use with
96 * function chaining. (See User Guide for details.)
97 * @return the requested Normalizer2, if successful
100 static const Normalizer2
*
101 getNFCInstance(UErrorCode
&errorCode
);
104 * Returns a Normalizer2 instance for Unicode NFD normalization.
105 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); NFD uses the "nfc" data in decompose mode.
106 * Returns an unmodifiable singleton instance. Do not delete it.
107 * @param errorCode Standard ICU error code. Its input value must
108 * pass the U_SUCCESS() test, or else the function returns
109 * immediately. Check for U_FAILURE() on output or use with
110 * function chaining. (See User Guide for details.)
111 * @return the requested Normalizer2, if successful
114 static const Normalizer2
*
115 getNFDInstance(UErrorCode
&errorCode
);
118 * Returns a Normalizer2 instance for Unicode NFKC normalization.
119 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
120 * Returns an unmodifiable singleton instance. Do not delete it.
121 * @param errorCode Standard ICU error code. Its input value must
122 * pass the U_SUCCESS() test, or else the function returns
123 * immediately. Check for U_FAILURE() on output or use with
124 * function chaining. (See User Guide for details.)
125 * @return the requested Normalizer2, if successful
128 static const Normalizer2
*
129 getNFKCInstance(UErrorCode
&errorCode
);
132 * Returns a Normalizer2 instance for Unicode NFKD normalization.
133 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode); NFKD uses the "nfkc" data in decompose mode.
134 * Returns an unmodifiable singleton instance. Do not delete it.
135 * @param errorCode Standard ICU error code. Its input value must
136 * pass the U_SUCCESS() test, or else the function returns
137 * immediately. Check for U_FAILURE() on output or use with
138 * function chaining. (See User Guide for details.)
139 * @return the requested Normalizer2, if successful
142 static const Normalizer2
*
143 getNFKDInstance(UErrorCode
&errorCode
);
146 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
147 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
148 * Returns an unmodifiable singleton instance. Do not delete it.
149 * @param errorCode Standard ICU error code. Its input value must
150 * pass the U_SUCCESS() test, or else the function returns
151 * immediately. Check for U_FAILURE() on output or use with
152 * function chaining. (See User Guide for details.)
153 * @return the requested Normalizer2, if successful
156 static const Normalizer2
*
157 getNFKCCasefoldInstance(UErrorCode
&errorCode
);
160 * Returns a Normalizer2 instance which uses the specified data file
161 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
162 * and which composes or decomposes text according to the specified mode.
163 * Returns an unmodifiable singleton instance. Do not delete it.
165 * Use packageName=NULL for data files that are part of ICU's own data.
166 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
167 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
168 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
170 * @param packageName NULL for ICU built-in data, otherwise application data package name
171 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
172 * @param mode normalization mode (compose or decompose etc.)
173 * @param errorCode Standard ICU error code. Its input value must
174 * pass the U_SUCCESS() test, or else the function returns
175 * immediately. Check for U_FAILURE() on output or use with
176 * function chaining. (See User Guide for details.)
177 * @return the requested Normalizer2, if successful
180 static const Normalizer2
*
181 getInstance(const char *packageName
,
183 UNormalization2Mode mode
,
184 UErrorCode
&errorCode
);
187 * Returns the normalized form of the source string.
188 * @param src source string
189 * @param errorCode Standard ICU error code. Its input value must
190 * pass the U_SUCCESS() test, or else the function returns
191 * immediately. Check for U_FAILURE() on output or use with
192 * function chaining. (See User Guide for details.)
193 * @return normalized src
197 normalize(const UnicodeString
&src
, UErrorCode
&errorCode
) const {
198 UnicodeString result
;
199 normalize(src
, result
, errorCode
);
203 * Writes the normalized form of the source string to the destination string
204 * (replacing its contents) and returns the destination string.
205 * The source and destination strings must be different objects.
206 * @param src source string
207 * @param dest destination string; its contents are replaced with the normalized src
208 * @param errorCode Standard ICU error code. Its input value must
209 * pass the U_SUCCESS() test, or else the function returns
210 * immediately. Check for U_FAILURE() on output or use with
211 * function chaining. (See User Guide for details.)
215 virtual UnicodeString
&
216 normalize(const UnicodeString
&src
,
218 UErrorCode
&errorCode
) const = 0;
220 * Appends the normalized form of the second string to the first string
221 * (merging them at the boundary) and returns the first string.
222 * The result is normalized if the first string was normalized.
223 * The first and second strings must be different objects.
224 * @param first string, should be normalized
225 * @param second string, will be normalized
226 * @param errorCode Standard ICU error code. Its input value must
227 * pass the U_SUCCESS() test, or else the function returns
228 * immediately. Check for U_FAILURE() on output or use with
229 * function chaining. (See User Guide for details.)
233 virtual UnicodeString
&
234 normalizeSecondAndAppend(UnicodeString
&first
,
235 const UnicodeString
&second
,
236 UErrorCode
&errorCode
) const = 0;
238 * Appends the second string to the first string
239 * (merging them at the boundary) and returns the first string.
240 * The result is normalized if both the strings were normalized.
241 * The first and second strings must be different objects.
242 * @param first string, should be normalized
243 * @param second string, should be normalized
244 * @param errorCode Standard ICU error code. Its input value must
245 * pass the U_SUCCESS() test, or else the function returns
246 * immediately. Check for U_FAILURE() on output or use with
247 * function chaining. (See User Guide for details.)
251 virtual UnicodeString
&
252 append(UnicodeString
&first
,
253 const UnicodeString
&second
,
254 UErrorCode
&errorCode
) const = 0;
257 * Gets the decomposition mapping of c.
258 * Roughly equivalent to normalizing the String form of c
259 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
260 * Unlike normalization, this function returns FALSE and does not write a string
261 * if c does not have a decomposition mapping in this instance's data.
262 * This function is independent of the mode of the Normalizer2.
263 * @param c code point
264 * @param decomposition String object which will be set to c's
265 * decomposition mapping, if there is one.
266 * @return TRUE if c has a decomposition, otherwise FALSE
270 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const = 0;
273 * Gets the raw decomposition mapping of c.
275 * This is similar to the getDecomposition() method but returns the
276 * raw decomposition mapping as specified in UnicodeData.txt or
277 * (for custom data) in the mapping files processed by the gennorm2 tool.
278 * By contrast, getDecomposition() returns the processed,
279 * recursively-decomposed version of this mapping.
281 * When used on a standard NFKC Normalizer2 instance,
282 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
284 * When used on a standard NFC Normalizer2 instance,
285 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
286 * in this case, the result contains either one or two code points (=1..4 char16_ts).
288 * This function is independent of the mode of the Normalizer2.
289 * The default implementation returns FALSE.
290 * @param c code point
291 * @param decomposition String object which will be set to c's
292 * raw decomposition mapping, if there is one.
293 * @return TRUE if c has a decomposition, otherwise FALSE
297 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const;
300 * Performs pairwise composition of a & b and returns the composite if there is one.
302 * Returns a composite code point c only if c has a two-way mapping to a+b.
303 * In standard Unicode normalization, this means that
304 * c has a canonical decomposition to a+b
305 * and c does not have the Full_Composition_Exclusion property.
307 * This function is independent of the mode of the Normalizer2.
308 * The default implementation returns a negative value.
309 * @param a A (normalization starter) code point.
310 * @param b Another code point.
311 * @return The non-negative composite code point if there is one; otherwise a negative value.
315 composePair(UChar32 a
, UChar32 b
) const;
318 * Gets the combining class of c.
319 * The default implementation returns 0
320 * but all standard implementations return the Unicode Canonical_Combining_Class value.
321 * @param c code point
322 * @return c's combining class
326 getCombiningClass(UChar32 c
) const;
329 * Tests if the string is normalized.
330 * Internally, in cases where the quickCheck() method would return "maybe"
331 * (which is only possible for the two COMPOSE modes) this method
332 * resolves to "yes" or "no" to provide a definitive result,
333 * at the cost of doing more work in those cases.
334 * @param s input string
335 * @param errorCode Standard ICU error code. Its input value must
336 * pass the U_SUCCESS() test, or else the function returns
337 * immediately. Check for U_FAILURE() on output or use with
338 * function chaining. (See User Guide for details.)
339 * @return TRUE if s is normalized
343 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
346 * Tests if the string is normalized.
347 * For the two COMPOSE modes, the result could be "maybe" in cases that
348 * would take a little more work to resolve definitively.
349 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
350 * combination of quick check + normalization, to avoid
351 * re-checking the "yes" prefix.
352 * @param s input string
353 * @param errorCode Standard ICU error code. Its input value must
354 * pass the U_SUCCESS() test, or else the function returns
355 * immediately. Check for U_FAILURE() on output or use with
356 * function chaining. (See User Guide for details.)
357 * @return UNormalizationCheckResult
360 virtual UNormalizationCheckResult
361 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
364 * Returns the end of the normalized substring of the input string.
365 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
366 * the substring <code>UnicodeString(s, 0, end)</code>
367 * will pass the quick check with a "yes" result.
369 * The returned end index is usually one or more characters before the
370 * "no" or "maybe" character: The end index is at a normalization boundary.
371 * (See the class documentation for more about normalization boundaries.)
373 * When the goal is a normalized string and most input strings are expected
374 * to be normalized already, then call this method,
375 * and if it returns a prefix shorter than the input string,
376 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
377 * @param s input string
378 * @param errorCode Standard ICU error code. Its input value must
379 * pass the U_SUCCESS() test, or else the function returns
380 * immediately. Check for U_FAILURE() on output or use with
381 * function chaining. (See User Guide for details.)
382 * @return "yes" span end index
386 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const = 0;
389 * Tests if the character always has a normalization boundary before it,
390 * regardless of context.
391 * If true, then the character does not normalization-interact with
392 * preceding characters.
393 * In other words, a string containing this character can be normalized
394 * by processing portions before this character and starting from this
395 * character independently.
396 * This is used for iterative normalization. See the class documentation for details.
397 * @param c character to test
398 * @return TRUE if c has a normalization boundary before it
401 virtual UBool
hasBoundaryBefore(UChar32 c
) const = 0;
404 * Tests if the character always has a normalization boundary after it,
405 * regardless of context.
406 * If true, then the character does not normalization-interact with
407 * following characters.
408 * In other words, a string containing this character can be normalized
409 * by processing portions up to this character and after this
410 * character independently.
411 * This is used for iterative normalization. See the class documentation for details.
412 * Note that this operation may be significantly slower than hasBoundaryBefore().
413 * @param c character to test
414 * @return TRUE if c has a normalization boundary after it
417 virtual UBool
hasBoundaryAfter(UChar32 c
) const = 0;
420 * Tests if the character is normalization-inert.
421 * If true, then the character does not change, nor normalization-interact with
422 * preceding or following characters.
423 * In other words, a string containing this character can be normalized
424 * by processing portions before this character and after this
425 * character independently.
426 * This is used for iterative normalization. See the class documentation for details.
427 * Note that this operation may be significantly slower than hasBoundaryBefore().
428 * @param c character to test
429 * @return TRUE if c is normalization-inert
432 virtual UBool
isInert(UChar32 c
) const = 0;
436 * Normalization filtered by a UnicodeSet.
437 * Normalizes portions of the text contained in the filter set and leaves
438 * portions not contained in the filter set unchanged.
439 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
440 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
441 * This class implements all of (and only) the Normalizer2 API.
442 * An instance of this class is unmodifiable/immutable but is constructed and
443 * must be destructed by the owner.
446 class U_COMMON_API FilteredNormalizer2
: public Normalizer2
{
449 * Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
451 * Both are aliased and must not be modified or deleted while this object is used.
453 * The filter set should be frozen; otherwise the performance will suffer greatly.
454 * @param n2 wrapped Normalizer2 instance
455 * @param filterSet UnicodeSet which determines the characters to be normalized
458 FilteredNormalizer2(const Normalizer2
&n2
, const UnicodeSet
&filterSet
) :
459 norm2(n2
), set(filterSet
) {}
465 ~FilteredNormalizer2();
468 * Writes the normalized form of the source string to the destination string
469 * (replacing its contents) and returns the destination string.
470 * The source and destination strings must be different objects.
471 * @param src source string
472 * @param dest destination string; its contents are replaced with the normalized src
473 * @param errorCode Standard ICU error code. Its input value must
474 * pass the U_SUCCESS() test, or else the function returns
475 * immediately. Check for U_FAILURE() on output or use with
476 * function chaining. (See User Guide for details.)
480 virtual UnicodeString
&
481 normalize(const UnicodeString
&src
,
483 UErrorCode
&errorCode
) const;
485 * Appends the normalized form of the second string to the first string
486 * (merging them at the boundary) and returns the first string.
487 * The result is normalized if the first string was normalized.
488 * The first and second strings must be different objects.
489 * @param first string, should be normalized
490 * @param second string, will be normalized
491 * @param errorCode Standard ICU error code. Its input value must
492 * pass the U_SUCCESS() test, or else the function returns
493 * immediately. Check for U_FAILURE() on output or use with
494 * function chaining. (See User Guide for details.)
498 virtual UnicodeString
&
499 normalizeSecondAndAppend(UnicodeString
&first
,
500 const UnicodeString
&second
,
501 UErrorCode
&errorCode
) const;
503 * Appends the second string to the first string
504 * (merging them at the boundary) and returns the first string.
505 * The result is normalized if both the strings were normalized.
506 * The first and second strings must be different objects.
507 * @param first string, should be normalized
508 * @param second string, should be normalized
509 * @param errorCode Standard ICU error code. Its input value must
510 * pass the U_SUCCESS() test, or else the function returns
511 * immediately. Check for U_FAILURE() on output or use with
512 * function chaining. (See User Guide for details.)
516 virtual UnicodeString
&
517 append(UnicodeString
&first
,
518 const UnicodeString
&second
,
519 UErrorCode
&errorCode
) const;
522 * Gets the decomposition mapping of c.
523 * For details see the base class documentation.
525 * This function is independent of the mode of the Normalizer2.
526 * @param c code point
527 * @param decomposition String object which will be set to c's
528 * decomposition mapping, if there is one.
529 * @return TRUE if c has a decomposition, otherwise FALSE
533 getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const;
536 * Gets the raw decomposition mapping of c.
537 * For details see the base class documentation.
539 * This function is independent of the mode of the Normalizer2.
540 * @param c code point
541 * @param decomposition String object which will be set to c's
542 * raw decomposition mapping, if there is one.
543 * @return TRUE if c has a decomposition, otherwise FALSE
547 getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const;
550 * Performs pairwise composition of a & b and returns the composite if there is one.
551 * For details see the base class documentation.
553 * This function is independent of the mode of the Normalizer2.
554 * @param a A (normalization starter) code point.
555 * @param b Another code point.
556 * @return The non-negative composite code point if there is one; otherwise a negative value.
560 composePair(UChar32 a
, UChar32 b
) const;
563 * Gets the combining class of c.
564 * The default implementation returns 0
565 * but all standard implementations return the Unicode Canonical_Combining_Class value.
566 * @param c code point
567 * @return c's combining class
571 getCombiningClass(UChar32 c
) const;
574 * Tests if the string is normalized.
575 * For details see the Normalizer2 base class documentation.
576 * @param s input string
577 * @param errorCode Standard ICU error code. Its input value must
578 * pass the U_SUCCESS() test, or else the function returns
579 * immediately. Check for U_FAILURE() on output or use with
580 * function chaining. (See User Guide for details.)
581 * @return TRUE if s is normalized
585 isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const;
587 * Tests if the string is normalized.
588 * For details see the Normalizer2 base class documentation.
589 * @param s input string
590 * @param errorCode Standard ICU error code. Its input value must
591 * pass the U_SUCCESS() test, or else the function returns
592 * immediately. Check for U_FAILURE() on output or use with
593 * function chaining. (See User Guide for details.)
594 * @return UNormalizationCheckResult
597 virtual UNormalizationCheckResult
598 quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const;
600 * Returns the end of the normalized substring of the input string.
601 * For details see the Normalizer2 base class documentation.
602 * @param s input string
603 * @param errorCode Standard ICU error code. Its input value must
604 * pass the U_SUCCESS() test, or else the function returns
605 * immediately. Check for U_FAILURE() on output or use with
606 * function chaining. (See User Guide for details.)
607 * @return "yes" span end index
611 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const;
614 * Tests if the character always has a normalization boundary before it,
615 * regardless of context.
616 * For details see the Normalizer2 base class documentation.
617 * @param c character to test
618 * @return TRUE if c has a normalization boundary before it
621 virtual UBool
hasBoundaryBefore(UChar32 c
) const;
624 * Tests if the character always has a normalization boundary after it,
625 * regardless of context.
626 * For details see the Normalizer2 base class documentation.
627 * @param c character to test
628 * @return TRUE if c has a normalization boundary after it
631 virtual UBool
hasBoundaryAfter(UChar32 c
) const;
634 * Tests if the character is normalization-inert.
635 * For details see the Normalizer2 base class documentation.
636 * @param c character to test
637 * @return TRUE if c is normalization-inert
640 virtual UBool
isInert(UChar32 c
) const;
643 normalize(const UnicodeString
&src
,
645 USetSpanCondition spanCondition
,
646 UErrorCode
&errorCode
) const;
649 normalizeSecondAndAppend(UnicodeString
&first
,
650 const UnicodeString
&second
,
652 UErrorCode
&errorCode
) const;
654 const Normalizer2
&norm2
;
655 const UnicodeSet
&set
;
659 #endif // U_SHOW_CPLUSPLUS_API
661 #endif // !UCONFIG_NO_NORMALIZATION
662 #endif // __NORMALIZER2_H__