git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/normalizer2.h
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / common / unicode / normalizer2.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21
22 /**
23 * \file
24 * \brief C++ API: New API for Unicode Normalization.
25 */
26
27 #include "unicode/utypes.h"
28
29 #if !UCONFIG_NO_NORMALIZATION
30
31 #include "unicode/stringpiece.h"
32 #include "unicode/uniset.h"
33 #include "unicode/unistr.h"
34 #include "unicode/unorm2.h"
35
36 #if U_SHOW_CPLUSPLUS_API
37 U_NAMESPACE_BEGIN
38
39 class ByteSink;
40
41 /**
42 * Unicode normalization functionality for standard Unicode normalization or
43 * for using custom mapping tables.
44 * All instances of this class are unmodifiable/immutable.
45 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
46 * The Normalizer2 class is not intended for public subclassing.
47 *
48 * The primary functions are to produce a normalized string and to detect whether
49 * a string is already normalized.
50 * The most commonly used normalization forms are those defined in
51 * http://www.unicode.org/unicode/reports/tr15/
52 * However, this API supports additional normalization forms for specialized purposes.
53 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
54 * and can be used in implementations of UTS #46.
55 *
56 * Not only are the standard compose and decompose modes supplied,
57 * but additional modes are provided as documented in the UNormalization2Mode enum.
58 *
59 * Some of the functions in this class identify normalization boundaries.
60 * At a normalization boundary, the portions of the string
61 * before it and starting from it do not interact and can be handled independently.
62 *
63 * The spanQuickCheckYes() stops at a normalization boundary.
64 * When the goal is a normalized string, then the text before the boundary
65 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
66 *
67 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
68 * a character is guaranteed to be at a normalization boundary,
69 * regardless of context.
70 * This is used for moving from one normalization boundary to the next
71 * or preceding boundary, and for performing iterative normalization.
72 *
73 * Iterative normalization is useful when only a small portion of a
74 * longer string needs to be processed.
75 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
76 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
77 * (to process only the substring for which sort key bytes are computed).
78 *
79 * The set of normalization boundaries returned by these functions may not be
80 * complete: There may be more boundaries that could be returned.
81 * Different functions may return different boundaries.
82 * @stable ICU 4.4
83 */
84 class U_COMMON_API Normalizer2 : public UObject {
85 public:
86 /**
87 * Destructor.
88 * @stable ICU 4.4
89 */
90 ~Normalizer2();
91
92 /**
93 * Returns a Normalizer2 instance for Unicode NFC normalization.
94 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
95 * Returns an unmodifiable singleton instance. Do not delete it.
96 * @param errorCode Standard ICU error code. Its input value must
97 * pass the U_SUCCESS() test, or else the function returns
98 * immediately. Check for U_FAILURE() on output or use with
99 * function chaining. (See User Guide for details.)
100 * @return the requested Normalizer2, if successful
101 * @stable ICU 49
102 */
103 static const Normalizer2 *
104 getNFCInstance(UErrorCode &errorCode);
105
106 /**
107 * Returns a Normalizer2 instance for Unicode NFD normalization.
108 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); NFD uses the "nfc" data and differs from NFC only in the mode.
109 * Returns an unmodifiable singleton instance. Do not delete it.
110 * @param errorCode Standard ICU error code. Its input value must
111 * pass the U_SUCCESS() test, or else the function returns
112 * immediately. Check for U_FAILURE() on output or use with
113 * function chaining. (See User Guide for details.)
114 * @return the requested Normalizer2, if successful
115 * @stable ICU 49
116 */
117 static const Normalizer2 *
118 getNFDInstance(UErrorCode &errorCode);
119
120 /**
121 * Returns a Normalizer2 instance for Unicode NFKC normalization.
122 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
123 * Returns an unmodifiable singleton instance. Do not delete it.
124 * @param errorCode Standard ICU error code. Its input value must
125 * pass the U_SUCCESS() test, or else the function returns
126 * immediately. Check for U_FAILURE() on output or use with
127 * function chaining. (See User Guide for details.)
128 * @return the requested Normalizer2, if successful
129 * @stable ICU 49
130 */
131 static const Normalizer2 *
132 getNFKCInstance(UErrorCode &errorCode);
133
134 /**
135 * Returns a Normalizer2 instance for Unicode NFKD normalization.
136 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode); NFKD uses the "nfkc" data and differs from NFKC only in the mode.
137 * Returns an unmodifiable singleton instance. Do not delete it.
138 * @param errorCode Standard ICU error code. Its input value must
139 * pass the U_SUCCESS() test, or else the function returns
140 * immediately. Check for U_FAILURE() on output or use with
141 * function chaining. (See User Guide for details.)
142 * @return the requested Normalizer2, if successful
143 * @stable ICU 49
144 */
145 static const Normalizer2 *
146 getNFKDInstance(UErrorCode &errorCode);
147
148 /**
149 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
150 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
151 * Returns an unmodifiable singleton instance. Do not delete it.
152 * @param errorCode Standard ICU error code. Its input value must
153 * pass the U_SUCCESS() test, or else the function returns
154 * immediately. Check for U_FAILURE() on output or use with
155 * function chaining. (See User Guide for details.)
156 * @return the requested Normalizer2, if successful
157 * @stable ICU 49
158 */
159 static const Normalizer2 *
160 getNFKCCasefoldInstance(UErrorCode &errorCode);
161
162 /**
163 * Returns a Normalizer2 instance which uses the specified data file
164 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
165 * and which composes or decomposes text according to the specified mode.
166 * Returns an unmodifiable singleton instance. Do not delete it.
167 *
168 * Use packageName=NULL for data files that are part of ICU's own data.
169 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
170 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
171 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
172 *
173 * @param packageName NULL for ICU built-in data, otherwise application data package name
174 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
175 * @param mode normalization mode (compose or decompose etc.)
176 * @param errorCode Standard ICU error code. Its input value must
177 * pass the U_SUCCESS() test, or else the function returns
178 * immediately. Check for U_FAILURE() on output or use with
179 * function chaining. (See User Guide for details.)
180 * @return the requested Normalizer2, if successful
181 * @stable ICU 4.4
182 */
183 static const Normalizer2 *
184 getInstance(const char *packageName,
185 const char *name,
186 UNormalization2Mode mode,
187 UErrorCode &errorCode);
188
189 /**
190 * Returns the normalized form of the source string as a new UnicodeString.
191 * @param src source string
192 * @param errorCode Standard ICU error code. Its input value must
193 * pass the U_SUCCESS() test, or else the function returns
194 * immediately. Check for U_FAILURE() on output or use with
195 * function chaining. (See User Guide for details.)
196 * @return normalized src
197 * @stable ICU 4.4
198 */
199 UnicodeString
200 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
201 UnicodeString result;  // destination handed to the virtual three-argument overload
202 normalize(src, result, errorCode);  // errors are reported through errorCode only
203 return result;  // returned by value
204 }
205 /**
206 * Writes the normalized form of the source string to the destination string
207 * (replacing its contents) and returns the destination string.
208 * The source and destination strings must be different objects.
209 * @param src source string
210 * @param dest destination string; its contents are replaced with normalized src
211 * @param errorCode Standard ICU error code. Its input value must
212 * pass the U_SUCCESS() test, or else the function returns
213 * immediately. Check for U_FAILURE() on output or use with
214 * function chaining. (See User Guide for details.)
215 * @return dest
216 * @stable ICU 4.4
217 */
218 virtual UnicodeString &
219 normalize(const UnicodeString &src,
220 UnicodeString &dest,
221 UErrorCode &errorCode) const = 0;
222
223 /**
224 * Normalizes a UTF-8 string and optionally records how source substrings
225 * relate to changed and unchanged result substrings.
226 *
227 * Currently implemented completely only for "compose" modes,
228 * such as for NFC, NFKC, and NFKC_Casefold
229 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
230 * Otherwise currently converts to & from UTF-16 and does not support edits.
231 *
232 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
233 * @param src Source UTF-8 string.
234 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
235 * sink.Flush() is called at the end.
236 * @param edits Records edits for index mapping, working with styled text,
237 * and getting only changes (if any).
238 * The Edits contents are undefined if any error occurs.
239 * This function calls edits->reset() first unless
240 * options includes U_EDITS_NO_RESET. edits can be nullptr.
241 * @param errorCode Standard ICU error code. Its input value must
242 * pass the U_SUCCESS() test, or else the function returns
243 * immediately. Check for U_FAILURE() on output or use with
244 * function chaining. (See User Guide for details.)
245 * @draft ICU 60
246 */
247 virtual void
248 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
249 Edits *edits, UErrorCode &errorCode) const;
250
251 /**
252 * Appends the normalized form of the second string to the first string
253 * (merging them at the boundary) and returns the first string.
254 * The result is normalized if the first string was normalized.
255 * The first and second strings must be different objects.
256 * @param first string, should be normalized
257 * @param second string, will be normalized
258 * @param errorCode Standard ICU error code. Its input value must
259 * pass the U_SUCCESS() test, or else the function returns
260 * immediately. Check for U_FAILURE() on output or use with
261 * function chaining. (See User Guide for details.)
262 * @return first
263 * @stable ICU 4.4
264 */
265 virtual UnicodeString &
266 normalizeSecondAndAppend(UnicodeString &first,
267 const UnicodeString &second,
268 UErrorCode &errorCode) const = 0;
269 /**
270 * Appends the second string to the first string
271 * (merging them at the boundary) and returns the first string.
272 * The result is normalized if both the strings were normalized.
273 * The first and second strings must be different objects.
274 * @param first string, should be normalized
275 * @param second string, should be normalized
276 * @param errorCode Standard ICU error code. Its input value must
277 * pass the U_SUCCESS() test, or else the function returns
278 * immediately. Check for U_FAILURE() on output or use with
279 * function chaining. (See User Guide for details.)
280 * @return first
281 * @stable ICU 4.4
282 */
283 virtual UnicodeString &
284 append(UnicodeString &first,
285 const UnicodeString &second,
286 UErrorCode &errorCode) const = 0;
287
288 /**
289 * Gets the decomposition mapping of c.
290 * Roughly equivalent to normalizing the String form of c
291 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
292 * returns FALSE and does not write a string
293 * if c does not have a decomposition mapping in this instance's data.
294 * This function is independent of the mode of the Normalizer2.
295 * @param c code point
296 * @param decomposition String object which will be set to c's
297 * decomposition mapping, if there is one.
298 * @return TRUE if c has a decomposition, otherwise FALSE
299 * @stable ICU 4.6
300 */
301 virtual UBool
302 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
303
304 /**
305 * Gets the raw decomposition mapping of c.
306 *
307 * This is similar to the getDecomposition() method but returns the
308 * raw decomposition mapping as specified in UnicodeData.txt or
309 * (for custom data) in the mapping files processed by the gennorm2 tool.
310 * By contrast, getDecomposition() returns the processed,
311 * recursively-decomposed version of this mapping.
312 *
313 * When used on a standard NFKC Normalizer2 instance,
314 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
315 *
316 * When used on a standard NFC Normalizer2 instance,
317 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
318 * in this case, the result contains either one or two code points (=1..4 char16_ts).
319 *
320 * This function is independent of the mode of the Normalizer2.
321 * The default implementation returns FALSE.
322 * @param c code point
323 * @param decomposition String object which will be set to c's
324 * raw decomposition mapping, if there is one.
325 * @return TRUE if c has a decomposition, otherwise FALSE
326 * @stable ICU 49
327 */
328 virtual UBool
329 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
330
331 /**
332 * Performs pairwise composition of a & b and returns the composite if there is one.
333 *
334 * Returns a composite code point c only if c has a two-way mapping to a+b.
335 * In standard Unicode normalization, this means that
336 * c has a canonical decomposition to a+b
337 * and c does not have the Full_Composition_Exclusion property.
338 *
339 * This function is independent of the mode of the Normalizer2.
340 * The default implementation returns a negative value.
341 * @param a A (normalization starter) code point.
342 * @param b Another code point.
343 * @return The non-negative composite code point if there is one; otherwise a negative value.
344 * @stable ICU 49
345 */
346 virtual UChar32
347 composePair(UChar32 a, UChar32 b) const;
348
349 /**
350 * Gets the combining class of c.
351 * The default implementation returns 0
352 * but all standard implementations return the Unicode Canonical_Combining_Class value.
353 * @param c code point
354 * @return c's combining class
355 * @stable ICU 49
356 */
357 virtual uint8_t
358 getCombiningClass(UChar32 c) const;
359
360 /**
361 * Tests if the string is normalized.
362 * Internally, in cases where the quickCheck() method would return "maybe"
363 * (which is only possible for the two COMPOSE modes) this method
364 * resolves to "yes" or "no" to provide a definitive result,
365 * at the cost of doing more work in those cases.
366 * @param s input string
367 * @param errorCode Standard ICU error code. Its input value must
368 * pass the U_SUCCESS() test, or else the function returns
369 * immediately. Check for U_FAILURE() on output or use with
370 * function chaining. (See User Guide for details.)
371 * @return TRUE if s is normalized
372 * @stable ICU 4.4
373 */
374 virtual UBool
375 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
376 /**
377 * Tests if the UTF-8 string is normalized.
378 * Internally, in cases where the quickCheck() method would return "maybe"
379 * (which is only possible for the two COMPOSE modes) this method
380 * resolves to "yes" or "no" to provide a definitive result,
381 * at the cost of doing more work in those cases.
382 *
383 * This works for all normalization modes,
384 * but it is currently optimized for UTF-8 only for "compose" modes,
385 * such as for NFC, NFKC, and NFKC_Casefold
386 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
387 * For other modes it currently converts to UTF-16 and calls isNormalized().
388 *
389 * @param s UTF-8 input string
390 * @param errorCode Standard ICU error code. Its input value must
391 * pass the U_SUCCESS() test, or else the function returns
392 * immediately. Check for U_FAILURE() on output or use with
393 * function chaining. (See User Guide for details.)
394 * @return TRUE if s is normalized
395 * @draft ICU 60
396 */
397 virtual UBool
398 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
399
400
401 /**
402 * Tests if the string is normalized.
403 * For the two COMPOSE modes, the result could be "maybe" in cases that
404 * would take a little more work to resolve definitively.
405 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
406 * combination of quick check + normalization, to avoid
407 * re-checking the "yes" prefix.
408 * @param s input string
409 * @param errorCode Standard ICU error code. Its input value must
410 * pass the U_SUCCESS() test, or else the function returns
411 * immediately. Check for U_FAILURE() on output or use with
412 * function chaining. (See User Guide for details.)
413 * @return UNormalizationCheckResult
414 * @stable ICU 4.4
415 */
416 virtual UNormalizationCheckResult
417 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
418
419 /**
420 * Returns the end of the normalized substring of the input string.
421 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
422 * the substring <code>UnicodeString(s, 0, end)</code>
423 * will pass the quick check with a "yes" result.
424 *
425 * The returned end index is usually one or more characters before the
426 * "no" or "maybe" character: The end index is at a normalization boundary.
427 * (See the class documentation for more about normalization boundaries.)
428 *
429 * When the goal is a normalized string and most input strings are expected
430 * to be normalized already, then call this method,
431 * and if it returns a prefix shorter than the input string,
432 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
433 * @param s input string
434 * @param errorCode Standard ICU error code. Its input value must
435 * pass the U_SUCCESS() test, or else the function returns
436 * immediately. Check for U_FAILURE() on output or use with
437 * function chaining. (See User Guide for details.)
438 * @return "yes" span end index
439 * @stable ICU 4.4
440 */
441 virtual int32_t
442 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
443
444 /**
445 * Tests if the character always has a normalization boundary before it,
446 * regardless of context.
447 * If true, then the character does not normalization-interact with
448 * preceding characters.
449 * In other words, a string containing this character can be normalized
450 * by processing portions before this character and starting from this
451 * character independently.
452 * This is used for iterative normalization. See the class documentation for details.
453 * @param c character to test
454 * @return TRUE if c has a normalization boundary before it
455 * @stable ICU 4.4
456 */
457 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
458
459 /**
460 * Tests if the character always has a normalization boundary after it,
461 * regardless of context.
462 * If true, then the character does not normalization-interact with
463 * following characters.
464 * In other words, a string containing this character can be normalized
465 * by processing portions up to this character and after this
466 * character independently.
467 * This is used for iterative normalization. See the class documentation for details.
468 * Note that this operation may be significantly slower than hasBoundaryBefore().
469 * @param c character to test
470 * @return TRUE if c has a normalization boundary after it
471 * @stable ICU 4.4
472 */
473 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
474
475 /**
476 * Tests if the character is normalization-inert.
477 * If true, then the character does not change, nor normalization-interact with
478 * preceding or following characters.
479 * In other words, a string containing this character can be normalized
480 * by processing portions before this character and after this
481 * character independently.
482 * This is used for iterative normalization. See the class documentation for details.
483 * Note that this operation may be significantly slower than hasBoundaryBefore().
484 * @param c character to test
485 * @return TRUE if c is normalization-inert
486 * @stable ICU 4.4
487 */
488 virtual UBool isInert(UChar32 c) const = 0;
489 };
490
491 /**
492 * Normalization filtered by a UnicodeSet.
493 * Normalizes portions of the text contained in the filter set and leaves
494 * portions not contained in the filter set unchanged.
495 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
496 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
497 * This class implements all of (and only) the Normalizer2 API.
498 * An instance of this class is unmodifiable/immutable but is constructed and
499 * must be destructed by the owner.
500 * @stable ICU 4.4
501 */
502 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
503 public:
504 /**
505 * Constructs a filtered normalizer wrapping any Normalizer2 instance
506 * and a filter set.
507 * Both are aliased and must not be modified or deleted while this object
508 * is used.
509 * The filter set should be frozen; otherwise the performance will suffer greatly.
510 * @param n2 wrapped Normalizer2 instance
511 * @param filterSet UnicodeSet which determines the characters to be normalized
512 * @stable ICU 4.4
513 */
514 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
515 norm2(n2), set(filterSet) {}  // binds the reference members only; neither argument is copied
516
517 /**
518 * Destructor.
519 * @stable ICU 4.4
520 */
521 ~FilteredNormalizer2();
522
523 /**
524 * Writes the normalized form of the source string to the destination string
525 * (replacing its contents) and returns the destination string.
526 * The source and destination strings must be different objects.
527 * @param src source string
528 * @param dest destination string; its contents are replaced with normalized src
529 * @param errorCode Standard ICU error code. Its input value must
530 * pass the U_SUCCESS() test, or else the function returns
531 * immediately. Check for U_FAILURE() on output or use with
532 * function chaining. (See User Guide for details.)
533 * @return dest
534 * @stable ICU 4.4
535 */
536 virtual UnicodeString &
537 normalize(const UnicodeString &src,
538 UnicodeString &dest,
539 UErrorCode &errorCode) const U_OVERRIDE;
540
541 /**
542 * Normalizes a UTF-8 string and optionally records how source substrings
543 * relate to changed and unchanged result substrings.
544 *
545 * Currently implemented completely only for "compose" modes,
546 * such as for NFC, NFKC, and NFKC_Casefold
547 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
548 * Otherwise currently converts to & from UTF-16 and does not support edits.
549 *
550 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
551 * @param src Source UTF-8 string.
552 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
553 * sink.Flush() is called at the end.
554 * @param edits Records edits for index mapping, working with styled text,
555 * and getting only changes (if any).
556 * The Edits contents are undefined if any error occurs.
557 * This function calls edits->reset() first unless
558 * options includes U_EDITS_NO_RESET. edits can be nullptr.
559 * @param errorCode Standard ICU error code. Its input value must
560 * pass the U_SUCCESS() test, or else the function returns
561 * immediately. Check for U_FAILURE() on output or use with
562 * function chaining. (See User Guide for details.)
563 * @draft ICU 60
564 */
565 virtual void
566 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
567 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
568
569 /**
570 * Appends the normalized form of the second string to the first string
571 * (merging them at the boundary) and returns the first string.
572 * The result is normalized if the first string was normalized.
573 * The first and second strings must be different objects.
574 * @param first string, should be normalized
575 * @param second string, will be normalized
576 * @param errorCode Standard ICU error code. Its input value must
577 * pass the U_SUCCESS() test, or else the function returns
578 * immediately. Check for U_FAILURE() on output or use with
579 * function chaining. (See User Guide for details.)
580 * @return first
581 * @stable ICU 4.4
582 */
583 virtual UnicodeString &
584 normalizeSecondAndAppend(UnicodeString &first,
585 const UnicodeString &second,
586 UErrorCode &errorCode) const U_OVERRIDE;
587 /**
588 * Appends the second string to the first string
589 * (merging them at the boundary) and returns the first string.
590 * The result is normalized if both the strings were normalized.
591 * The first and second strings must be different objects.
592 * @param first string, should be normalized
593 * @param second string, should be normalized
594 * @param errorCode Standard ICU error code. Its input value must
595 * pass the U_SUCCESS() test, or else the function returns
596 * immediately. Check for U_FAILURE() on output or use with
597 * function chaining. (See User Guide for details.)
598 * @return first
599 * @stable ICU 4.4
600 */
601 virtual UnicodeString &
602 append(UnicodeString &first,
603 const UnicodeString &second,
604 UErrorCode &errorCode) const U_OVERRIDE;
605
606 /**
607 * Gets the decomposition mapping of c.
608 * For details see the base class documentation.
609 *
610 * This function is independent of the mode of the Normalizer2.
611 * @param c code point
612 * @param decomposition String object which will be set to c's
613 * decomposition mapping, if there is one.
614 * @return TRUE if c has a decomposition, otherwise FALSE
615 * @stable ICU 4.6
616 */
617 virtual UBool
618 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
619
620 /**
621 * Gets the raw decomposition mapping of c.
622 * For details see the base class documentation.
623 *
624 * This function is independent of the mode of the Normalizer2.
625 * @param c code point
626 * @param decomposition String object which will be set to c's
627 * raw decomposition mapping, if there is one.
628 * @return TRUE if c has a decomposition, otherwise FALSE
629 * @stable ICU 49
630 */
631 virtual UBool
632 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
633
634 /**
635 * Performs pairwise composition of a & b and returns the composite if there is one.
636 * For details see the base class documentation.
637 *
638 * This function is independent of the mode of the Normalizer2.
639 * @param a A (normalization starter) code point.
640 * @param b Another code point.
641 * @return The non-negative composite code point if there is one; otherwise a negative value.
642 * @stable ICU 49
643 */
644 virtual UChar32
645 composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
646
647 /**
648 * Gets the combining class of c.
649 * For details see the base class documentation.
650 *
651 * @param c code point
652 * @return c's combining class
653 * @stable ICU 49
654 */
655 virtual uint8_t
656 getCombiningClass(UChar32 c) const U_OVERRIDE;
657
658 /**
659 * Tests if the string is normalized.
660 * For details see the Normalizer2 base class documentation.
661 * @param s input string
662 * @param errorCode Standard ICU error code. Its input value must
663 * pass the U_SUCCESS() test, or else the function returns
664 * immediately. Check for U_FAILURE() on output or use with
665 * function chaining. (See User Guide for details.)
666 * @return TRUE if s is normalized
667 * @stable ICU 4.4
668 */
669 virtual UBool
670 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
671 /**
672 * Tests if the UTF-8 string is normalized.
673 * Internally, in cases where the quickCheck() method would return "maybe"
674 * (which is only possible for the two COMPOSE modes) this method
675 * resolves to "yes" or "no" to provide a definitive result,
676 * at the cost of doing more work in those cases.
677 *
678 * This works for all normalization modes,
679 * but it is currently optimized for UTF-8 only for "compose" modes,
680 * such as for NFC, NFKC, and NFKC_Casefold
681 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
682 * For other modes it currently converts to UTF-16 and calls isNormalized().
683 *
684 * @param s UTF-8 input string
685 * @param errorCode Standard ICU error code. Its input value must
686 * pass the U_SUCCESS() test, or else the function returns
687 * immediately. Check for U_FAILURE() on output or use with
688 * function chaining. (See User Guide for details.)
689 * @return TRUE if s is normalized
690 * @draft ICU 60
691 */
692 virtual UBool
693 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
694 /**
695 * Tests if the string is normalized.
696 * For details see the Normalizer2 base class documentation.
697 * @param s input string
698 * @param errorCode Standard ICU error code. Its input value must
699 * pass the U_SUCCESS() test, or else the function returns
700 * immediately. Check for U_FAILURE() on output or use with
701 * function chaining. (See User Guide for details.)
702 * @return UNormalizationCheckResult
703 * @stable ICU 4.4
704 */
705 virtual UNormalizationCheckResult
706 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
707 /**
708 * Returns the end of the normalized substring of the input string.
709 * For details see the Normalizer2 base class documentation.
710 * @param s input string
711 * @param errorCode Standard ICU error code. Its input value must
712 * pass the U_SUCCESS() test, or else the function returns
713 * immediately. Check for U_FAILURE() on output or use with
714 * function chaining. (See User Guide for details.)
715 * @return "yes" span end index
716 * @stable ICU 4.4
717 */
718 virtual int32_t
719 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
720
721 /**
722 * Tests if the character always has a normalization boundary before it,
723 * regardless of context.
724 * For details see the Normalizer2 base class documentation.
725 * @param c character to test
726 * @return TRUE if c has a normalization boundary before it
727 * @stable ICU 4.4
728 */
729 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
730
731 /**
732 * Tests if the character always has a normalization boundary after it,
733 * regardless of context.
734 * For details see the Normalizer2 base class documentation.
735 * @param c character to test
736 * @return TRUE if c has a normalization boundary after it
737 * @stable ICU 4.4
738 */
739 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
740
741 /**
742 * Tests if the character is normalization-inert.
743 * For details see the Normalizer2 base class documentation.
744 * @param c character to test
745 * @return TRUE if c is normalization-inert
746 * @stable ICU 4.4
747 */
748 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
749 private:  // non-virtual helper overloads below are declared only; their definitions are not in this header
750 UnicodeString &
751 normalize(const UnicodeString &src,
752 UnicodeString &dest,
753 USetSpanCondition spanCondition,  // extra parameter compared with the public normalize()
754 UErrorCode &errorCode) const;
755
756 void
757 normalizeUTF8(uint32_t options, const char *src, int32_t length,
758 ByteSink &sink, Edits *edits,
759 USetSpanCondition spanCondition,  // extra parameter compared with the public normalizeUTF8()
760 UErrorCode &errorCode) const;
761
762 UnicodeString &
763 normalizeSecondAndAppend(UnicodeString &first,
764 const UnicodeString &second,
765 UBool doNormalize,  // NOTE(review): presumably selects normalizeSecondAndAppend() vs. append() behavior -- confirm in the .cpp
766 UErrorCode &errorCode) const;
767
768 const Normalizer2 &norm2;  // wrapped normalizer passed to the constructor; aliased, not owned
769 const UnicodeSet &set;  // filter set passed to the constructor; aliased, not owned
770 };
771
772 U_NAMESPACE_END
773 #endif // U_SHOW_CPLUSPLUS_API
774
775 #endif // !UCONFIG_NO_NORMALIZATION
776 #endif // __NORMALIZER2_H__