[apple/icu.git] / icuSources / common / unicode / normalizer2.h

/*
*******************************************************************************
*
*   Copyright (C) 2009-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/

#ifndef __NORMALIZER2_H__
#define __NORMALIZER2_H__

/**
 * \file
 * \brief C++ API: New API for Unicode Normalization.
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm2.h"

U_NAMESPACE_BEGIN

/**
 * Unicode normalization functionality for standard Unicode normalization or
 * for using custom mapping tables.
 * All instances of this class are unmodifiable/immutable.
 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
 * The Normalizer2 class is not intended for public subclassing.
 *
 * The primary functions are to produce a normalized string and to detect whether
 * a string is already normalized.
 * The most commonly used normalization forms are those defined in
 * http://www.unicode.org/unicode/reports/tr15/
 * However, this API supports additional normalization forms for specialized purposes.
 * For example, NFKC_Casefold is provided via
 * getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
 * and can be used in implementations of UTS #46.
 *
 * Not only are the standard compose and decompose modes supplied,
 * but additional modes are provided as documented in the UNormalization2Mode enum.
 *
 * Some of the functions in this class identify normalization boundaries.
 * At a normalization boundary, the portions of the string
 * before it and starting from it do not interact and can be handled independently.
 *
 * The spanQuickCheckYes() stops at a normalization boundary.
 * When the goal is a normalized string, then the text before the boundary
 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
 *
 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
 * a character is guaranteed to be at a normalization boundary,
 * regardless of context.
 * This is used for moving from one normalization boundary to the next
 * or preceding boundary, and for performing iterative normalization.
 *
 * Iterative normalization is useful when only a small portion of a
 * longer string needs to be processed.
 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
 * (to process only the substring for which sort key bytes are computed).
 *
 * The set of normalization boundaries returned by these functions may not be
 * complete: There may be more boundaries that could be returned.
 * Different functions may return different boundaries.
 * @stable ICU 4.4
 */
class U_COMMON_API Normalizer2 : public UObject {
public:
    /**
     * Returns a Normalizer2 instance which uses the specified data file
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
     * and which composes or decomposes text according to the specified mode.
     * Returns an unmodifiable singleton instance. Do not delete it.
     *
     * Use packageName=NULL for data files that are part of ICU's own data.
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
     *
     * @param packageName NULL for ICU built-in data, otherwise application data package name
     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
     * @param mode normalization mode (compose or decompose etc.)
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 4.4
     */
    static const Normalizer2 *
    getInstance(const char *packageName,
                const char *name,
                UNormalization2Mode mode,
                UErrorCode &errorCode);

    /**
     * Returns the normalized form of the source string.
     * Convenience overload: normalizes into a local UnicodeString via
     * normalize(src, dest, errorCode) and returns that string by value.
     *
     * Note for subclass authors: a subclass that declares its own normalize()
     * overloads hides this non-virtual overload (standard C++ name hiding)
     * unless it re-exposes it with a using-declaration.
     * @param src source string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return normalized src
     * @stable ICU 4.4
     */
    UnicodeString
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
        UnicodeString result;
        // Delegate to the pure virtual three-argument overload declared below.
        normalize(src, result, errorCode);
        return result;
    }
    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized src
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const = 0;
    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, will be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const = 0;
    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, should be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const = 0;

    /**
     * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
     * This function is independent of the mode of the Normalizer2.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @draft ICU 4.6
     */
    virtual UBool
    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;

    /**
     * Tests if the string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 4.4
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the string is normalized.
     * For the two COMPOSE modes, the result could be "maybe" in cases that
     * would take a little more work to resolve definitively.
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
     * combination of quick check + normalization, to avoid
     * re-checking the "yes" prefix.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the quick check result as a UNormalizationCheckResult
     *         (possibly "maybe" for the two COMPOSE modes, see above)
     * @stable ICU 4.4
     */
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Returns the end of the normalized substring of the input string.
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
     * the substring <code>UnicodeString(s, 0, end)</code>
     * will pass the quick check with a "yes" result.
     *
     * The returned end index is usually one or more characters before the
     * "no" or "maybe" character: The end index is at a normalization boundary.
     * (See the class documentation for more about normalization boundaries.)
     *
     * When the goal is a normalized string and most input strings are expected
     * to be normalized already, then call this method,
     * and if it returns a prefix shorter than the input string,
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @stable ICU 4.4
     */
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * preceding characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and starting from this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * @param c character to test
     * @return TRUE if c has a normalization boundary before it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions up to this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c has a normalization boundary after it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

    /**
     * Tests if the character is normalization-inert.
     * If true, then the character does not change, nor normalization-interact with
     * preceding or following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c is normalization-inert
     * @stable ICU 4.4
     */
    virtual UBool isInert(UChar32 c) const = 0;

private:
    // No ICU "poor man's RTTI" for this class nor its subclasses.
    // Kept private: it is not part of the Normalizer2 API.
    // NOTE(review): presumably overrides UObject::getDynamicClassID();
    // confirm against the UObject declaration.
    virtual UClassID getDynamicClassID() const;
};

/**
 * Normalization filtered by a UnicodeSet.
 * Normalizes portions of the text contained in the filter set and leaves
 * portions not contained in the filter set unchanged.
 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
 * This class implements all of (and only) the Normalizer2 API.
 * An instance of this class is unmodifiable/immutable but is constructed and
 * must be destructed by the owner.
 * @stable ICU 4.4
 */
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
public:
    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
     * and a filter set.
     * Both are aliased and must not be modified or deleted while this object
     * is used.
     * The filter set should be frozen; otherwise the performance will suffer greatly.
     * @param n2 wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
            norm2(n2), set(filterSet) {}

    /**
     * Re-exposes the inherited convenience overload
     * Normalizer2::normalize(const UnicodeString &src, UErrorCode &errorCode).
     * The normalize() overloads declared in this class would otherwise hide it
     * (C++ name hiding), so that it could not be called directly on a
     * FilteredNormalizer2 object. The inherited function delegates to the
     * virtual normalize(src, dest, errorCode) overridden below.
     */
    using Normalizer2::normalize;

    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized src
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const;
    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, will be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const;
    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, should be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const;

    /**
     * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
     * This function is independent of the mode of the Normalizer2.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @draft ICU 4.6
     */
    virtual UBool
    getDecomposition(UChar32 c, UnicodeString &decomposition) const;

    /**
     * Tests if the string is normalized.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 4.4
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
    /**
     * Tests if the string is normalized.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return UNormalizationCheckResult
     * @stable ICU 4.4
     */
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
    /**
     * Returns the end of the normalized substring of the input string.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @stable ICU 4.4
     */
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c has a normalization boundary before it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryBefore(UChar32 c) const;

    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c has a normalization boundary after it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryAfter(UChar32 c) const;

    /**
     * Tests if the character is normalization-inert.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c is normalization-inert
     * @stable ICU 4.4
     */
    virtual UBool isInert(UChar32 c) const;
private:
    // Internal overload of normalize() that additionally takes a
    // USetSpanCondition for the filter set.
    // NOTE(review): presumably the shared worker behind the public
    // normalize(); its definition is not in this header -- confirm there.
    UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              USetSpanCondition spanCondition,
              UErrorCode &errorCode) const;

    // Internal overload that additionally takes a doNormalize flag.
    // NOTE(review): presumably shared by the public normalizeSecondAndAppend()
    // and append(), with doNormalize selecting between them -- confirm
    // against the definition.
    UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UBool doNormalize,
                             UErrorCode &errorCode) const;

    // Wrapped normalizer passed to the constructor; aliased, not owned.
    const Normalizer2 &norm2;
    // Filter set passed to the constructor; aliased, not owned.
    const UnicodeSet &set;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_NORMALIZATION
#endif  // __NORMALIZER2_H__
Commit	Line	Data
729e4ab9 A	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2009-2010, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: normalizer2.h
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2009nov22
	14	* created by: Markus W. Scherer
	15	*/
	16
	17	#ifndef __NORMALIZER2_H__
	18	#define __NORMALIZER2_H__
	19
	20	/**
	21	* \file
	22	* \brief C++ API: New API for Unicode Normalization.
	23	*/
	24
	25	#include "unicode/utypes.h"
	26
	27	#if !UCONFIG_NO_NORMALIZATION
	28
	29	#include "unicode/uniset.h"
	30	#include "unicode/unistr.h"
	31	#include "unicode/unorm2.h"
	32
	33	U_NAMESPACE_BEGIN
	34
	35	/**
	36	* Unicode normalization functionality for standard Unicode normalization or
	37	* for using custom mapping tables.
	38	* All instances of this class are unmodifiable/immutable.
	39	* Instances returned by getInstance() are singletons that must not be deleted by the caller.
	40	* The Normalizer2 class is not intended for public subclassing.
	41	*
	42	* The primary functions are to produce a normalized string and to detect whether
	43	* a string is already normalized.
	44	* The most commonly used normalization forms are those defined in
	45	* http://www.unicode.org/unicode/reports/tr15/
	46	* However, this API supports additional normalization forms for specialized purposes.
	47	* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
	48	* and can be used in implementations of UTS #46.
	49	*
	50	* Not only are the standard compose and decompose modes supplied,
	51	* but additional modes are provided as documented in the Mode enum.
	52	*
	53	* Some of the functions in this class identify normalization boundaries.
	54	* At a normalization boundary, the portions of the string
	55	* before it and starting from it do not interact and can be handled independently.
	56	*
	57	* The spanQuickCheckYes() stops at a normalization boundary.
	58	* When the goal is a normalized string, then the text before the boundary
	59	* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
	60	*
	61	* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
	62	* a character is guaranteed to be at a normalization boundary,
	63	* regardless of context.
	64	* This is used for moving from one normalization boundary to the next
65	* or preceding boundary, and for performing iterative normalization.
66	*
67	* Iterative normalization is useful when only a small portion of a
68	* longer string needs to be processed.
69	* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
70	* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
71	* (to process only the substring for which sort key bytes are computed).
72	*
73	* The set of normalization boundaries returned by these functions may not be
74	* complete: There may be more boundaries that could be returned.
75	* Different functions may return different boundaries.
76	* @stable ICU 4.4
77	*/
78	class U_COMMON_API Normalizer2 : public UObject {
79	public:
80	/**
81	* Returns a Normalizer2 instance which uses the specified data file
82	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
83	* and which composes or decomposes text according to the specified mode.
84	* Returns an unmodifiable singleton instance. Do not delete it.
85	*
86	* Use packageName=NULL for data files that are part of ICU's own data.
87	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
88	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
89	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
90	*
91	* @param packageName NULL for ICU built-in data, otherwise application data package name
92	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
93	* @param mode normalization mode (compose or decompose etc.)
94	* @param errorCode Standard ICU error code. Its input value must
95	* pass the U_SUCCESS() test, or else the function returns
96	* immediately. Check for U_FAILURE() on output or use with
97	* function chaining. (See User Guide for details.)
98	* @return the requested Normalizer2, if successful
99	* @stable ICU 4.4
100	*/
101	static const Normalizer2 *
102	getInstance(const char *packageName,
103	const char *name,
104	UNormalization2Mode mode,
105	UErrorCode &errorCode);
106
107	/**
108	* Returns the normalized form of the source string.
109	* @param src source string
110	* @param errorCode Standard ICU error code. Its input value must
111	* pass the U_SUCCESS() test, or else the function returns
112	* immediately. Check for U_FAILURE() on output or use with
113	* function chaining. (See User Guide for details.)
114	* @return normalized src
115	* @stable ICU 4.4
116	*/
117	UnicodeString
118	normalize(const UnicodeString &src, UErrorCode &errorCode) const {
119	UnicodeString result;
120	normalize(src, result, errorCode);
121	return result;
122	}
123	/**
124	* Writes the normalized form of the source string to the destination string
125	* (replacing its contents) and returns the destination string.
126	* The source and destination strings must be different objects.
127	* @param src source string
128	* @param dest destination string; its contents is replaced with normalized src
129	* @param errorCode Standard ICU error code. Its input value must
130	* pass the U_SUCCESS() test, or else the function returns
131	* immediately. Check for U_FAILURE() on output or use with
132	* function chaining. (See User Guide for details.)
133	* @return dest
134	* @stable ICU 4.4
135	*/
136	virtual UnicodeString &
137	normalize(const UnicodeString &src,
138	UnicodeString &dest,
139	UErrorCode &errorCode) const = 0;
140	/**
141	* Appends the normalized form of the second string to the first string
142	* (merging them at the boundary) and returns the first string.
143	* The result is normalized if the first string was normalized.
144	* The first and second strings must be different objects.
145	* @param first string, should be normalized
146	* @param second string, will be normalized
147	* @param errorCode Standard ICU error code. Its input value must
148	* pass the U_SUCCESS() test, or else the function returns
149	* immediately. Check for U_FAILURE() on output or use with
150	* function chaining. (See User Guide for details.)
151	* @return first
152	* @stable ICU 4.4
153	*/
154	virtual UnicodeString &
155	normalizeSecondAndAppend(UnicodeString &first,
156	const UnicodeString &second,
157	UErrorCode &errorCode) const = 0;
158	/**
159	* Appends the second string to the first string
160	* (merging them at the boundary) and returns the first string.
161	* The result is normalized if both the strings were normalized.
162	* The first and second strings must be different objects.
163	* @param first string, should be normalized
164	* @param second string, should be normalized
165	* @param errorCode Standard ICU error code. Its input value must
166	* pass the U_SUCCESS() test, or else the function returns
167	* immediately. Check for U_FAILURE() on output or use with
168	* function chaining. (See User Guide for details.)
169	* @return first
170	* @stable ICU 4.4
171	*/
172	virtual UnicodeString &
173	append(UnicodeString &first,
174	const UnicodeString &second,
175	UErrorCode &errorCode) const = 0;
176
177	/**
178	* Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
179	* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
180	* This function is independent of the mode of the Normalizer2.
181	* @param c code point
182	* @param decomposition String object which will be set to c's
183	* decomposition mapping, if there is one.
184	* @return TRUE if c has a decomposition, otherwise FALSE
185	* @draft ICU 4.6
186	*/
187	virtual UBool
188	getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
189
190	/**
191	* Tests if the string is normalized.
192	* Internally, in cases where the quickCheck() method would return "maybe"
193	* (which is only possible for the two COMPOSE modes) this method
194	* resolves to "yes" or "no" to provide a definitive result,
195	* at the cost of doing more work in those cases.
196	* @param s input string
197	* @param errorCode Standard ICU error code. Its input value must
198	* pass the U_SUCCESS() test, or else the function returns
199	* immediately. Check for U_FAILURE() on output or use with
200	* function chaining. (See User Guide for details.)
201	* @return TRUE if s is normalized
202	* @stable ICU 4.4
203	*/
204	virtual UBool
205	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
206
207	/**
208	* Tests if the string is normalized.
209	* For the two COMPOSE modes, the result could be "maybe" in cases that
210	* would take a little more work to resolve definitively.
211	* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
212	* combination of quick check + normalization, to avoid
213	* re-checking the "yes" prefix.
214	* @param s input string
215	* @param errorCode Standard ICU error code. Its input value must
216	* pass the U_SUCCESS() test, or else the function returns
217	* immediately. Check for U_FAILURE() on output or use with
218	* function chaining. (See User Guide for details.)
219	* @return UNormalizationCheckResult
220	* @stable ICU 4.4
221	*/
222	virtual UNormalizationCheckResult
223	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
224
225	/**
226	* Returns the end index of the leading normalized substring (prefix) of the input string.
227	* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
228	* the substring <code>UnicodeString(s, 0, end)</code>
229	* will pass the quick check with a "yes" result.
230	*
231	* The returned end index is usually one or more characters before the
232	* "no" or "maybe" character, because the end index is always at a normalization boundary.
233	* (See the class documentation for more about normalization boundaries.)
234	*
235	* When the goal is a normalized string and most input strings are expected
236	* to be normalized already, then call this method,
237	* and if it returns a prefix shorter than the input string,
238	* copy that prefix and use normalizeSecondAndAppend() for the remainder.
239	* @param s input string
240	* @param errorCode Standard ICU error code. Its input value must
241	* pass the U_SUCCESS() test, or else the function returns
242	* immediately. Check for U_FAILURE() on output or use with
243	* function chaining. (See User Guide for details.)
244	* @return end index of the "yes" span that starts at the beginning of s
245	* @stable ICU 4.4
246	*/
247	virtual int32_t
248	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
249
250	/**
251	* Tests if the character always has a normalization boundary before it,
252	* regardless of context.
253	* If true, then the character does not normalization-interact with
254	* preceding characters.
255	* In other words, a string containing this character can be normalized
256	* by independently processing the portion before this character and
257	* the portion starting from this character.
258	* This is used for iterative normalization. See the class documentation for details.
259	* @param c character to test
260	* @return TRUE if c has a normalization boundary before it
261	* @stable ICU 4.4
262	*/
263	virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
264
265	/**
266	* Tests if the character always has a normalization boundary after it,
267	* regardless of context.
268	* If true, then the character does not normalization-interact with
269	* following characters.
270	* In other words, a string containing this character can be normalized
271	* by independently processing the portion up to and including this character
272	* and the portion after this character.
273	* This is used for iterative normalization. See the class documentation for details.
274	* Note that this operation may be significantly slower than hasBoundaryBefore().
275	* @param c character to test
276	* @return TRUE if c has a normalization boundary after it
277	* @stable ICU 4.4
278	*/
279	virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
280
281	/**
282	* Tests if the character is normalization-inert.
283	* If true, then the character neither changes under normalization nor
284	* normalization-interacts with preceding or following characters.
285	* In other words, a string containing this character can be normalized
286	* by independently processing the portion before this character and
287	* the portion after this character.
288	* This is used for iterative normalization. See the class documentation for details.
289	* Note that this operation may be significantly slower than hasBoundaryBefore().
290	* @param c character to test
291	* @return TRUE if c is normalization-inert
292	* @stable ICU 4.4
293	*/
294	virtual UBool isInert(UChar32 c) const = 0;
295
296	private:
297	// No ICU "poor man's RTTI" for this class nor its subclasses: getDynamicClassID() is declared private here so it is not part of the public API.
298	virtual UClassID getDynamicClassID() const;
299	};
300
301	/**
302	* Normalization filtered by a UnicodeSet.
303	* Normalizes portions of the text contained in the filter set and leaves
304	* portions not contained in the filter set unchanged.
305	* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
306	* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
307	* This class implements all of (and only) the Normalizer2 API.
308	* An instance of this class is unmodifiable/immutable but, unlike the getInstance()
309	* singletons, it is constructed by and must be destructed by its owner.
310	* @stable ICU 4.4
311	*/
312	class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
313	public:
314	/**
315	* Constructs a filtered normalizer wrapping any Normalizer2 instance
316	* and a filter set.
317	* Both are aliased (held by reference, not copied) and must not be modified
318	* or deleted while this object is used.
319	* The filter set should be frozen; otherwise the performance will suffer greatly.
320	* @param n2 wrapped Normalizer2 instance
321	* @param filterSet UnicodeSet which determines the characters to be normalized
322	* @stable ICU 4.4
323	*/
324	FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
325	norm2(n2), set(filterSet) {}  // binds the two reference members; nothing is copied
326
327	/**
328	* Writes the normalized form of the source string to the destination string
329	* (replacing its contents) and returns the destination string.
330	* The source and destination strings must be different objects.
331	* @param src source string
332	* @param dest destination string; its contents are replaced with normalized src
333	* @param errorCode Standard ICU error code. Its input value must
334	* pass the U_SUCCESS() test, or else the function returns
335	* immediately. Check for U_FAILURE() on output or use with
336	* function chaining. (See User Guide for details.)
337	* @return dest
338	* @stable ICU 4.4
339	*/
340	virtual UnicodeString &
341	normalize(const UnicodeString &src,
342	UnicodeString &dest,
343	UErrorCode &errorCode) const;
344	/**
345	* Appends the normalized form of the second string to the first string
346	* (merging them at the boundary) and returns the first string.
347	* The result is normalized if the first string was normalized.
348	* The first and second strings must be different objects.
349	* @param first string, should be normalized; receives the appended result
350	* @param second string, will be normalized
351	* @param errorCode Standard ICU error code. Its input value must
352	* pass the U_SUCCESS() test, or else the function returns
353	* immediately. Check for U_FAILURE() on output or use with
354	* function chaining. (See User Guide for details.)
355	* @return first
356	* @stable ICU 4.4
357	*/
358	virtual UnicodeString &
359	normalizeSecondAndAppend(UnicodeString &first,
360	const UnicodeString &second,
361	UErrorCode &errorCode) const;
362	/**
363	* Appends the second string to the first string
364	* (merging them at the boundary) and returns the first string.
365	* The result is normalized if both strings were normalized.
366	* The first and second strings must be different objects.
367	* @param first string, should be normalized; receives the appended result
368	* @param second string, should be normalized
369	* @param errorCode Standard ICU error code. Its input value must
370	* pass the U_SUCCESS() test, or else the function returns
371	* immediately. Check for U_FAILURE() on output or use with
372	* function chaining. (See User Guide for details.)
373	* @return first
374	* @stable ICU 4.4
375	*/
376	virtual UnicodeString &
377	append(UnicodeString &first,
378	const UnicodeString &second,
379	UErrorCode &errorCode) const;
380
381	/**
382	* Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
383	* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
384	* This function is independent of the mode of the Normalizer2.
385	* @param c code point
386	* @param decomposition String object which will be set to c's
387	* decomposition mapping, if there is one.
388	* @return TRUE if c has a decomposition, otherwise FALSE
389	* @draft ICU 4.6
390	*/
391	virtual UBool
392	getDecomposition(UChar32 c, UnicodeString &decomposition) const;
393
394	/**
395	* Tests if the string is normalized.
396	* For details see the Normalizer2 base class documentation.
397	* @param s input string
398	* @param errorCode Standard ICU error code. Its input value must
399	* pass the U_SUCCESS() test, or else the function returns
400	* immediately. Check for U_FAILURE() on output or use with
401	* function chaining. (See User Guide for details.)
402	* @return TRUE if s is normalized
403	* @stable ICU 4.4
404	*/
405	virtual UBool
406	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
407	/**
408	* Quick-checks whether the string is normalized.
409	* For details see the Normalizer2 base class documentation.
410	* @param s input string
411	* @param errorCode Standard ICU error code. Its input value must
412	* pass the U_SUCCESS() test, or else the function returns
413	* immediately. Check for U_FAILURE() on output or use with
414	* function chaining. (See User Guide for details.)
415	* @return UNormalizationCheckResult
416	* @stable ICU 4.4
417	*/
418	virtual UNormalizationCheckResult
419	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
420	/**
421	* Returns the end index of the leading normalized substring of the input string.
422	* For details see the Normalizer2 base class documentation.
423	* @param s input string
424	* @param errorCode Standard ICU error code. Its input value must
425	* pass the U_SUCCESS() test, or else the function returns
426	* immediately. Check for U_FAILURE() on output or use with
427	* function chaining. (See User Guide for details.)
428	* @return "yes" span end index
429	* @stable ICU 4.4
430	*/
431	virtual int32_t
432	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
433
434	/**
435	* Tests if the character always has a normalization boundary before it,
436	* regardless of context.
437	* For details see the Normalizer2 base class documentation.
438	* @param c character to test
439	* @return TRUE if c has a normalization boundary before it
440	* @stable ICU 4.4
441	*/
442	virtual UBool hasBoundaryBefore(UChar32 c) const;
443
444	/**
445	* Tests if the character always has a normalization boundary after it,
446	* regardless of context.
447	* For details see the Normalizer2 base class documentation.
448	* @param c character to test
449	* @return TRUE if c has a normalization boundary after it
450	* @stable ICU 4.4
451	*/
452	virtual UBool hasBoundaryAfter(UChar32 c) const;
453
454	/**
455	* Tests if the character is normalization-inert.
456	* For details see the Normalizer2 base class documentation.
457	* @param c character to test
458	* @return TRUE if c is normalization-inert
459	* @stable ICU 4.4
460	*/
461	virtual UBool isInert(UChar32 c) const;
462	private:  // internal helper overloads and the two aliased references; not part of the public API
463	UnicodeString &
464	normalize(const UnicodeString &src,
465	UnicodeString &dest,
466	USetSpanCondition spanCondition,
467	UErrorCode &errorCode) const;  // NOTE(review): presumably the worker behind the public normalize(), with spanCondition as the initial filter-span state -- confirm in the implementation file
468
469	UnicodeString &
470	normalizeSecondAndAppend(UnicodeString &first,
471	const UnicodeString &second,
472	UBool doNormalize,
473	UErrorCode &errorCode) const;  // NOTE(review): presumably shared by the public normalizeSecondAndAppend() and append(), selected via doNormalize -- confirm in the implementation file
474
475	const Normalizer2 &norm2;  // wrapped normalizer; aliased via the constructor, so it must outlive this object
476	const UnicodeSet &set;  // filter set; aliased via the constructor, so it must outlive this object
477	};
478
479	U_NAMESPACE_END
480
481	#endif // !UCONFIG_NO_NORMALIZATION
482	#endif // __NORMALIZER2_H__