git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/normalizer2.h

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9 A	3	/*
	4	*******************************************************************************
	5	*
51004dcb	6	* Copyright (C) 2009-2013, International Business Machines
729e4ab9 A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: normalizer2.h
f3c0d7a5	11	* encoding: UTF-8
729e4ab9 A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2009nov22
	16	* created by: Markus W. Scherer
	17	*/
	18
	19	#ifndef __NORMALIZER2_H__
	20	#define __NORMALIZER2_H__
	21
	22	/**
	23	* \file
	24	* \brief C++ API: New API for Unicode Normalization.
	25	*/
	26
	27	#include "unicode/utypes.h"
	28
340931cb A	29	#if U_SHOW_CPLUSPLUS_API
340931cb A	30
729e4ab9 A	31	#if !UCONFIG_NO_NORMALIZATION
729e4ab9 A	32
0f5d89e8	33	#include "unicode/stringpiece.h"
729e4ab9 A	34	#include "unicode/uniset.h"
	35	#include "unicode/unistr.h"
	36	#include "unicode/unorm2.h"
	37
	38	U_NAMESPACE_BEGIN
	39
0f5d89e8 A	40	class ByteSink;
0f5d89e8 A	41
729e4ab9 A	42	/**
	43	* Unicode normalization functionality for standard Unicode normalization or
	44	* for using custom mapping tables.
	45	* All instances of this class are unmodifiable/immutable.
	46	* Instances returned by getInstance() are singletons that must not be deleted by the caller.
	47	* The Normalizer2 class is not intended for public subclassing.
	48	*
	49	* The primary functions are to produce a normalized string and to detect whether
	50	* a string is already normalized.
	51	* The most commonly used normalization forms are those defined in
	52	* http://www.unicode.org/unicode/reports/tr15/
	53	* However, this API supports additional normalization forms for specialized purposes.
	54	* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
	55	* and can be used in implementations of UTS #46.
	56	*
	57	* Not only are the standard compose and decompose modes supplied,
	58	* but additional modes are provided as documented in the Mode enum.
	59	*
	60	* Some of the functions in this class identify normalization boundaries.
	61	* At a normalization boundary, the portions of the string
	62	* before it and starting from it do not interact and can be handled independently.
	63	*
	64	* The spanQuickCheckYes() stops at a normalization boundary.
	65	* When the goal is a normalized string, then the text before the boundary
	66	* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
	67	*
	68	* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
	69	* a character is guaranteed to be at a normalization boundary,
	70	* regardless of context.
	71	* This is used for moving from one normalization boundary to the next
	72	* or preceding boundary, and for performing iterative normalization.
	73	*
	74	* Iterative normalization is useful when only a small portion of a
	75	* longer string needs to be processed.
	76	* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
	77	* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
	78	* (to process only the substring for which sort key bytes are computed).
	79	*
	80	* The set of normalization boundaries returned by these functions may not be
	81	* complete: There may be more boundaries that could be returned.
	82	* Different functions may return different boundaries.
	83	* @stable ICU 4.4
	84	*/
	85	class U_COMMON_API Normalizer2 : public UObject {
	86	public:
4388f060 A	87	/**
	88	* Destructor.
	89	* @stable ICU 4.4
	90	*/
	91	~Normalizer2();
	92
4388f060 A	93	/**
	94	* Returns a Normalizer2 instance for Unicode NFC normalization.
	95	* Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
	96	* Returns an unmodifiable singleton instance. Do not delete it.
	97	* @param errorCode Standard ICU error code. Its input value must
	98	* pass the U_SUCCESS() test, or else the function returns
	99	* immediately. Check for U_FAILURE() on output or use with
	100	* function chaining. (See User Guide for details.)
	101	* @return the requested Normalizer2, if successful
51004dcb	102	* @stable ICU 49
4388f060 A	103	*/
	104	static const Normalizer2 *
	105	getNFCInstance(UErrorCode &errorCode);
	106
	107	/**
	108	* Returns a Normalizer2 instance for Unicode NFD normalization.
	109	* Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
	110	* Returns an unmodifiable singleton instance. Do not delete it.
	111	* @param errorCode Standard ICU error code. Its input value must
	112	* pass the U_SUCCESS() test, or else the function returns
	113	* immediately. Check for U_FAILURE() on output or use with
	114	* function chaining. (See User Guide for details.)
	115	* @return the requested Normalizer2, if successful
51004dcb	116	* @stable ICU 49
4388f060 A	117	*/
	118	static const Normalizer2 *
	119	getNFDInstance(UErrorCode &errorCode);
	120
	121	/**
	122	* Returns a Normalizer2 instance for Unicode NFKC normalization.
	123	* Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
	124	* Returns an unmodifiable singleton instance. Do not delete it.
	125	* @param errorCode Standard ICU error code. Its input value must
	126	* pass the U_SUCCESS() test, or else the function returns
	127	* immediately. Check for U_FAILURE() on output or use with
	128	* function chaining. (See User Guide for details.)
	129	* @return the requested Normalizer2, if successful
51004dcb	130	* @stable ICU 49
4388f060 A	131	*/
	132	static const Normalizer2 *
	133	getNFKCInstance(UErrorCode &errorCode);
	134
	135	/**
	136	* Returns a Normalizer2 instance for Unicode NFKD normalization.
	137	* Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
	138	* Returns an unmodifiable singleton instance. Do not delete it.
	139	* @param errorCode Standard ICU error code. Its input value must
	140	* pass the U_SUCCESS() test, or else the function returns
	141	* immediately. Check for U_FAILURE() on output or use with
	142	* function chaining. (See User Guide for details.)
	143	* @return the requested Normalizer2, if successful
51004dcb	144	* @stable ICU 49
4388f060 A	145	*/
	146	static const Normalizer2 *
	147	getNFKDInstance(UErrorCode &errorCode);
	148
	149	/**
	150	* Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
	151	* Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
	152	* Returns an unmodifiable singleton instance. Do not delete it.
	153	* @param errorCode Standard ICU error code. Its input value must
	154	* pass the U_SUCCESS() test, or else the function returns
	155	* immediately. Check for U_FAILURE() on output or use with
	156	* function chaining. (See User Guide for details.)
	157	* @return the requested Normalizer2, if successful
51004dcb	158	* @stable ICU 49
4388f060 A	159	*/
	160	static const Normalizer2 *
	161	getNFKCCasefoldInstance(UErrorCode &errorCode);
4388f060	162
729e4ab9 A	163	/**
	164	* Returns a Normalizer2 instance which uses the specified data file
	165	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
	166	* and which composes or decomposes text according to the specified mode.
	167	* Returns an unmodifiable singleton instance. Do not delete it.
	168	*
	169	* Use packageName=NULL for data files that are part of ICU's own data.
	170	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
	171	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
	172	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
	173	*
	174	* @param packageName NULL for ICU built-in data, otherwise application data package name
	175	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
	176	* @param mode normalization mode (compose or decompose etc.)
	177	* @param errorCode Standard ICU error code. Its input value must
	178	* pass the U_SUCCESS() test, or else the function returns
	179	* immediately. Check for U_FAILURE() on output or use with
	180	* function chaining. (See User Guide for details.)
	181	* @return the requested Normalizer2, if successful
	182	* @stable ICU 4.4
	183	*/
	184	static const Normalizer2 *
	185	getInstance(const char *packageName,
	186	const char *name,
	187	UNormalization2Mode mode,
	188	UErrorCode &errorCode);
	189
	190	/**
	191	* Returns the normalized form of the source string.
	192	* @param src source string
	193	* @param errorCode Standard ICU error code. Its input value must
	194	* pass the U_SUCCESS() test, or else the function returns
	195	* immediately. Check for U_FAILURE() on output or use with
	196	* function chaining. (See User Guide for details.)
	197	* @return normalized src
	198	* @stable ICU 4.4
	199	*/
	200	UnicodeString
	201	normalize(const UnicodeString &src, UErrorCode &errorCode) const {
	202	UnicodeString result;
	203	normalize(src, result, errorCode);
	204	return result;
	205	}
	206	/**
	207	* Writes the normalized form of the source string to the destination string
	208	* (replacing its contents) and returns the destination string.
	209	* The source and destination strings must be different objects.
	210	* @param src source string
	211	* @param dest destination string; its contents is replaced with normalized src
	212	* @param errorCode Standard ICU error code. Its input value must
	213	* pass the U_SUCCESS() test, or else the function returns
	214	* immediately. Check for U_FAILURE() on output or use with
	215	* function chaining. (See User Guide for details.)
	216	* @return dest
	217	* @stable ICU 4.4
	218	*/
	219	virtual UnicodeString &
	220	normalize(const UnicodeString &src,
	221	UnicodeString &dest,
	222	UErrorCode &errorCode) const = 0;
0f5d89e8 A	223
	224	/**
	225	* Normalizes a UTF-8 string and optionally records how source substrings
	226	* relate to changed and unchanged result substrings.
	227	*
	228	* Currently implemented completely only for "compose" modes,
	229	* such as for NFC, NFKC, and NFKC_Casefold
	230	* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
	231	* Otherwise currently converts to & from UTF-16 and does not support edits.
	232	*
	233	* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
	234	* @param src Source UTF-8 string.
	235	* @param sink A ByteSink to which the normalized UTF-8 result string is written.
	236	* sink.Flush() is called at the end.
	237	* @param edits Records edits for index mapping, working with styled text,
	238	* and getting only changes (if any).
	239	* The Edits contents is undefined if any error occurs.
	240	* This function calls edits->reset() first unless
	241	* options includes U_EDITS_NO_RESET. edits can be nullptr.
	242	* @param errorCode Standard ICU error code. Its input value must
	243	* pass the U_SUCCESS() test, or else the function returns
	244	* immediately. Check for U_FAILURE() on output or use with
	245	* function chaining. (See User Guide for details.)
3d1f044b	246	* @stable ICU 60
0f5d89e8 A	247	*/
	248	virtual void
	249	normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
	250	Edits *edits, UErrorCode &errorCode) const;
	251
729e4ab9 A	252	/**
	253	* Appends the normalized form of the second string to the first string
	254	* (merging them at the boundary) and returns the first string.
	255	* The result is normalized if the first string was normalized.
	256	* The first and second strings must be different objects.
	257	* @param first string, should be normalized
	258	* @param second string, will be normalized
	259	* @param errorCode Standard ICU error code. Its input value must
	260	* pass the U_SUCCESS() test, or else the function returns
	261	* immediately. Check for U_FAILURE() on output or use with
	262	* function chaining. (See User Guide for details.)
	263	* @return first
	264	* @stable ICU 4.4
	265	*/
	266	virtual UnicodeString &
	267	normalizeSecondAndAppend(UnicodeString &first,
	268	const UnicodeString &second,
	269	UErrorCode &errorCode) const = 0;
	270	/**
	271	* Appends the second string to the first string
	272	* (merging them at the boundary) and returns the first string.
	273	* The result is normalized if both the strings were normalized.
	274	* The first and second strings must be different objects.
	275	* @param first string, should be normalized
	276	* @param second string, should be normalized
	277	* @param errorCode Standard ICU error code. Its input value must
	278	* pass the U_SUCCESS() test, or else the function returns
	279	* immediately. Check for U_FAILURE() on output or use with
	280	* function chaining. (See User Guide for details.)
	281	* @return first
	282	* @stable ICU 4.4
	283	*/
	284	virtual UnicodeString &
	285	append(UnicodeString &first,
	286	const UnicodeString &second,
	287	UErrorCode &errorCode) const = 0;
	288
	289	/**
4388f060 A	290	* Gets the decomposition mapping of c.
	291	* Roughly equivalent to normalizing the String form of c
	292	* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
	293	* returns FALSE and does not write a string
	294	* if c does not have a decomposition mapping in this instance's data.
729e4ab9 A	295	* This function is independent of the mode of the Normalizer2.
	296	* @param c code point
	297	* @param decomposition String object which will be set to c's
	298	* decomposition mapping, if there is one.
	299	* @return TRUE if c has a decomposition, otherwise FALSE
4388f060	300	* @stable ICU 4.6
729e4ab9 A	301	*/
	302	virtual UBool
	303	getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
	304
4388f060 A	305	/**
	306	* Gets the raw decomposition mapping of c.
	307	*
	308	* This is similar to the getDecomposition() method but returns the
	309	* raw decomposition mapping as specified in UnicodeData.txt or
	310	* (for custom data) in the mapping files processed by the gennorm2 tool.
	311	* By contrast, getDecomposition() returns the processed,
	312	* recursively-decomposed version of this mapping.
	313	*
	314	* When used on a standard NFKC Normalizer2 instance,
	315	* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
	316	*
	317	* When used on a standard NFC Normalizer2 instance,
	318	* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
f3c0d7a5	319	* in this case, the result contains either one or two code points (=1..4 char16_ts).
4388f060 A	320	*
	321	* This function is independent of the mode of the Normalizer2.
	322	* The default implementation returns FALSE.
	323	* @param c code point
	324	* @param decomposition String object which will be set to c's
	325	* raw decomposition mapping, if there is one.
	326	* @return TRUE if c has a decomposition, otherwise FALSE
51004dcb	327	* @stable ICU 49
4388f060 A	328	*/
	329	virtual UBool
	330	getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
	331
	332	/**
	333	* Performs pairwise composition of a & b and returns the composite if there is one.
	334	*
	335	* Returns a composite code point c only if c has a two-way mapping to a+b.
	336	* In standard Unicode normalization, this means that
	337	* c has a canonical decomposition to a+b
	338	* and c does not have the Full_Composition_Exclusion property.
	339	*
	340	* This function is independent of the mode of the Normalizer2.
	341	* The default implementation returns a negative value.
	342	* @param a A (normalization starter) code point.
	343	* @param b Another code point.
	344	* @return The non-negative composite code point if there is one; otherwise a negative value.
51004dcb	345	* @stable ICU 49
4388f060 A	346	*/
	347	virtual UChar32
	348	composePair(UChar32 a, UChar32 b) const;
	349
	350	/**
	351	* Gets the combining class of c.
	352	* The default implementation returns 0
	353	* but all standard implementations return the Unicode Canonical_Combining_Class value.
	354	* @param c code point
	355	* @return c's combining class
51004dcb	356	* @stable ICU 49
4388f060 A	357	*/
	358	virtual uint8_t
	359	getCombiningClass(UChar32 c) const;
	360
729e4ab9 A	361	/**
	362	* Tests if the string is normalized.
	363	* Internally, in cases where the quickCheck() method would return "maybe"
	364	* (which is only possible for the two COMPOSE modes) this method
	365	* resolves to "yes" or "no" to provide a definitive result,
	366	* at the cost of doing more work in those cases.
	367	* @param s input string
	368	* @param errorCode Standard ICU error code. Its input value must
	369	* pass the U_SUCCESS() test, or else the function returns
	370	* immediately. Check for U_FAILURE() on output or use with
	371	* function chaining. (See User Guide for details.)
	372	* @return TRUE if s is normalized
	373	* @stable ICU 4.4
	374	*/
	375	virtual UBool
	376	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0f5d89e8 A	377	/**
	378	* Tests if the UTF-8 string is normalized.
	379	* Internally, in cases where the quickCheck() method would return "maybe"
	380	* (which is only possible for the two COMPOSE modes) this method
	381	* resolves to "yes" or "no" to provide a definitive result,
	382	* at the cost of doing more work in those cases.
	383	*
	384	* This works for all normalization modes,
	385	* but it is currently optimized for UTF-8 only for "compose" modes,
	386	* such as for NFC, NFKC, and NFKC_Casefold
	387	* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
	388	* For other modes it currently converts to UTF-16 and calls isNormalized().
	389	*
	390	* @param s UTF-8 input string
	391	* @param errorCode Standard ICU error code. Its input value must
	392	* pass the U_SUCCESS() test, or else the function returns
	393	* immediately. Check for U_FAILURE() on output or use with
	394	* function chaining. (See User Guide for details.)
	395	* @return TRUE if s is normalized
3d1f044b	396	* @stable ICU 60
0f5d89e8 A	397	*/
	398	virtual UBool
	399	isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
	400
729e4ab9 A	401
	402	/**
	403	* Tests if the string is normalized.
	404	* For the two COMPOSE modes, the result could be "maybe" in cases that
	405	* would take a little more work to resolve definitively.
	406	* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
	407	* combination of quick check + normalization, to avoid
	408	* re-checking the "yes" prefix.
	409	* @param s input string
	410	* @param errorCode Standard ICU error code. Its input value must
	411	* pass the U_SUCCESS() test, or else the function returns
	412	* immediately. Check for U_FAILURE() on output or use with
	413	* function chaining. (See User Guide for details.)
	414	* @return UNormalizationCheckResult
	415	* @stable ICU 4.4
	416	*/
	417	virtual UNormalizationCheckResult
	418	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
	419
	420	/**
	421	* Returns the end of the normalized substring of the input string.
	422	* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
	423	* the substring <code>UnicodeString(s, 0, end)</code>
	424	* will pass the quick check with a "yes" result.
	425	*
	426	* The returned end index is usually one or more characters before the
	427	* "no" or "maybe" character: The end index is at a normalization boundary.
	428	* (See the class documentation for more about normalization boundaries.)
	429	*
	430	* When the goal is a normalized string and most input strings are expected
	431	* to be normalized already, then call this method,
	432	* and if it returns a prefix shorter than the input string,
	433	* copy that prefix and use normalizeSecondAndAppend() for the remainder.
	434	* @param s input string
	435	* @param errorCode Standard ICU error code. Its input value must
	436	* pass the U_SUCCESS() test, or else the function returns
	437	* immediately. Check for U_FAILURE() on output or use with
	438	* function chaining. (See User Guide for details.)
	439	* @return "yes" span end index
	440	* @stable ICU 4.4
	441	*/
	442	virtual int32_t
	443	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
	444
	445	/**
	446	* Tests if the character always has a normalization boundary before it,
	447	* regardless of context.
	448	* If true, then the character does not normalization-interact with
	449	* preceding characters.
	450	* In other words, a string containing this character can be normalized
	451	* by processing portions before this character and starting from this
	452	* character independently.
	453	* This is used for iterative normalization. See the class documentation for details.
	454	* @param c character to test
	455	* @return TRUE if c has a normalization boundary before it
	456	* @stable ICU 4.4
	457	*/
	458	virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
	459
	460	/**
	461	* Tests if the character always has a normalization boundary after it,
	462	* regardless of context.
	463	* If true, then the character does not normalization-interact with
	464	* following characters.
465	* In other words, a string containing this character can be normalized
466	* by processing portions up to this character and after this
467	* character independently.
468	* This is used for iterative normalization. See the class documentation for details.
469	* Note that this operation may be significantly slower than hasBoundaryBefore().
470	* @param c character to test
471	* @return TRUE if c has a normalization boundary after it
472	* @stable ICU 4.4
473	*/
474	virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
475
476	/**
477	* Tests if the character is normalization-inert.
478	* If true, then the character does not change, nor normalization-interact with
479	* preceding or following characters.
480	* In other words, a string containing this character can be normalized
481	* by processing portions before this character and after this
482	* character independently.
483	* This is used for iterative normalization. See the class documentation for details.
484	* Note that this operation may be significantly slower than hasBoundaryBefore().
485	* @param c character to test
486	* @return TRUE if c is normalization-inert
487	* @stable ICU 4.4
488	*/
489	virtual UBool isInert(UChar32 c) const = 0;
729e4ab9 A	490	};
	491
	492	/**
	493	* Normalization filtered by a UnicodeSet.
	494	* Normalizes portions of the text contained in the filter set and leaves
	495	* portions not contained in the filter set unchanged.
	496	* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
	497	* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
	498	* This class implements all of (and only) the Normalizer2 API.
	499	* An instance of this class is unmodifiable/immutable but is constructed and
	500	* must be destructed by the owner.
	501	* @stable ICU 4.4
	502	*/
	503	class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
	504	public:
	505	/**
	506	* Constructs a filtered normalizer wrapping any Normalizer2 instance
	507	* and a filter set.
	508	* Both are aliased and must not be modified or deleted while this object
	509	* is used.
	510	* The filter set should be frozen; otherwise the performance will suffer greatly.
	511	* @param n2 wrapped Normalizer2 instance
	512	* @param filterSet UnicodeSet which determines the characters to be normalized
	513	* @stable ICU 4.4
	514	*/
	515	FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
	516	norm2(n2), set(filterSet) {}
	517
4388f060 A	518	/**
	519	* Destructor.
	520	* @stable ICU 4.4
	521	*/
	522	~FilteredNormalizer2();
	523
729e4ab9 A	524	/**
	525	* Writes the normalized form of the source string to the destination string
	526	* (replacing its contents) and returns the destination string.
	527	* The source and destination strings must be different objects.
	528	* @param src source string
	529	* @param dest destination string; its contents is replaced with normalized src
	530	* @param errorCode Standard ICU error code. Its input value must
	531	* pass the U_SUCCESS() test, or else the function returns
	532	* immediately. Check for U_FAILURE() on output or use with
	533	* function chaining. (See User Guide for details.)
	534	* @return dest
	535	* @stable ICU 4.4
	536	*/
	537	virtual UnicodeString &
	538	normalize(const UnicodeString &src,
	539	UnicodeString &dest,
0f5d89e8 A	540	UErrorCode &errorCode) const U_OVERRIDE;
	541
	542	/**
	543	* Normalizes a UTF-8 string and optionally records how source substrings
	544	* relate to changed and unchanged result substrings.
	545	*
	546	* Currently implemented completely only for "compose" modes,
	547	* such as for NFC, NFKC, and NFKC_Casefold
	548	* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
	549	* Otherwise currently converts to & from UTF-16 and does not support edits.
	550	*
	551	* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
	552	* @param src Source UTF-8 string.
	553	* @param sink A ByteSink to which the normalized UTF-8 result string is written.
	554	* sink.Flush() is called at the end.
	555	* @param edits Records edits for index mapping, working with styled text,
	556	* and getting only changes (if any).
	557	* The Edits contents is undefined if any error occurs.
	558	* This function calls edits->reset() first unless
	559	* options includes U_EDITS_NO_RESET. edits can be nullptr.
	560	* @param errorCode Standard ICU error code. Its input value must
	561	* pass the U_SUCCESS() test, or else the function returns
	562	* immediately. Check for U_FAILURE() on output or use with
	563	* function chaining. (See User Guide for details.)
3d1f044b	564	* @stable ICU 60
0f5d89e8 A	565	*/
	566	virtual void
	567	normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
	568	Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
	569
729e4ab9 A	570	/**
	571	* Appends the normalized form of the second string to the first string
	572	* (merging them at the boundary) and returns the first string.
	573	* The result is normalized if the first string was normalized.
	574	* The first and second strings must be different objects.
	575	* @param first string, should be normalized
	576	* @param second string, will be normalized
	577	* @param errorCode Standard ICU error code. Its input value must
	578	* pass the U_SUCCESS() test, or else the function returns
	579	* immediately. Check for U_FAILURE() on output or use with
	580	* function chaining. (See User Guide for details.)
	581	* @return first
	582	* @stable ICU 4.4
	583	*/
	584	virtual UnicodeString &
	585	normalizeSecondAndAppend(UnicodeString &first,
	586	const UnicodeString &second,
0f5d89e8	587	UErrorCode &errorCode) const U_OVERRIDE;
729e4ab9 A	588	/**
	589	* Appends the second string to the first string
	590	* (merging them at the boundary) and returns the first string.
	591	* The result is normalized if both the strings were normalized.
	592	* The first and second strings must be different objects.
	593	* @param first string, should be normalized
	594	* @param second string, should be normalized
	595	* @param errorCode Standard ICU error code. Its input value must
	596	* pass the U_SUCCESS() test, or else the function returns
	597	* immediately. Check for U_FAILURE() on output or use with
	598	* function chaining. (See User Guide for details.)
	599	* @return first
	600	* @stable ICU 4.4
	601	*/
	602	virtual UnicodeString &
	603	append(UnicodeString &first,
	604	const UnicodeString &second,
0f5d89e8	605	UErrorCode &errorCode) const U_OVERRIDE;
729e4ab9 A	606
729e4ab9 A	607	/**
4388f060 A	608	* Gets the decomposition mapping of c.
	609	* For details see the base class documentation.
	610	*
729e4ab9 A	611	* This function is independent of the mode of the Normalizer2.
	612	* @param c code point
	613	* @param decomposition String object which will be set to c's
	614	* decomposition mapping, if there is one.
	615	* @return TRUE if c has a decomposition, otherwise FALSE
4388f060	616	* @stable ICU 4.6
729e4ab9 A	617	*/
729e4ab9 A	618	virtual UBool
0f5d89e8	619	getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
729e4ab9	620
4388f060 A	621	/**
	622	* Gets the raw decomposition mapping of c.
	623	* For details see the base class documentation.
	624	*
	625	* This function is independent of the mode of the Normalizer2.
	626	* @param c code point
	627	* @param decomposition String object which will be set to c's
	628	* raw decomposition mapping, if there is one.
	629	* @return TRUE if c has a decomposition, otherwise FALSE
51004dcb	630	* @stable ICU 49
4388f060 A	631	*/
4388f060 A	632	virtual UBool
0f5d89e8	633	getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
4388f060 A	634
	635	/**
	636	* Performs pairwise composition of a & b and returns the composite if there is one.
	637	* For details see the base class documentation.
	638	*
	639	* This function is independent of the mode of the Normalizer2.
	640	* @param a A (normalization starter) code point.
	641	* @param b Another code point.
	642	* @return The non-negative composite code point if there is one; otherwise a negative value.
51004dcb	643	* @stable ICU 49
4388f060 A	644	*/
4388f060 A	645	virtual UChar32
0f5d89e8	646	composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
4388f060 A	647
	648	/**
	649	* Gets the combining class of c.
	650	* The default implementation returns 0
	651	* but all standard implementations return the Unicode Canonical_Combining_Class value.
	652	* @param c code point
	653	* @return c's combining class
51004dcb	654	* @stable ICU 49
4388f060 A	655	*/
4388f060 A	656	virtual uint8_t
0f5d89e8	657	getCombiningClass(UChar32 c) const U_OVERRIDE;
4388f060	658
729e4ab9 A	659	/**
	660	* Tests if the string is normalized.
	661	* For details see the Normalizer2 base class documentation.
	662	* @param s input string
	663	* @param errorCode Standard ICU error code. Its input value must
	664	* pass the U_SUCCESS() test, or else the function returns
	665	* immediately. Check for U_FAILURE() on output or use with
	666	* function chaining. (See User Guide for details.)
	667	* @return TRUE if s is normalized
	668	* @stable ICU 4.4
	669	*/
	670	virtual UBool
0f5d89e8 A	671	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
	672	/**
	673	* Tests if the UTF-8 string is normalized.
	674	* Internally, in cases where the quickCheck() method would return "maybe"
	675	* (which is only possible for the two COMPOSE modes) this method
	676	* resolves to "yes" or "no" to provide a definitive result,
	677	* at the cost of doing more work in those cases.
	678	*
	679	* This works for all normalization modes,
	680	* but it is currently optimized for UTF-8 only for "compose" modes,
	681	* such as for NFC, NFKC, and NFKC_Casefold
	682	* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
	683	* For other modes it currently converts to UTF-16 and calls isNormalized().
	684	*
	685	* @param s UTF-8 input string
	686	* @param errorCode Standard ICU error code. Its input value must
	687	* pass the U_SUCCESS() test, or else the function returns
	688	* immediately. Check for U_FAILURE() on output or use with
	689	* function chaining. (See User Guide for details.)
	690	* @return TRUE if s is normalized
3d1f044b	691	* @stable ICU 60
0f5d89e8 A	692	*/
	693	virtual UBool
	694	isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
729e4ab9 A	695	/**
	696	* Tests if the string is normalized.
	697	* For details see the Normalizer2 base class documentation.
	698	* @param s input string
	699	* @param errorCode Standard ICU error code. Its input value must
	700	* pass the U_SUCCESS() test, or else the function returns
	701	* immediately. Check for U_FAILURE() on output or use with
	702	* function chaining. (See User Guide for details.)
	703	* @return UNormalizationCheckResult
	704	* @stable ICU 4.4
	705	*/
	706	virtual UNormalizationCheckResult
0f5d89e8	707	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
729e4ab9 A	708	/**
	709	* Returns the end of the normalized substring of the input string.
	710	* For details see the Normalizer2 base class documentation.
	711	* @param s input string
	712	* @param errorCode Standard ICU error code. Its input value must
	713	* pass the U_SUCCESS() test, or else the function returns
	714	* immediately. Check for U_FAILURE() on output or use with
	715	* function chaining. (See User Guide for details.)
	716	* @return "yes" span end index
	717	* @stable ICU 4.4
	718	*/
	719	virtual int32_t
0f5d89e8	720	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
729e4ab9 A	721
	722	/**
	723	* Tests if the character always has a normalization boundary before it,
	724	* regardless of context.
	725	* For details see the Normalizer2 base class documentation.
	726	* @param c character to test
	727	* @return TRUE if c has a normalization boundary before it
	728	* @stable ICU 4.4
	729	*/
0f5d89e8	730	virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
729e4ab9 A	731
	732	/**
	733	* Tests if the character always has a normalization boundary after it,
	734	* regardless of context.
	735	* For details see the Normalizer2 base class documentation.
	736	* @param c character to test
	737	* @return TRUE if c has a normalization boundary after it
	738	* @stable ICU 4.4
	739	*/
0f5d89e8	740	virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
729e4ab9 A	741
	742	/**
	743	* Tests if the character is normalization-inert.
	744	* For details see the Normalizer2 base class documentation.
	745	* @param c character to test
	746	* @return TRUE if c is normalization-inert
	747	* @stable ICU 4.4
	748	*/
0f5d89e8	749	virtual UBool isInert(UChar32 c) const U_OVERRIDE;
729e4ab9 A	750	private:
	751	UnicodeString &
	752	normalize(const UnicodeString &src,
	753	UnicodeString &dest,
	754	USetSpanCondition spanCondition,
	755	UErrorCode &errorCode) const;
	756
0f5d89e8 A	757	void
	758	normalizeUTF8(uint32_t options, const char *src, int32_t length,
	759	ByteSink &sink, Edits *edits,
	760	USetSpanCondition spanCondition,
	761	UErrorCode &errorCode) const;
	762
729e4ab9 A	763	UnicodeString &
	764	normalizeSecondAndAppend(UnicodeString &first,
	765	const UnicodeString &second,
	766	UBool doNormalize,
	767	UErrorCode &errorCode) const;
	768
	769	const Normalizer2 &norm2;
	770	const UnicodeSet &set;
	771	};
	772
	773	U_NAMESPACE_END
	774
	775	#endif // !UCONFIG_NO_NORMALIZATION
340931cb A	776
	777	#endif /* U_SHOW_CPLUSPLUS_API */
	778
729e4ab9	779	#endif // __NORMALIZER2_H__