git.saurik.com Git - apple/javascriptcore.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2009-2010, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: unorm2.h
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2009dec15
	14	* created by: Markus W. Scherer
	15	*/
	16
	17	#ifndef __UNORM2_H__
	18	#define __UNORM2_H__
	19
	20	/**
	21	* \file
	22	* \brief C API: New API for Unicode Normalization.
	23	*
	24	* Unicode normalization functionality for standard Unicode normalization or
	25	* for using custom mapping tables.
	26	* All instances of UNormalizer2 are unmodifiable/immutable.
	27	* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
	28	* For more details see the Normalizer2 C++ class.
	29	*/
	30
	31	#include "unicode/utypes.h"
	32	#include "unicode/localpointer.h"
	33	#include "unicode/uset.h"
	34
	35	/**
	36	* Constants for normalization modes.
	37	* For details about standard Unicode normalization forms
	38	* and about the algorithms which are also used with custom mapping tables
	39	* see http://www.unicode.org/unicode/reports/tr15/
	40	* @stable ICU 4.4
	41	*/
	42	typedef enum {
	43	/**
	44	* Decomposition followed by composition.
	45	* Same as standard NFC when using an "nfc" instance.
	46	* Same as standard NFKC when using an "nfkc" instance.
	47	* For details about standard Unicode normalization forms
	48	* see http://www.unicode.org/unicode/reports/tr15/
	49	* @stable ICU 4.4
	50	*/
	51	UNORM2_COMPOSE,
	52	/**
	53	* Map, and reorder canonically.
	54	* Same as standard NFD when using an "nfc" instance.
	55	* Same as standard NFKD when using an "nfkc" instance.
	56	* For details about standard Unicode normalization forms
	57	* see http://www.unicode.org/unicode/reports/tr15/
	58	* @stable ICU 4.4
	59	*/
	60	UNORM2_DECOMPOSE,
	61	/**
	62	* "Fast C or D" form.
	63	* If a string is in this form, then further decomposition <i>without reordering</i>
	64	* would yield the same form as DECOMPOSE.
	65	* Text in "Fast C or D" form can be processed efficiently with data tables
	66	* that are "canonically closed", that is, that provide equivalent data for
	67	* equivalent text, without having to be fully normalized.
	68	* Not a standard Unicode normalization form.
	69	* Not a unique form: Different FCD strings can be canonically equivalent.
	70	* For details see http://www.unicode.org/notes/tn5/#FCD
	71	* @stable ICU 4.4
	72	*/
	73	UNORM2_FCD,
	74	/**
	75	* Compose only contiguously.
	76	* Also known as "FCC" or "Fast C Contiguous".
	77	* The result will often but not always be in NFC.
	78	* The result will conform to FCD which is useful for processing.
	79	* Not a standard Unicode normalization form.
	80	* For details see http://www.unicode.org/notes/tn5/#FCC
	81	* @stable ICU 4.4
	82	*/
	83	UNORM2_COMPOSE_CONTIGUOUS
	84	} UNormalization2Mode;
	85
	86	/**
	87	* Result values for normalization quick check functions.
	88	* For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
	89	* @stable ICU 2.0
	90	*/
	91	typedef enum UNormalizationCheckResult {
	92	/**
	93	* The input string is not in the normalization form.
	94	* @stable ICU 2.0
	95	*/
	96	UNORM_NO,
	97	/**
	98	* The input string is in the normalization form.
	99	* @stable ICU 2.0
	100	*/
	101	UNORM_YES,
	102	/**
	103	* The input string may or may not be in the normalization form.
	104	* This value is only returned for composition forms like NFC and FCC,
	105	* when a backward-combining character is found for which the surrounding text
	106	* would have to be analyzed further.
	107	* @stable ICU 2.0
	108	*/
	109	UNORM_MAYBE
	110	} UNormalizationCheckResult;
	111
	112	/**
	113	* Opaque C service object type for the new normalization API.
	114	* @stable ICU 4.4
	115	*/
	116	struct UNormalizer2;
	117	typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */
	118
	119	#if !UCONFIG_NO_NORMALIZATION
	120
	121	/**
	122	* Returns a UNormalizer2 instance which uses the specified data file
	123	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
	124	* and which composes or decomposes text according to the specified mode.
	125	* Returns an unmodifiable singleton instance. Do not delete it.
	126	*
	127	* Use packageName=NULL for data files that are part of ICU's own data.
	128	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
	129	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
	130	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
	131	*
	132	* @param packageName NULL for ICU built-in data, otherwise application data package name
	133	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
	134	* @param mode normalization mode (compose or decompose etc.)
	135	* @param pErrorCode Standard ICU error code. Its input value must
	136	* pass the U_SUCCESS() test, or else the function returns
	137	* immediately. Check for U_FAILURE() on output or use with
	138	* function chaining. (See User Guide for details.)
	139	* @return the requested UNormalizer2, if successful
	140	* @stable ICU 4.4
	141	*/
	142	U_STABLE const UNormalizer2 * U_EXPORT2
	143	unorm2_getInstance(const char *packageName,
	144	const char *name,
	145	UNormalization2Mode mode,
	146	UErrorCode *pErrorCode);
	147
	148	/**
	149	* Constructs a filtered normalizer wrapping any UNormalizer2 instance
	150	* and a filter set.
	151	* Both are aliased and must not be modified or deleted while this object
	152	* is used.
	153	* The filter set should be frozen; otherwise the performance will suffer greatly.
	154	* @param norm2 wrapped UNormalizer2 instance
	155	* @param filterSet USet which determines the characters to be normalized
	156	* @param pErrorCode Standard ICU error code. Its input value must
	157	* pass the U_SUCCESS() test, or else the function returns
	158	* immediately. Check for U_FAILURE() on output or use with
	159	* function chaining. (See User Guide for details.)
	160	* @return the requested UNormalizer2, if successful
	161	* @stable ICU 4.4
	162	*/
	163	U_STABLE UNormalizer2 * U_EXPORT2
	164	unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);
	165
	166	/**
	167	* Closes a UNormalizer2 instance from unorm2_openFiltered().
	168	* Do not close instances from unorm2_getInstance()!
	169	* @param norm2 UNormalizer2 instance to be closed
	170	* @stable ICU 4.4
	171	*/
	172	U_STABLE void U_EXPORT2
	173	unorm2_close(UNormalizer2 *norm2);
	174
	175	#if U_SHOW_CPLUSPLUS_API
	176
	177	U_NAMESPACE_BEGIN
	178
	179	/**
	180	* \class LocalUNormalizer2Pointer
	181	* "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
	182	* For most methods see the LocalPointerBase base class.
	183	*
	184	* @see LocalPointerBase
	185	* @see LocalPointer
	186	* @stable ICU 4.4
	187	*/
	188	U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
	189
	190	U_NAMESPACE_END
	191
	192	#endif
	193
	194	/**
	195	* Writes the normalized form of the source string to the destination string
	196	* (replacing its contents) and returns the length of the destination string.
	197	* The source and destination strings must be different buffers.
	198	* @param norm2 UNormalizer2 instance
	199	* @param src source string
	200	* @param length length of the source string, or -1 if NUL-terminated
	201	* @param dest destination string; its contents are replaced with normalized src
	202	* @param capacity number of UChars that can be written to dest
	203	* @param pErrorCode Standard ICU error code. Its input value must
	204	* pass the U_SUCCESS() test, or else the function returns
	205	* immediately. Check for U_FAILURE() on output or use with
	206	* function chaining. (See User Guide for details.)
	207	* @return the length of the destination string
	208	* @stable ICU 4.4
	209	*/
	210	U_STABLE int32_t U_EXPORT2
	211	unorm2_normalize(const UNormalizer2 *norm2,
	212	const UChar *src, int32_t length,
	213	UChar *dest, int32_t capacity,
	214	UErrorCode *pErrorCode);
	215	/**
	216	* Appends the normalized form of the second string to the first string
	217	* (merging them at the boundary) and returns the length of the first string.
	218	* The result is normalized if the first string was normalized.
	219	* The first and second strings must be different buffers.
	220	* @param norm2 UNormalizer2 instance
	221	* @param first string, should be normalized
	222	* @param firstLength length of the first string, or -1 if NUL-terminated
	223	* @param firstCapacity number of UChars that can be written to first
	224	* @param second string, will be normalized
	225	* @param secondLength length of the second string, or -1 if NUL-terminated
	226	* @param pErrorCode Standard ICU error code. Its input value must
	227	* pass the U_SUCCESS() test, or else the function returns
	228	* immediately. Check for U_FAILURE() on output or use with
	229	* function chaining. (See User Guide for details.)
	230	* @return the length of the first string
	231	* @stable ICU 4.4
	232	*/
	233	U_STABLE int32_t U_EXPORT2
	234	unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
	235	UChar *first, int32_t firstLength, int32_t firstCapacity,
	236	const UChar *second, int32_t secondLength,
	237	UErrorCode *pErrorCode);
	238	/**
	239	* Appends the second string to the first string
	240	* (merging them at the boundary) and returns the length of the first string.
	241	* The result is normalized if both the strings were normalized.
	242	* The first and second strings must be different buffers.
	243	* @param norm2 UNormalizer2 instance
	244	* @param first string, should be normalized
	245	* @param firstLength length of the first string, or -1 if NUL-terminated
	246	* @param firstCapacity number of UChars that can be written to first
	247	* @param second string, should be normalized
	248	* @param secondLength length of the second string, or -1 if NUL-terminated
	249	* @param pErrorCode Standard ICU error code. Its input value must
	250	* pass the U_SUCCESS() test, or else the function returns
	251	* immediately. Check for U_FAILURE() on output or use with
	252	* function chaining. (See User Guide for details.)
	253	* @return the length of the first string
	254	* @stable ICU 4.4
	255	*/
	256	U_STABLE int32_t U_EXPORT2
	257	unorm2_append(const UNormalizer2 *norm2,
	258	UChar *first, int32_t firstLength, int32_t firstCapacity,
	259	const UChar *second, int32_t secondLength,
	260	UErrorCode *pErrorCode);
	261
	262	/**
	263	* Gets the decomposition mapping of c. Equivalent to unorm2_normalize(string(c))
	264	* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster.
	265	* This function is independent of the mode of the UNormalizer2.
	266	* @param norm2 UNormalizer2 instance
	267	* @param c code point
	268	* @param decomposition String buffer which will be set to c's
	269	* decomposition mapping, if there is one.
	270	* @param capacity number of UChars that can be written to decomposition
	271	* @param pErrorCode Standard ICU error code. Its input value must
	272	* pass the U_SUCCESS() test, or else the function returns
	273	* immediately. Check for U_FAILURE() on output or use with
	274	* function chaining. (See User Guide for details.)
	275	* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
	276	* @draft ICU 4.6
	277	*/
	278	U_DRAFT int32_t U_EXPORT2
	279	unorm2_getDecomposition(const UNormalizer2 *norm2,
	280	UChar32 c, UChar *decomposition, int32_t capacity,
	281	UErrorCode *pErrorCode);
	282
	283	/**
	284	* Tests if the string is normalized.
	285	* Internally, in cases where the quickCheck() method would return "maybe"
	286	* (which is only possible for the two COMPOSE modes) this method
	287	* resolves to "yes" or "no" to provide a definitive result,
	288	* at the cost of doing more work in those cases.
	289	* @param norm2 UNormalizer2 instance
	290	* @param s input string
	291	* @param length length of the string, or -1 if NUL-terminated
	292	* @param pErrorCode Standard ICU error code. Its input value must
	293	* pass the U_SUCCESS() test, or else the function returns
	294	* immediately. Check for U_FAILURE() on output or use with
	295	* function chaining. (See User Guide for details.)
	296	* @return TRUE if s is normalized
	297	* @stable ICU 4.4
	298	*/
	299	U_STABLE UBool U_EXPORT2
	300	unorm2_isNormalized(const UNormalizer2 *norm2,
	301	const UChar *s, int32_t length,
	302	UErrorCode *pErrorCode);
	303
	304	/**
	305	* Tests if the string is normalized.
	306	* For the two COMPOSE modes, the result could be "maybe" in cases that
	307	* would take a little more work to resolve definitively.
	308	* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
	309	* combination of quick check + normalization, to avoid
	310	* re-checking the "yes" prefix.
	311	* @param norm2 UNormalizer2 instance
	312	* @param s input string
	313	* @param length length of the string, or -1 if NUL-terminated
	314	* @param pErrorCode Standard ICU error code. Its input value must
	315	* pass the U_SUCCESS() test, or else the function returns
	316	* immediately. Check for U_FAILURE() on output or use with
	317	* function chaining. (See User Guide for details.)
	318	* @return UNormalizationCheckResult
	319	* @stable ICU 4.4
	320	*/
	321	U_STABLE UNormalizationCheckResult U_EXPORT2
	322	unorm2_quickCheck(const UNormalizer2 *norm2,
	323	const UChar *s, int32_t length,
	324	UErrorCode *pErrorCode);
	325
	326	/**
	327	* Returns the end of the normalized substring of the input string.
	328	* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
	329	* the substring <code>UnicodeString(s, 0, end)</code>
	330	* will pass the quick check with a "yes" result.
	331	*
	332	* The returned end index is usually one or more characters before the
	333	* "no" or "maybe" character: The end index is at a normalization boundary.
	334	* (See the class documentation for more about normalization boundaries.)
	335	*
	336	* When the goal is a normalized string and most input strings are expected
	337	* to be normalized already, then call this method,
	338	* and if it returns a prefix shorter than the input string,
	339	* copy that prefix and use normalizeSecondAndAppend() for the remainder.
	340	* @param norm2 UNormalizer2 instance
	341	* @param s input string
	342	* @param length length of the string, or -1 if NUL-terminated
	343	* @param pErrorCode Standard ICU error code. Its input value must
	344	* pass the U_SUCCESS() test, or else the function returns
	345	* immediately. Check for U_FAILURE() on output or use with
	346	* function chaining. (See User Guide for details.)
	347	* @return "yes" span end index
	348	* @stable ICU 4.4
	349	*/
	350	U_STABLE int32_t U_EXPORT2
	351	unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
	352	const UChar *s, int32_t length,
	353	UErrorCode *pErrorCode);
	354
	355	/**
	356	* Tests if the character always has a normalization boundary before it,
	357	* regardless of context.
	358	* For details see the Normalizer2 base class documentation.
	359	* @param norm2 UNormalizer2 instance
	360	* @param c character to test
	361	* @return TRUE if c has a normalization boundary before it
	362	* @stable ICU 4.4
	363	*/
	364	U_STABLE UBool U_EXPORT2
	365	unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
	366
	367	/**
	368	* Tests if the character always has a normalization boundary after it,
	369	* regardless of context.
	370	* For details see the Normalizer2 base class documentation.
	371	* @param norm2 UNormalizer2 instance
	372	* @param c character to test
	373	* @return TRUE if c has a normalization boundary after it
	374	* @stable ICU 4.4
	375	*/
	376	U_STABLE UBool U_EXPORT2
	377	unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
	378
	379	/**
	380	* Tests if the character is normalization-inert.
	381	* For details see the Normalizer2 base class documentation.
	382	* @param norm2 UNormalizer2 instance
	383	* @param c character to test
	384	* @return TRUE if c is normalization-inert
	385	* @stable ICU 4.4
	386	*/
	387	U_STABLE UBool U_EXPORT2
	388	unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
	389
	390	#endif /* !UCONFIG_NO_NORMALIZATION */
	391	#endif /* __UNORM2_H__ */