git.saurik.com Git - apple/javascriptcore.git/blame

Commit	Line	Data
93a37866 A	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 2009-2010, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: unorm2.h
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2009dec15
	14	* created by: Markus W. Scherer
	15	*/
	16
	17	#ifndef __UNORM2_H__
	18	#define __UNORM2_H__
	19
	20	/**
	21	* \file
	22	* \brief C API: New API for Unicode Normalization.
	23	*
	24	* Unicode normalization functionality for standard Unicode normalization or
	25	* for using custom mapping tables.
	26	* All instances of UNormalizer2 are unmodifiable/immutable.
	27	* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
	28	* For more details see the Normalizer2 C++ class.
	29	*/
	30
	31	#include "unicode/utypes.h"
	32	#include "unicode/localpointer.h"
	33	#include "unicode/uset.h"
	34
	35	/**
	36	* Constants for normalization modes, passed as the mode argument of unorm2_getInstance().
	37	* For details about standard Unicode normalization forms
	38	* and about the algorithms which are also used with custom mapping tables
	39	* see http://www.unicode.org/reports/tr15/
	40	* @stable ICU 4.4
	41	*/
	42	typedef enum {
	43	/**
	44	* Decomposition followed by composition.
	45	* Same as standard NFC when using an "nfc" instance.
	46	* Same as standard NFKC when using an "nfkc" instance.
	47	* For details about standard Unicode normalization forms
	48	* see http://www.unicode.org/reports/tr15/
	49	* @stable ICU 4.4
	50	*/
	51	UNORM2_COMPOSE,
	52	/**
	53	* Map, and reorder canonically.
	54	* Same as standard NFD when using an "nfc" instance.
	55	* Same as standard NFKD when using an "nfkc" instance.
	56	* For details about standard Unicode normalization forms
	57	* see http://www.unicode.org/reports/tr15/
	58	* @stable ICU 4.4
	59	*/
	60	UNORM2_DECOMPOSE,
	61	/**
	62	* "Fast C or D" form.
	63	* If a string is in this form, then further decomposition <i>without reordering</i>
	64	* would yield the same form as UNORM2_DECOMPOSE.
65	* Text in "Fast C or D" form can be processed efficiently with data tables
66	* that are "canonically closed", that is, that provide equivalent data for
67	* equivalent text, without having to be fully normalized.
68	* Not a standard Unicode normalization form.
69	* Not a unique form: Different FCD strings can be canonically equivalent.
70	* For details see http://www.unicode.org/notes/tn5/#FCD
71	* @stable ICU 4.4
72	*/
	73	UNORM2_FCD,
	74	/**
	75	* Compose only contiguously.
	76	* Also known as "FCC" or "Fast C Contiguous".
	77	* The result will often but not always be in NFC.
	78	* The result will conform to FCD, which is useful for processing.
	79	* Not a standard Unicode normalization form.
	80	* For details see http://www.unicode.org/notes/tn5/#FCC
	81	* @stable ICU 4.4
	82	*/
	83	UNORM2_COMPOSE_CONTIGUOUS
	84	} UNormalization2Mode;
85
	86	/**
	87	* Result values for normalization quick check functions such as unorm2_quickCheck().
	88	* For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
	89	* @stable ICU 2.0
	90	*/
	91	typedef enum UNormalizationCheckResult {
	92	/**
	93	* The input string is not in the normalization form.
	94	* @stable ICU 2.0
	95	*/
	96	UNORM_NO,
	97	/**
	98	* The input string is in the normalization form.
	99	* @stable ICU 2.0
	100	*/
	101	UNORM_YES,
	102	/**
	103	* The input string may or may not be in the normalization form.
	104	* This value is only returned for composition forms like NFC and FCC,
	105	* when a backward-combining character is found for which the surrounding text
	106	* would have to be analyzed further to reach a definitive answer.
	107	* @stable ICU 2.0
	108	*/
	109	UNORM_MAYBE
	110	} UNormalizationCheckResult;
111
112	/**
113	* Opaque C service object type for the new normalization API.
114	* @stable ICU 4.4
115	*/
116	struct UNormalizer2;
117	typedef struct UNormalizer2 UNormalizer2; /*< C typedef for struct UNormalizer2. @stable ICU 4.4 /
118
119	#if !UCONFIG_NO_NORMALIZATION
120
	121	/**
	122	* Returns a UNormalizer2 instance which uses the specified data file
	123	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
	124	* and which composes or decomposes text according to the specified mode.
	125	* Returns an unmodifiable singleton instance. Do not delete it and do not pass it to unorm2_close().
	126	*
	127	* Use packageName=NULL for data files that are part of ICU's own data.
	128	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
	129	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
	130	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
	131	*
	132	* @param packageName NULL for ICU built-in data, otherwise application data package name
	133	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
	134	* @param mode normalization mode (a UNormalization2Mode constant: compose or decompose etc.)
	135	* @param pErrorCode Standard ICU error code. Its input value must
	136	* pass the U_SUCCESS() test, or else the function returns
	137	* immediately. Check for U_FAILURE() on output or use with
	138	* function chaining. (See User Guide for details.)
	139	* @return the requested UNormalizer2 singleton, if successful
	140	* @stable ICU 4.4
	141	*/
	142	U_STABLE const UNormalizer2 * U_EXPORT2
	143	unorm2_getInstance(const char *packageName,
	144	const char *name,
	145	UNormalization2Mode mode,
	146	UErrorCode *pErrorCode);
147
148	/**
149	* Constructs a filtered normalizer wrapping any UNormalizer2 instance
150	* and a filter set.
151	* Both are aliased and must not be modified or deleted while this object
152	* is used.
153	* The filter set should be frozen; otherwise the performance will suffer greatly.
154	* @param norm2 wrapped UNormalizer2 instance
155	* @param filterSet USet which determines the characters to be normalized
156	* @param pErrorCode Standard ICU error code. Its input value must
157	* pass the U_SUCCESS() test, or else the function returns
158	* immediately. Check for U_FAILURE() on output or use with
159	* function chaining. (See User Guide for details.)
160	* @return the requested UNormalizer2, if successful
161	* @stable ICU 4.4
162	*/
163	U_STABLE UNormalizer2 * U_EXPORT2
164	unorm2_openFiltered(const UNormalizer2 norm2, const USet filterSet, UErrorCode *pErrorCode);
165
	166	/**
	167	* Closes a UNormalizer2 instance from unorm2_openFiltered().
	168	* Do not close instances from unorm2_getInstance()!
	169	* @param norm2 UNormalizer2 instance to be closed; must have been returned by unorm2_openFiltered()
	170	* @stable ICU 4.4
	171	*/
	172	U_STABLE void U_EXPORT2
	173	unorm2_close(UNormalizer2 *norm2);
174
175	#if U_SHOW_CPLUSPLUS_API
176
177	U_NAMESPACE_BEGIN
178
	179	/**
	180	* \class LocalUNormalizer2Pointer
	181	* "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
	182	* For most methods see the LocalPointerBase base class.
	183	* Declared only when U_SHOW_CPLUSPLUS_API is enabled.
	184	* @see LocalPointerBase
	185	* @see LocalPointer
	186	* @stable ICU 4.4
	187	*/
	188	U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
189
190	U_NAMESPACE_END
191
192	#endif
193
	194	/**
	195	* Writes the normalized form of the source string to the destination string
	196	* (replacing its contents) and returns the length of the destination string.
	197	* The source and destination strings must be different buffers.
	198	* @param norm2 UNormalizer2 instance
	199	* @param src source string
	200	* @param length length of the source string, or -1 if NUL-terminated
	201	* @param dest destination string; its contents are replaced with normalized src
	202	* @param capacity number of UChars that can be written to dest
	203	* @param pErrorCode Standard ICU error code. Its input value must
	204	* pass the U_SUCCESS() test, or else the function returns
	205	* immediately. Check for U_FAILURE() on output or use with
	206	* function chaining. (See User Guide for details.)
	207	* @return the length of the destination string
	208	* @stable ICU 4.4
	209	*/
	210	U_STABLE int32_t U_EXPORT2
	211	unorm2_normalize(const UNormalizer2 *norm2,
	212	const UChar *src, int32_t length,
	213	UChar *dest, int32_t capacity,
	214	UErrorCode *pErrorCode);
	215	/**
	216	* Appends the normalized form of the second string to the first string
	217	* (merging them at the boundary) and returns the length of the first string.
	218	* The result is normalized if the first string was normalized.
	219	* The first and second strings must be different buffers.
	220	* @param norm2 UNormalizer2 instance
	221	* @param first string, should be normalized
	222	* @param firstLength length of the first string, or -1 if NUL-terminated
	223	* @param firstCapacity number of UChars that can be written to first
	224	* @param second string, will be normalized
	225	* @param secondLength length of the second string, or -1 if NUL-terminated
	226	* @param pErrorCode Standard ICU error code. Its input value must
	227	* pass the U_SUCCESS() test, or else the function returns
	228	* immediately. Check for U_FAILURE() on output or use with
	229	* function chaining. (See User Guide for details.)
	230	* @return the length of the first string
	231	* @stable ICU 4.4
	232	*/
	233	U_STABLE int32_t U_EXPORT2
	234	unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
	235	UChar *first, int32_t firstLength, int32_t firstCapacity,
	236	const UChar *second, int32_t secondLength,
	237	UErrorCode *pErrorCode);
	238	/**
	239	* Appends the second string to the first string
	240	* (merging them at the boundary) and returns the length of the first string.
	241	* The result is normalized if both the strings were normalized.
	242	* The first and second strings must be different buffers.
	243	* @param norm2 UNormalizer2 instance
	244	* @param first string, should be normalized
	245	* @param firstLength length of the first string, or -1 if NUL-terminated
	246	* @param firstCapacity number of UChars that can be written to first
	247	* @param second string, should be normalized
	248	* @param secondLength length of the second string, or -1 if NUL-terminated
	249	* @param pErrorCode Standard ICU error code. Its input value must
	250	* pass the U_SUCCESS() test, or else the function returns
	251	* immediately. Check for U_FAILURE() on output or use with
	252	* function chaining. (See User Guide for details.)
	253	* @return the length of the first string
	254	* @stable ICU 4.4
	255	*/
	256	U_STABLE int32_t U_EXPORT2
	257	unorm2_append(const UNormalizer2 *norm2,
	258	UChar *first, int32_t firstLength, int32_t firstCapacity,
	259	const UChar *second, int32_t secondLength,
	260	UErrorCode *pErrorCode);
261
	262	/**
	263	* Gets the decomposition mapping of c. Equivalent to unorm2_normalize(string(c))
	264	* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster.
	265	* This function is independent of the mode of the UNormalizer2.
	266	* @param norm2 UNormalizer2 instance
	267	* @param c code point whose decomposition mapping is requested
	268	* @param decomposition String buffer which will be set to c's
	269	* decomposition mapping, if there is one.
	270	* @param capacity number of UChars that can be written to decomposition
	271	* @param pErrorCode Standard ICU error code. Its input value must
	272	* pass the U_SUCCESS() test, or else the function returns
	273	* immediately. Check for U_FAILURE() on output or use with
	274	* function chaining. (See User Guide for details.)
	275	* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
	276	* @draft ICU 4.6
	277	*/
	278	U_DRAFT int32_t U_EXPORT2
	279	unorm2_getDecomposition(const UNormalizer2 *norm2,
	280	UChar32 c, UChar *decomposition, int32_t capacity,
	281	UErrorCode *pErrorCode);
282
	283	/**
	284	* Tests if the string is normalized.
	285	* Internally, in cases where unorm2_quickCheck() would return UNORM_MAYBE
	286	* (which is only possible for the two COMPOSE modes) this function
	287	* resolves to "yes" or "no" to provide a definitive result,
	288	* at the cost of doing more work in those cases.
	289	* @param norm2 UNormalizer2 instance
	290	* @param s input string
	291	* @param length length of the string, or -1 if NUL-terminated
	292	* @param pErrorCode Standard ICU error code. Its input value must
	293	* pass the U_SUCCESS() test, or else the function returns
	294	* immediately. Check for U_FAILURE() on output or use with
	295	* function chaining. (See User Guide for details.)
	296	* @return TRUE if s is normalized
	297	* @stable ICU 4.4
	298	*/
	299	U_STABLE UBool U_EXPORT2
	300	unorm2_isNormalized(const UNormalizer2 *norm2,
	301	const UChar *s, int32_t length,
	302	UErrorCode *pErrorCode);
303
	304	/**
	305	* Tests if the string is normalized, without always resolving to a definitive answer.
	306	* For the two COMPOSE modes, the result could be "maybe" in cases that
	307	* would take a little more work to resolve definitively.
	308	* Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
	309	* combination of quick check + normalization, to avoid
	310	* re-checking the "yes" prefix.
	311	* @param norm2 UNormalizer2 instance
	312	* @param s input string
	313	* @param length length of the string, or -1 if NUL-terminated
	314	* @param pErrorCode Standard ICU error code. Its input value must
	315	* pass the U_SUCCESS() test, or else the function returns
	316	* immediately. Check for U_FAILURE() on output or use with
	317	* function chaining. (See User Guide for details.)
	318	* @return a UNormalizationCheckResult: UNORM_YES, UNORM_NO, or UNORM_MAYBE
	319	* @stable ICU 4.4
	320	*/
	321	U_STABLE UNormalizationCheckResult U_EXPORT2
	322	unorm2_quickCheck(const UNormalizer2 *norm2,
	323	const UChar *s, int32_t length,
	324	UErrorCode *pErrorCode);
325
	326	/**
	327	* Returns the end of the normalized substring of the input string.
	328	* In other words, with <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>
	329	* the prefix of <code>s</code> from index 0 up to (not including) <code>end</code>
	330	* will pass the quick check with a "yes" result.
	331	*
	332	* The returned end index is usually one or more characters before the
	333	* "no" or "maybe" character: The end index is at a normalization boundary.
	334	* (See the class documentation for more about normalization boundaries.)
	335	*
	336	* When the goal is a normalized string and most input strings are expected
	337	* to be normalized already, then call this function,
	338	* and if it returns a prefix shorter than the input string,
	339	* copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
	340	* @param norm2 UNormalizer2 instance
	341	* @param s input string
	342	* @param length length of the string, or -1 if NUL-terminated
	343	* @param pErrorCode Standard ICU error code. Its input value must
	344	* pass the U_SUCCESS() test, or else the function returns
	345	* immediately. Check for U_FAILURE() on output or use with
	346	* function chaining. (See User Guide for details.)
	347	* @return "yes" span end index
	348	* @stable ICU 4.4
	349	*/
	350	U_STABLE int32_t U_EXPORT2
	351	unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
	352	const UChar *s, int32_t length,
	353	UErrorCode *pErrorCode);
354
	355	/**
	356	* Tests if the character always has a normalization boundary before it,
	357	* regardless of context.
	358	* For details see the Normalizer2 base class documentation.
	359	* @param norm2 UNormalizer2 instance
	360	* @param c code point to test
	361	* @return TRUE if c always has a normalization boundary before it
	362	* @stable ICU 4.4
	363	*/
	364	U_STABLE UBool U_EXPORT2
	365	unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
366
	367	/**
	368	* Tests if the character always has a normalization boundary after it,
	369	* regardless of context.
	370	* For details see the Normalizer2 base class documentation.
	371	* @param norm2 UNormalizer2 instance
	372	* @param c code point to test
	373	* @return TRUE if c always has a normalization boundary after it
	374	* @stable ICU 4.4
	375	*/
	376	U_STABLE UBool U_EXPORT2
	377	unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
378
	379	/**
	380	* Tests if the character is normalization-inert.
	381	* For details see the Normalizer2 base class documentation.
	382	* @param norm2 UNormalizer2 instance
	383	* @param c code point to test
	384	* @return TRUE if c is normalization-inert
	385	* @stable ICU 4.4
	386	*/
	387	U_STABLE UBool U_EXPORT2
	388	unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
389
390	#endif /* !UCONFIG_NO_NORMALIZATION */
391	#endif /* __UNORM2_H__ */