[apple/xnu.git] / bsd / sys / utfconv.h

/*
 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifndef _SYS_UTFCONV_H_
#define	_SYS_UTFCONV_H_

#include <sys/appleapiopts.h>
#include <sys/cdefs.h> 

#ifdef KERNEL
#ifdef __APPLE_API_UNSTABLE

/*
 * Flag bits accepted by the UTF-8 encode/decode routines declared below.
 */
#define UTF_REVERSE_ENDIAN   0x0001  /* UCS-2 byte order is reversed */
#define UTF_NO_NULL_TERM     0x0002  /* leave off the null termination */
#define UTF_DECOMPOSED       0x0004  /* produce fully decomposed UCS-2 */
#define UTF_PRECOMPOSED      0x0008  /* produce precomposed UCS-2 */
#define UTF_ESCAPE_ILLEGAL   0x0010  /* escape illegal UTF-8 */
#define UTF_SFM_CONVERSIONS  0x0020  /* apply SFM mappings to illegal NTFS chars */

/*
 * Endian selectors: each one expands to UTF_REVERSE_ENDIAN when the
 * named byte order differs from BYTE_ORDER, and to 0 when it matches.
 */
#define UTF_BIG_ENDIAN \
	((BYTE_ORDER != BIG_ENDIAN) ? UTF_REVERSE_ENDIAN : 0)

#define UTF_LITTLE_ENDIAN \
	((BYTE_ORDER != LITTLE_ENDIAN) ? UTF_REVERSE_ENDIAN : 0)

__BEGIN_DECLS


/*
 * unicode_combinable - Check for a combining Unicode character.
 *
 * Comparable to __CFUniCharIsNonBaseCharacter, with the difference
 * that Hangul Jamo characters are included as well.
 */
int
unicode_combinable(u_int16_t character);

/*
 * unicode_decomposeable - Check for a precomposed character.
 *
 * Comparable to __CFUniCharIsDecomposableCharacter.
 */
int
unicode_decomposeable(u_int16_t character);


/*
 * utf8_encodelen - Compute the length of a UTF-8 encoding
 *
 * Given a Unicode input string, ucsp, that is ucslen bytes long, this
 * function works out how many bytes the UTF-8 output occupies. The
 * result does not count a NULL termination byte. The input string
 * must reside in kernel memory.
 *
 * NOTE(review): altslash is not described for this function;
 * presumably it is the '/' replacement char documented for
 * utf8_encodestr -- confirm against the implementation.
 *
 * FLAGS
 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  assume fully decomposed output
 *
 * ERRORS
 *    None
 */
size_t
utf8_encodelen(const u_int16_t *ucsp, size_t ucslen, u_int16_t altslash,
    int flags);


/*
 * utf8_encodestr - Encode a Unicode string as UTF-8
 *
 * Given a Unicode input string, ucsp, that is ucslen bytes long, this
 * function writes the UTF-8 output into the buffer utf8p, which holds
 * buflen bytes. The number of output bytes (not counting a NULL
 * termination byte) is returned in utf8len. The UTF-8 output is NULL
 * terminated unless UTF_NO_NULL_TERM is given. Both buffers must
 * reside in kernel memory.
 *
 * When the Unicode input may contain '/' chars, an alternate
 * (replacement) char must be supplied in altslash.
 *
 * FLAGS
 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_NO_NULL_TERM:  do not add null termination to output string
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output
 *
 * ERRORS
 *    ENAMETOOLONG:  output did not fit; only utf8len bytes were encoded
 *
 *    EINVAL:  illegal Unicode char encountered
 */
int
utf8_encodestr(const u_int16_t *ucsp, size_t ucslen, u_int8_t *utf8p,
    size_t *utf8len, size_t buflen, u_int16_t altslash, int flags);


/*
 * utf8_decodestr - Decode a UTF-8 string into Unicode
 *
 * Given a UTF-8 input string, utf8p, that is utf8len bytes long, this
 * function writes the Unicode output into the buffer ucsp, which holds
 * buflen bytes. The number of output bytes (not counting a NULL
 * termination byte) is returned in ucslen. Both buffers must reside
 * in kernel memory.
 *
 * When '/' chars are allowed in the Unicode output, an alternate
 * (replacement) char must be supplied in altslash.
 *
 * FLAGS
 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 *
 *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 *
 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 *
 * ERRORS
 *    ENAMETOOLONG:  output did not fit; only ucslen bytes were decoded.
 *
 *    EINVAL:  illegal UTF-8 sequence encountered.
 */
int
utf8_decodestr(const u_int8_t *utf8p, size_t utf8len, u_int16_t *ucsp,
    size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);


/*
 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
 *
 * Given a UTF-8 input string, instr, that is inlen bytes long, this
 * function writes normalized UTF-8 output into the buffer outstr,
 * which holds buflen bytes. The number of output bytes (not counting
 * a NULL termination byte) is returned in outlen. In-place conversion
 * is not supported, so instr and outstr must not be the same buffer.
 * Both buffers must reside in kernel memory.
 *
 * FLAGS
 *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
 *
 *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
 *
 *    UTF_NO_NULL_TERM:  do not add null termination to output string
 *
 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 *
 * ERRORS
 *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
 *
 *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
 */
int
utf8_normalizestr(const u_int8_t *instr, size_t inlen, u_int8_t *outstr,
    size_t *outlen, size_t buflen, int flags);


/*
 * utf8_validatestr - Validate a UTF-8 string
 *
 * Given a UTF-8 input string, utf8p, that is utf8len bytes long, this
 * function determines whether it is valid UTF-8. The string must
 * reside in kernel memory.
 *
 * ERRORS
 *    EINVAL:  illegal UTF-8 sequence encountered.
 */
int
utf8_validatestr(const u_int8_t *utf8p, size_t utf8len);


__END_DECLS

#endif /* __APPLE_API_UNSTABLE */
#endif /* KERNEL */

#endif /* !_SYS_UTFCONV_H_ */
Commit	Line	Data
1c79356b	1	/*
2d21ac55	2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
5d5c5d0d	3	*
2d21ac55	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b	5	*
2d21ac55 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
8f6c56a5	14	*
2d21ac55 A	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 A	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
8f6c56a5	25	*
2d21ac55	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b A	27	*/
	28
	29	#ifndef _SYS_UTFCONV_H_
	30	#define _SYS_UTFCONV_H_
	31
9bccf70c	32	#include <sys/appleapiopts.h>
55e303ae	33	#include <sys/cdefs.h>
9bccf70c	34
1c79356b	35	#ifdef KERNEL
9bccf70c	36	#ifdef __APPLE_API_UNSTABLE
2d21ac55	37
1c79356b A	38	/*
	39	* UTF-8 encode/decode flags
	40	*/
2d21ac55 A	41	#define UTF_REVERSE_ENDIAN 0x0001 /* reverse UCS-2 byte order */
	42	#define UTF_NO_NULL_TERM 0x0002 /* do not add null termination */
	43	#define UTF_DECOMPOSED 0x0004 /* generate fully decomposed UCS-2 */
	44	#define UTF_PRECOMPOSED 0x0008 /* generate precomposed UCS-2 */
	45	#define UTF_ESCAPE_ILLEGAL 0x0010 /* escape illegal UTF-8 */
	46	#define UTF_SFM_CONVERSIONS 0x0020 /* Use SFM mappings for illegal NTFS chars */
	47
	48	#define UTF_BIG_ENDIAN \
	49	((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
	50
	51	#define UTF_LITTLE_ENDIAN \
	52	((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
1c79356b A	53
1c79356b A	54	__BEGIN_DECLS
1c79356b	55
b0d623f7 A	56
	57	/*
	58	* unicode_combinable - Test for a combining unicode character.
	59	*
	60	* This function is similar to __CFUniCharIsNonBaseCharacter except
	61	* that it also includes Hangul Jamo characters.
	62	*/
	63
	64	int unicode_combinable(u_int16_t character);
	65
	66	/*
	67	* Test for a precomposed character.
	68	*
	69	* Similar to __CFUniCharIsDecomposableCharacter.
	70	*/
	71
	72	int unicode_decomposeable(u_int16_t character);
	73
	74
2d21ac55 A	75	/*
	76	* utf8_encodelen - Calculate the UTF-8 encoding length
	77	*
	78	* This function takes an Unicode input string, ucsp, of ucslen bytes
	79	* and calculates the size of the UTF-8 output in bytes (not including
	80	* a NULL termination byte). The string must reside in kernel memory.
	81	*
	82	* FLAGS
	83	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	84	*
	85	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	86	*
	87	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	88	*
	89	* UTF_DECOMPOSED: assume fully decomposed output
	90	*
	91	* ERRORS
	92	* None
	93	*/
	94	size_t
	95	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
	96	int flags);
	97
	98
	99	/*
	100	* utf8_encodestr - Encodes a Unicode string into UTF-8
	101	*
	102	* This function takes an Unicode input string, ucsp, of ucslen bytes
	103	* and produces the UTF-8 output into a buffer of buflen bytes pointed
	104	* to by utf8p. The size of the output in bytes (not including a NULL
	105	* termination byte) is returned in utf8len. The UTF-8 string output
	106	* is NULL terminated. Both buffers must reside in kernel memory.
	107	*
	108	* If '/' chars are possible in the Unicode input then an alternate
	109	* (replacement) char must be provided in altslash.
	110	*
	111	* FLAGS
	112	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	113	*
	114	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	115	*
	116	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	117	*
	118	* UTF_NO_NULL_TERM: do not add null termination to output string
	119	*
	120	* UTF_DECOMPOSED: generate fully decomposed output
	121	*
	122	* ERRORS
	123	* ENAMETOOLONG: output did not fit; only utf8len bytes were encoded
	124	*
	125	* EINVAL: illegal Unicode char encountered
	126	*/
	127	int
	128	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
	129	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
91447636	130
91447636	131
2d21ac55 A	132	/*
	133	* utf8_decodestr - Decodes a UTF-8 string into Unicode
	134	*
	135	* This function takes an UTF-8 input string, utf8p, of utf8len bytes
	136	* and produces the Unicode output into a buffer of buflen bytes pointed
	137	* to by ucsp. The size of the output in bytes (not including a NULL
	138	* termination byte) is returned in ucslen. Both buffers must reside
	139	* in kernel memory.
	140	*
	141	* If '/' chars are allowed in the Unicode output then an alternate
	142	* (replacement) char must be provided in altslash.
	143	*
	144	* FLAGS
	145	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
	146	*
	147	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	148	*
	149	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	150	*
	151	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
	152	*
	153	* UTF_PRECOMPOSED: generate precomposed output (NFC)
	154	*
	155	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	156	*
	157	* ERRORS
	158	* ENAMETOOLONG: output did not fit; only ucslen bytes were decoded.
	159	*
	160	* EINVAL: illegal UTF-8 sequence encountered.
	161	*/
	162	int
	163	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
	164	size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
	165
	166
	167	/*
	168	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
	169	*
	170	* This function takes an UTF-8 input string, instr, of inlen bytes
	171	* and produces normalized UTF-8 output into a buffer of buflen bytes
	172	* pointed to by outstr. The size of the output in bytes (not including
	173	* a NULL termination byte) is returned in outlen. In-place conversions
	174	* are not supported (i.e. instr != outstr). Both buffers must reside
	175	* in kernel memory.
	176	*
	177	* FLAGS
	178	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
	179	*
	180	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
	181	*
	182	* UTF_NO_NULL_TERM: do not add null termination to output string
	183	*
	184	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	185	*
	186	* ERRORS
	187	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
	188	*
	189	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
	190	*/
	191	int
	192	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
	193	size_t *outlen, size_t buflen, int flags);
	194
	195
196	/*
197	* utf8_validatestr - validates a UTF-8 string
198	*
199	* This function takes an UTF-8 input string, utf8p, of utf8len bytes
200	* and determines if its valid UTF-8. The string must reside in kernel
201	* memory.
202	*
203	* ERRORS
204	* EINVAL: illegal UTF-8 sequence encountered.
205	*/
206	int
207	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len);
208
1c79356b	209
1c79356b A	210	__END_DECLS
1c79356b A	211
9bccf70c	212	#endif /* __APPLE_API_UNSTABLE */
1c79356b A	213	#endif /* KERNEL */
	214
	215	#endif /* !_SYS_UTFCONV_H_ */