[apple/hfs.git] / livefiles_hfs_plugin / lf_hfs_sbunicode.h

/* Copyright © 2017-2018 Apple Inc. All rights reserved.
 *
 *  lf_hfs_sbunicode.h
 *  livefiles_hfs
 *
 *  Created by Oded Shoshani on 31/1/18.
 */

#ifndef lf_hfs_sbunicode_h
#define lf_hfs_sbunicode_h

/*
 Includes Unicode 3.2 decomposition code derived from Core Foundation
 */

/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1       7       0x0000        0x007F    0xxxxxxx
 *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
 *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
 *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */

/*
 * UTF-8 encode/decode flags
 */
#define UTF_REVERSE_ENDIAN      0x0001   /* reverse UCS-2 byte order */
#define UTF_ADD_NULL_TERM       0x0002   /* add null termination */
#define UTF_DECOMPOSED          0x0004   /* generate fully decomposed UCS-2 */
#define UTF_PRECOMPOSED         0x0008   /* generate precomposed UCS-2 */
#define UTF_ESCAPE_ILLEGAL      0x0010   /* escape illegal UTF-8 */
#define UTF_SFM_CONVERSIONS     0x0020   /* Use SFM mappings for illegal NTFS chars */

#define UTF_BIG_ENDIAN       \
((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
#define UTF_LITTLE_ENDIAN    \
((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)


/*
 * utf8_encodelen - Calculate the UTF-8 encoding length
 *
 * This function takes a Unicode input string, ucsp, of ucslen bytes
 * and calculates the size of the UTF-8 output in bytes (not including
 * a NULL termination byte). The string must reside in kernel memory.
 *
 * If '/' chars are possible in the Unicode input then an alternate
 * (replacement) char should be provided in altslash.
 *
 * FLAGS
 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output
 *
 *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 *
 * ERRORS
 *    None
 */
size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags);

/*
 * utf8_encodestr - Encodes a Unicode string to UTF-8
 *
 * NOTES:
 *    The resulting UTF-8 string is NULL terminated.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 * input flags:
 *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output
 *
 *    UTF_ADD_NULL_TERM:  add NULL termination to UTF-8 output
 *
 * result:
 *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 *
 *    EINVAL: Illegal char found; char was replaced by an '_'.
 */
extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);

/*
 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 *
 * NOTES:
 *    The input UTF-8 string does not need to be null terminated
 *    if utf8len is set.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 * input flags:
 *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 *
 *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 *
 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 *
 * result:
 *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 *
 *    EINVAL: Illegal UTF-8 sequence found.
 */
int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);

#endif /* lf_hfs_sbunicode_h */
Commit	Line	Data
de8ee011 A	1	/* Copyright © 2017-2018 Apple Inc. All rights reserved.
	2	*
	3	* lf_hfs_sbunicode.h
	4	* livefiles_hfs
	5	*
	6	* Created by Oded Shoshani on 31/1/18.
	7	*/
	8
	9	#ifndef lf_hfs_sbunicode_h
	10	#define lf_hfs_sbunicode_h
	11
	12	/*
	13	Includes Unicode 3.2 decomposition code derived from Core Foundation
	14	*/
	15
	16	/*
	17	* UTF-8 (Unicode Transformation Format)
	18	*
	19	* UTF-8 is the Unicode Transformation Format that serializes a Unicode
	20	* character as a sequence of one to four bytes. Only the shortest form
	21	* required to represent the significant Unicode bits is legal.
	22	*
	23	* UTF-8 Multibyte Codes
	24	*
	25	* Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
	26	* -----------------------------------------------------------------------------
	27	* 1 7 0x0000 0x007F 0xxxxxxx
	28	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
	29	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
	30	* 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	31	* -----------------------------------------------------------------------------
	32	*/
	33
	34	/*
	35	* UTF-8 encode/decode flags
	36	*/
	37	#define UTF_REVERSE_ENDIAN 0x0001 /* reverse UCS-2 byte order */
	38	#define UTF_ADD_NULL_TERM 0x0002 /* add null termination */
	39	#define UTF_DECOMPOSED 0x0004 /* generate fully decomposed UCS-2 */
	40	#define UTF_PRECOMPOSED 0x0008 /* generate precomposed UCS-2 */
	41	#define UTF_ESCAPE_ILLEGAL 0x0010 /* escape illegal UTF-8 */
	42	#define UTF_SFM_CONVERSIONS 0x0020 /* Use SFM mappings for illegal NTFS chars */
	43
	44	#define UTF_BIG_ENDIAN \
	45	((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
	46	#define UTF_LITTLE_ENDIAN \
	47	((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
	48
	49
	50
	51	/*
	52	* utf8_encodelen - Calculate the UTF-8 encoding length
	53	*
	54	* This function takes a Unicode input string, ucsp, of ucslen bytes
	55	* and calculates the size of the UTF-8 output in bytes (not including
	56	* a NULL termination byte). The string must reside in kernel memory.
	57	*
	58	* If '/' chars are possible in the Unicode input then an alternate
	59	* (replacement) char should be provided in altslash.
	60	*
	61	* FLAGS
	62	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	63	*
	64	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
65	*
66	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
67	*
68	* UTF_DECOMPOSED: generate fully decomposed output
69	*
70	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
71	*
72	* ERRORS
73	* None
74	*/
75	size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags);
76
77	/*
78	* utf8_encodestr - Encodes a Unicode string to UTF-8
79	*
80	* NOTES:
81	* The resulting UTF-8 string is NULL terminated.
82	*
83	* If '/' chars are allowed on disk then an alternate
84	* (replacement) char must be provided in altslash.
85	*
86	* input flags:
87	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
88	*
89	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
90	*
91	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
92	*
93	* UTF_DECOMPOSED: generate fully decomposed output
94	*
95	* UTF_ADD_NULL_TERM: add NULL termination to UTF-8 output
96	*
97	* result:
98	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
99	*
100	* EINVAL: Illegal char found; char was replaced by an '_'.
101	*/
102	extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
103
104	/*
105	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
106	*
107	* NOTES:
108	* The input UTF-8 string does not need to be null terminated
109	* if utf8len is set.
110	*
111	* If '/' chars are allowed on disk then an alternate
112	* (replacement) char must be provided in altslash.
113	*
114	* input flags:
115	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
116	*
117	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
118	*
119	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
120	*
121	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
122	*
123	* UTF_PRECOMPOSED: generate precomposed output (NFC)
124	*
125	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
126	*
127	* result:
128	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
129	*
130	* EINVAL: Illegal UTF-8 sequence found.
131	*/
132	int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
133
134	#endif /* lf_hfs_sbunicode_h */
135