]> git.saurik.com Git - apple/hfs.git/blob - livefiles_hfs_plugin/lf_hfs_sbunicode.h
hfs-522.0.9.tar.gz
[apple/hfs.git] / livefiles_hfs_plugin / lf_hfs_sbunicode.h
1 /* Copyright © 2017-2018 Apple Inc. All rights reserved.
2 *
3 * lf_hfs_sbunicode.h
4 * livefiles_hfs
5 *
6 * Created by Oded Shoshani on 31/1/18.
7 */
8
9 #ifndef lf_hfs_sbunicode_h
10 #define lf_hfs_sbunicode_h
11
12 /*
13 Includes Unicode 3.2 decomposition code derived from Core Foundation
14 */
15
16 /*
17 * UTF-8 (Unicode Transformation Format)
18 *
19 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
20 * character as a sequence of one to four bytes. Only the shortest form
21 * required to represent the significant Unicode bits is legal.
22 *
23 * UTF-8 Multibyte Codes
24 *
25 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
26 * -----------------------------------------------------------------------------
27 * 1 7 0x0000 0x007F 0xxxxxxx
28 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
29 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
30 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
31 * -----------------------------------------------------------------------------
32 */
33
34 /*
35 * UTF-8 encode/decode flags: independent bit values, OR-ed together and passed in the 'flags' argument of the utf8_* routines declared in this header
36 */
37 #define UTF_REVERSE_ENDIAN 0x0001 /* reverse UCS-2 byte order */
38 #define UTF_ADD_NULL_TERM 0x0002 /* add null termination */
39 #define UTF_DECOMPOSED 0x0004 /* generate fully decomposed UCS-2 */
40 #define UTF_PRECOMPOSED 0x0008 /* generate precomposed UCS-2 */
41 #define UTF_ESCAPE_ILLEGAL 0x0010 /* escape illegal UTF-8 */
42 #define UTF_SFM_CONVERSIONS 0x0020 /* Use SFM mappings for illegal NTFS chars */
43
44 #define UTF_BIG_ENDIAN \
45 ((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) /* 0 when BYTE_ORDER is BIG_ENDIAN, otherwise UTF_REVERSE_ENDIAN */
46 #define UTF_LITTLE_ENDIAN \
47 ((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) /* 0 when BYTE_ORDER is LITTLE_ENDIAN, otherwise UTF_REVERSE_ENDIAN */
48
49
50
51 /*
52 * utf8_encodelen - Calculate the UTF-8 encoding length
53 *
54 * This function takes a Unicode input string, ucsp, of ucslen bytes
55 * and calculates the size of the UTF-8 output in bytes (not including
56 * a NULL termination byte). The string must reside in kernel memory.
57 *
58 * If '/' chars are possible in the Unicode input then an alternate
59 * (replacement) char should be provided in altslash.
60 *
61 * FLAGS
62 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
63 *
64 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
65 *
66 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
67 *
68 * UTF_DECOMPOSED: generate fully decomposed output
69 *
70 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
71 *
72 * ERRORS
73 * None; the calculated length is delivered as the size_t return value
74 */
75 size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags);
76
77 /*
78 * utf8_encodestr - Encodes a Unicode string to UTF-8
79 *
80 * NOTES:
81 * The resulting UTF-8 string is NULL terminated. NOTE(review): presumably only when UTF_ADD_NULL_TERM is set - confirm against the implementation.
82 *
83 * If '/' chars are allowed on disk then an alternate
84 * (replacement) char must be provided in altslash.
85 *
86 * input flags:
87 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
88 *
89 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
90 *
91 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
92 *
93 * UTF_DECOMPOSED: generate fully decomposed output
94 *
95 * UTF_ADD_NULL_TERM: add NULL termination to UTF-8 output
96 *
97 * result:
98 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
99 *
100 * EINVAL: Illegal char found; char was replaced by an '_'.
101 */
102 extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
103
104 /*
105 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
106 *
107 * NOTES:
108 * The input UTF-8 string does not need to be null terminated
109 * if utf8len is set.
110 *
111 * If '/' chars are allowed on disk then an alternate
112 * (replacement) char must be provided in altslash.
113 *
114 * input flags:
115 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
116 *
117 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
118 *
119 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
120 *
121 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
122 *
123 * UTF_PRECOMPOSED: generate precomposed output (NFC)
124 *
125 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
126 *
127 * result:
128 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
129 *
130 * EINVAL: Illegal UTF-8 sequence found.
131 */
132 int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
133
134 #endif /* lf_hfs_sbunicode_h */
135