]>
Commit | Line | Data |
---|---|---|
de8ee011 A |
1 | /* Copyright © 2017-2018 Apple Inc. All rights reserved. |
2 | * | |
3 | * lf_hfs_sbunicode.h | |
4 | * livefiles_hfs | |
5 | * | |
6 | * Created by Oded Shoshani on 31/1/18. | |
7 | */ | |
8 | ||
9 | #ifndef lf_hfs_sbunicode_h | |
10 | #define lf_hfs_sbunicode_h | |
11 | ||
12 | /* | |
13 | Includes Unicode 3.2 decomposition code derived from Core Foundation | |
14 | */ | |
15 | ||
16 | /* | |
17 | * UTF-8 (Unicode Transformation Format) | |
18 | * | |
19 | * UTF-8 is the Unicode Transformation Format that serializes a Unicode | |
20 | * character as a sequence of one to four bytes. Only the shortest form | |
21 | * required to represent the significant Unicode bits is legal. | |
22 | * | |
23 | * UTF-8 Multibyte Codes | |
24 | * | |
25 | * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary) | |
26 | * ----------------------------------------------------------------------------- | |
27 | * 1 7 0x0000 0x007F 0xxxxxxx | |
28 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx | |
29 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx | |
30 | * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
31 | * ----------------------------------------------------------------------------- | |
32 | */ | |
33 | ||
34 | /* | |
35 | * UTF-8 encode/decode flags: single-bit values for the `flags` argument of the functions declared below | |
36 | */ | |
37 | #define UTF_REVERSE_ENDIAN 0x0001 /* Unicode (UCS-2) byte order is opposite current runtime */ | |
38 | #define UTF_ADD_NULL_TERM 0x0002 /* add NULL termination to UTF-8 output */ | |
39 | #define UTF_DECOMPOSED 0x0004 /* generate fully decomposed UCS-2 (NFD) */ | |
40 | #define UTF_PRECOMPOSED 0x0008 /* generate precomposed UCS-2 (NFC) */ | |
41 | #define UTF_ESCAPE_ILLEGAL 0x0010 /* percent escape illegal UTF-8 input */ | |
42 | #define UTF_SFM_CONVERSIONS 0x0020 /* use SFM mappings for illegal NTFS chars */ | |
43 | ||
44 | #define UTF_BIG_ENDIAN \ | |
45 | ((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) /* 0 (no swap) when BYTE_ORDER is already big endian, else UTF_REVERSE_ENDIAN */ | |
46 | #define UTF_LITTLE_ENDIAN \ | |
47 | ((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN) /* 0 (no swap) when BYTE_ORDER is already little endian, else UTF_REVERSE_ENDIAN */ | |
48 | ||
49 | ||
50 | ||
51 | /* | |
52 | * utf8_encodelen - Calculate the UTF-8 encoding length | |
53 | * | |
54 | * This function takes a Unicode input string, ucsp, of ucslen bytes (a byte count, not a character count) | |
55 | * and calculates the size of the UTF-8 output in bytes (not including | |
56 | * a NULL termination byte). The string must reside in kernel memory. | |
57 | * | |
58 | * If '/' chars are possible in the Unicode input then an alternate | |
59 | * (replacement) char should be provided in altslash. | |
60 | * | |
61 | * FLAGS | |
62 | * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime | |
63 | * | |
64 | * UTF_BIG_ENDIAN: Unicode byte order is always big endian | |
65 | * | |
66 | * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian | |
67 | * | |
68 | * UTF_DECOMPOSED: generate fully decomposed output | |
69 | * | |
70 | * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it | |
71 | * | |
72 | * ERRORS | |
73 | * None; the return value is the computed UTF-8 length in bytes. | |
74 | */ | |
75 | size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags); | |
76 | ||
77 | /* | |
78 | * utf8_encodestr - Encodes a Unicode string to UTF-8 | |
79 | * | |
80 | * NOTES: | |
81 | * The resulting UTF-8 string is NULL terminated (NOTE(review): presumably only when UTF_ADD_NULL_TERM is set; confirm against the implementation). | |
82 | * | |
83 | * If '/' chars are allowed on disk then an alternate | |
84 | * (replacement) char must be provided in altslash. | |
85 | * | |
86 | * input flags: | |
87 | * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime | |
88 | * | |
89 | * UTF_BIG_ENDIAN: Unicode byte order is always big endian | |
90 | * | |
91 | * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian | |
92 | * | |
93 | * UTF_DECOMPOSED: generate fully decomposed output | |
94 | * | |
95 | * UTF_ADD_NULL_TERM: add NULL termination to UTF-8 output | |
96 | * | |
97 | * result: | |
98 | * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded | |
99 | * | |
100 | * EINVAL: Illegal char found; char was replaced by an '_'. | |
101 | */ | |
102 | extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags); | |
103 | ||
104 | /* | |
105 | * utf8_decodestr - Decodes a UTF-8 string back to Unicode | |
106 | * | |
107 | * NOTES: | |
108 | * The input UTF-8 string does not need to be null terminated | |
109 | * if utf8len is set. | |
110 | * | |
111 | * If '/' chars are allowed on disk then an alternate | |
112 | * (replacement) char must be provided in altslash. | |
113 | * | |
114 | * input flags: | |
115 | * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime | |
116 | * | |
117 | * UTF_BIG_ENDIAN: Unicode byte order is always big endian | |
118 | * | |
119 | * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian | |
120 | * | |
121 | * UTF_DECOMPOSED: generate fully decomposed output (NFD) | |
122 | * | |
123 | * UTF_PRECOMPOSED: generate precomposed output (NFC) | |
124 | * | |
125 | * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input | |
126 | * | |
127 | * result: | |
128 | * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded. | |
129 | * | |
130 | * EINVAL: Illegal UTF-8 sequence found. | |
131 | */ | |
132 | int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags); | |
133 | ||
134 | #endif /* lf_hfs_sbunicode_h */ | |
135 |