git.saurik.com Git - apple/xnu.git/blame

Commit	Line	Data
f427ee49 A	1	/*
2	* Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3	*
4	* @APPLE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. Please obtain a copy of the License at
10	* http://www.opensource.apple.com/apsl/ and read it before using this
11	* file.
12	*
13	* The Original Code and all software distributed under the License are
14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18	* Please see the License for the specific language governing rights and
19	* limitations under the License.
20	*
21	* @APPLE_LICENSE_HEADER_END@
22	*/
23
24	#ifndef unicode_h
25	#define unicode_h
26
27	#ifdef KERNEL_PRIVATE
28
29	#include <sys/cdefs.h>
30	#include <stdbool.h>
31
32	/*
33	* WARNING - callers that use the following Unicode normalization interface for on-disk
34	* structures should be aware that the implementation will be periodically updated for
35	* the latest Unicode standard version.
36	*/
37
38	enum {	/* anonymous enum used to define an integer constant */
39	/* Maximum size of UTF32 reordering buffer for stream-safe format (NOTE(review): presumably counted in UTF-32 units rather than bytes -- confirm against the implementation) */
40	kNCFStreamSafeBufMax = 32
41	};
42
43	/*
44	* utf8_normalizeOptCaseFoldAndHash
45	*
46	* Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
47	* as specified by the case_sens parameter, and feed the result incrementally to
48	* the provided hash function callback:
49	* - "canonical caseless form" (case-folded NFD, as described by definition D145
50	* in chapter 3 of The Unicode Standard); for case-insensitive behavior.
51	* - standard NFD; for case-sensitive behavior (if case_sens = true).
52	*
53	* The input string should be valid UTF-8 that meets the criteria for stream safe
54	* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
55	* It should not contain ASCII 0x00 or '/'.
56	*
57	* str: The input UTF-8 string (need not be 0 terminated)
58	* str_len: The byte length of the input string (excluding any 0 terminator)
59	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
60	* True for case-sensitive behavior; generates standard NFD.
61	* hash_func: A pointer to a hashing function to compute the hash of the
62	* normalized/case-folded result. buf contains buf_len bytes
63	* of data to be added to the hash using the caller-supplied
64	* context (ctx).
65	* hash_ctx: The context for the hash function.
66	*
67	* Returns: 0 on success, or
68	* EILSEQ: The input string contains illegal ASCII-range characters
69	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
70	* contains codepoints that are non-characters or unassigned in
71	* the version of Unicode currently supported.
72	*/
73	int utf8_normalizeOptCaseFoldAndHash(const char *str,	/* input UTF-8 string; need not be 0 terminated */
74	size_t str_len,	/* byte length of str, excluding any 0 terminator */
75	bool case_sens,	/* true: standard NFD; false: canonical caseless form */
76	void (*hash_func)(void *buf, size_t buf_len, void *ctx),	/* callback: given buf_len bytes at buf to add to the hash, plus the caller-supplied ctx */
77	void *hash_ctx);	/* context for hash_func */
78
79	/*
80	* utf8_normalizeOptCaseFoldAndCompare
81	*
82	* Determine whether two UTF-8 strings are equal after converting each to one of the
83	* following normalized forms, as specified by the case_sens parameter:
84	* - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
85	* - standard NFD; for case-sensitive comparison (if case_sens = true).
86	* On success, sets are_equal to true if the strings are equal, or false if they are not.
87	*
88	* The input strings should be valid UTF-8 that meet the criteria for stream safe
89	* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
90	* They should not contain ASCII 0x00 or '/'.
91	*
92	* strA: A UTF-8 string to be compared (need not be 0 terminated)
93	* strA_len: The byte length of strA (excluding any 0 terminator)
94	* strB: The second UTF-8 string to be compared (need not be 0 terminated)
95	* strB_len: The byte length of strB (excluding any 0 terminator)
96	* case_sens: False for case-insensitive behavior; compares canonical caseless forms.
97	* True for case-sensitive behavior; compares standard NFD forms.
98	* are_equal: On success, set to true if the strings are equal, or set to false
99	* if they are not.
100	*
101	* Returns: 0 on success, or
102	* EILSEQ: One or both of the input strings contains illegal ASCII-range
103	* characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
104	* or contains codepoints that are non-characters or unassigned in
105	* the version of Unicode currently supported.
106	* Note: The comparison may terminate early when a difference is
107	* detected, and may return 0 and set *are_equal=false even
108	* if one or both strings are invalid.
109	*/
110	int utf8_normalizeOptCaseFoldAndCompare(const char *strA,	/* first UTF-8 string; need not be 0 terminated */
111	size_t strA_len,	/* byte length of strA, excluding any 0 terminator */
112	const char *strB,	/* second UTF-8 string; need not be 0 terminated */
113	size_t strB_len,	/* byte length of strB, excluding any 0 terminator */
114	bool case_sens,	/* true: compare standard NFD forms; false: compare canonical caseless forms */
115	bool *are_equal);	/* out: comparison result, set on success */
116
117	/*
118	* utf8_normalizeOptCaseFold
119	*
120	* Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
121	* as specified by the case_sens parameter, and copy the result to the ustr
122	* buffer:
123	* - "canonical caseless form" (case-folded NFD, as described by definition D145
124	* in chapter 3 of The Unicode Standard); for case-insensitive behavior.
125	* - standard NFD; for case-sensitive behavior (if case_sens = true).
126	*
127	* The input string should be valid UTF-8 that meets the criteria for stream safe
128	* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
129	* It should not contain ASCII 0x00 or '/'.
130	*
131	* str: The input UTF-8 string (need not be 0 terminated)
132	* str_len: The byte length of the input string (excluding any 0 terminator)
133	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
134	* True for case-sensitive behavior; generates standard NFD.
135	* ustr: A pointer to a buffer for the resulting UTF-32 string.
136	* ustr_size: The capacity of ustr, in UTF-32 units.
137	* ustr_len: Pointer to a value that will be filled in with the actual length
138	* in UTF-32 units of the string copied to ustr.
139	*
140	* Returns: 0 on success, or
141	* EILSEQ: The input string contains illegal ASCII-range characters
142	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
143	* contains codepoints that are non-characters or unassigned in
144	* the version of Unicode currently supported.
145	* ENOMEM: ustr_size is insufficient for the resulting string. In this
146	* case the value returned in *ustr_len is invalid.
147	*/
148	int utf8_normalizeOptCaseFold(const char *str,	/* input UTF-8 string; need not be 0 terminated */
149	size_t str_len,	/* byte length of str, excluding any 0 terminator */
150	bool case_sens,	/* true: standard NFD; false: canonical caseless form */
151	int32_t *ustr,	/* out: buffer receiving the normalized UTF-32 string */
152	int32_t ustr_size,	/* capacity of ustr in UTF-32 units; NOTE(review): signed int32_t here, whereas utf8_normalizeOptCaseFoldToUTF8 uses size_t -- handling of negative values is not documented */
153	int32_t *ustr_len);	/* out: length written, in UTF-32 units; invalid when ENOMEM is returned */
154
155	/*
156	* utf8_normalizeOptCaseFoldToUTF8
157	*
158	* Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
159	* as specified by the case_sens parameter, and copy the result to the ustr
160	* buffer:
161	* - "canonical caseless form" (case-folded NFD, as described by definition D145
162	* in chapter 3 of The Unicode Standard); for case-insensitive behavior.
163	* - standard NFD; for case-sensitive behavior (if case_sens = true).
164	*
165	* The input string should be valid UTF-8 that meets the criteria for stream safe
166	* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
167	* It should not contain ASCII 0x00 or '/'.
168	*
169	* str: The input UTF-8 string (need not be 0 terminated)
170	* str_len: The byte length of the input string (excluding any 0 terminator)
171	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
172	* True for case-sensitive behavior; generates standard NFD.
173	* ustr: A pointer to a buffer for the resulting UTF-8 string.
174	* ustr_size: The capacity of ustr, in bytes.
175	* ustr_len: Pointer to a value that will be filled in with the actual length
176	* in bytes of the string copied to ustr.
177	*
178	* Returns: 0 on success, or
179	* EILSEQ: The input string contains illegal ASCII-range characters
180	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
181	* contains codepoints that are non-characters or unassigned in
182	* the version of Unicode currently supported.
183	* ENOMEM: ustr_size is insufficient for the resulting string. In this
184	* case the value returned in *ustr_len is invalid.
185	*/
186	int utf8_normalizeOptCaseFoldToUTF8(const char *str,	/* input UTF-8 string; need not be 0 terminated */
187	size_t str_len,	/* byte length of str, excluding any 0 terminator */
188	bool case_sens,	/* true: standard NFD; false: canonical caseless form */
189	char *ustr,	/* out: buffer receiving the normalized UTF-8 string; NOTE(review): whether the output is 0 terminated is not documented -- confirm */
190	size_t ustr_size,	/* capacity of ustr in bytes */
191	size_t *ustr_len);	/* out: length written, in bytes; invalid when ENOMEM is returned */
192
193	/*
194	* utf8_normalizeOptCaseFoldAndMatchSubstring
195	*
196	* Determine whether the normalized UTF32 string derived from a specified UTF-8 string
197	* strA contains another UTF32 string ustrB which has already been normalized, typically
198	* with utf8_normalizeOptCaseFold. The normalization for both strings is one of the following,
199	* as specified by the case_sens parameter:
200	* - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
201	* - standard NFD; for case-sensitive comparison (if case_sens = true).
202	* On success, sets has_match to true if strA contains ustrB, or false otherwise.
203	*
204	* The input string strA should be valid UTF-8 that meets the criteria for stream safe
205	* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
206	* It should not contain ASCII 0x00 or '/'.
207	*
208	* strA: A UTF-8 string (need not be 0 terminated) in which to search for the
209	* substring specified by ustrB.
210	* strA_len: The byte length of strA (excluding any 0 terminator)
211	* ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched
212	* for in the UTF-32 string resulting from converting strA to the normalized
213	* UTF-32 form specified by the case_sens parameter; ustrB must already be
214	* in that form. Normally this will be produced using utf8_normalizeOptCaseFold.
215	* ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
216	* case_sens: False for case-insensitive matching; compares canonical caseless forms.
217	* True for case-sensitive matching; compares standard NFD forms.
218	* buf: Pointer to caller-supplied working memory for storing the portion of
219	* strA which has been converted to normalized UTF-32.
220	* buf_size: The size of buf.
221	* has_match: On success, set to true if strA (when converted to UTF-32 and normalized
222	* per case_sens) contains ustrB, set to false otherwise.
223	*
224	* Returns: 0 on success, or
225	* EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
226	* not well-formed stream-safe UTF-8, or contains codepoints that are
227	* non-characters or unassigned in the version of Unicode currently
228	* supported.
229	* Note: The search may terminate early when a match is detected, and
230	* may return 0 and set *has_match=true even if strA is invalid.
231	* ENOMEM: buf_size is insufficient.
232	*/
233	int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,	/* UTF-8 string to search in; need not be 0 terminated */
234	size_t strA_len,	/* byte length of strA, excluding any 0 terminator */
235	const int32_t *ustrB,	/* already-normalized UTF-32 substring to search for */
236	int32_t ustrB_len,	/* length of ustrB in UTF-32 units, excluding any 0 terminator */
237	bool case_sens,	/* true: match standard NFD forms; false: match canonical caseless forms */
238	void *buf,	/* caller-supplied working memory for the normalized portion of strA */
239	size_t buf_size,	/* size of buf; NOTE(review): unit (bytes vs. UTF-32 units) is not documented -- confirm */
240	bool *has_match);	/* out: set on success; true if normalized strA contains ustrB */
241
242	/*
243	* utf8_normalizeOptCaseFoldGetUVersion
244	*
245	* Get the Unicode and code version currently associated with the normalizeOptCaseFold
246	* functions. The caller allocates the version array and passes it to the function,
247	* which will fill out the array as follows:
248	* version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
249	* version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
250	* version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
251	* version[3] = Code revision level; for any given Unicode version, this value starts
252	* at 0 and is incremented for each significant revision to the
253	* normalizeOptCaseFold functions.
254	*/
255	void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);	/* out: version[0..3] = Unicode major, minor, patch, code revision; caller allocates the 4-element array */
256
257	#endif /* KERNEL_PRIVATE */
258
259	#endif /* unicode_h */

f427ee49
A

1

/*

2

3

*

4

* @APPLE_LICENSE_HEADER_START@

5

*

6

* This file contains Original Code and/or Modifications of Original Code

7

* as defined in and that are subject to the Apple Public Source License

8

* Version 2.0 (the 'License'). You may not use this file except in

9

* compliance with the License. Please obtain a copy of the License at

10

* http://www.opensource.apple.com/apsl/ and read it before using this

11

* file.

12

*

13

* The Original Code and all software distributed under the License are

14

* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER

15

* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,

16

* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,

17

* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.

18

* Please see the License for the specific language governing rights and

19

* limitations under the License.

20

*

21

* @APPLE_LICENSE_HEADER_END@

*/

#ifndef unicode_h

#define unicode_h

#ifdef KERNEL_PRIVATE

28

29

#include <sys/cdefs.h>

#include <stdbool.h>

/*

* WARNING - callers that use the following Unicode normalization interface for on-disk

34

* structures should be aware that the implementation will be periodically updated for

35

* the latest Unicode standard version.

*/

enum {

/* Maximum size of UTF32 reordering buffer for stream-safe format */

40

kNCFStreamSafeBufMax = 32

};

/*

* utf8_normalizeOptCaseFoldAndHash

45

*

46

* Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,

47

* as specified by the case_sens parameter, and feed the result incrementally to

48

* the provided hash function callback:

49

* - "canonical caseless form" (case-folded NFD, as described by definition D145

50

* in chapter 3 of The Unicode Standard); for case-insensitive behavior.

51

* - standard NFD; for case-sensitive behavior (if case_sens = true).

52

*

53

* The input string should be valid UTF-8 that meets the criteria for stream safe

54

* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.

55

* It should not contain ASCII 0x00 or '/'.

56

*

57

* str: The input UTF-8 string (need not be 0 terminated)

58

* str_len: The byte length of the input string (excluding any 0 terminator)

59

* case_sens: False for case-insensitive behavior; generates canonical caseless form.

60

* True for case-sensitive behavior; generates standard NFD.

61

* hash_func: A pointer to a hashing function to compute the hash of the

62

* normalized/case-folded result. buf contains buf_len bytes

63

* of data to be added to the hash using the caller-supplied

64

* context (ctx).

65

* hash_ctx: The context for the hash function.

66

*

67

* Returns: 0 on success, or

68

* EILSEQ: The input string contains illegal ASCII-range characters

69

* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or

70

* contains codepoints that are non-characters or unassigned in

71

* the version of Unicode currently supported.

72

*/

73

int utf8_normalizeOptCaseFoldAndHash(const char *str,

74

size_t str_len,

75

bool case_sens,

76

void (*hash_func)(void *buf, size_t buf_len, void *ctx),

void *hash_ctx);

/*

* utf8_normalizeOptCaseFoldAndCompare

81

*

82

* Determine whether two UTF-8 strings are equal after converting each to one of the

83

* following normalized forms, as specified by the case_sens parameter:

84

* - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.

85

* - standard NFD; for case-sensitive comparison (if case_sens = true).

86

* On success, sets are_equal to true if the strings are equal, or false if they are not.

87

*

88

* The input strings should be valid UTF-8 that meet the criteria for stream safe

89

* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.

90

* They should not contain ASCII 0x00 or '/'.

91

*

92

* strA: A UTF-8 string to be compared (need not be 0 terminated)

93

* strA_len: The byte length of strA (excluding any 0 terminator)

94

* strB: The second UTF-8 string to be compared (need not be 0 terminated)

95

* strB_len: The byte length of strB (excluding any 0 terminator)

96

* case_sens: False for case-insensitive behavior; compares canonical caseless forms.

97

* True for case-sensitive behavior; compares standard NFD forms.

98

* are_equal: On success, set to true if the strings are equal, or set to false

99

* if they are not.

100

*

101

* Returns: 0 on success, or

102

* EILSEQ: One or both of the input strings contains illegal ASCII-range

103

* characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,

104

* or contains codepoints that are non-characters or unassigned in

105

* the version of Unicode currently supported.

106

* Note: The comparison may terminate early when a difference is

107

* detected, and may return 0 and set *are_equal=false even

108

* if one or both strings are invalid.

109

*/

110

int utf8_normalizeOptCaseFoldAndCompare(const char *strA,

size_t strA_len,

const char *strB,

size_t strB_len,

bool case_sens,

bool *are_equal);

/*

* utf8_normalizeOptCaseFold

119

*

120

* Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,

121

* as specified by the case_sens parameter, and copy the result to the ustr

122

* buffer:

123

* - "canonical caseless form" (case-folded NFD, as described by definition D145

124

* in chapter 3 of The Unicode Standard); for case-insensitive behavior.

125

* - standard NFD; for case-sensitive behavior (if case_sens = true).

126

*

127

* The input string should be valid UTF-8 that meets the criteria for stream safe

128

* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.

129

* It should not contain ASCII 0x00 or '/'.

130

*

131

* str: The input UTF-8 string (need not be 0 terminated)

132

* str_len: The byte length of the input string (excluding any 0 terminator)

133

* case_sens: False for case-insensitive behavior; generates canonical caseless form.

134

* True for case-sensitive behavior; generates standard NFD.

135

* ustr: A pointer to a buffer for the resulting UTF-32 string.

136

* ustr_size: The capacity of ustr, in UTF-32 units.

137

* ustr_len: Pointer to a value that will be filled in with the actual length

138

* in UTF-32 units of the string copied to ustr.

139

*

140

* Returns: 0 on success, or

141

* EILSEQ: The input string contains illegal ASCII-range characters

142

* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or

143

* contains codepoints that are non-characters or unassigned in

144

* the version of Unicode currently supported.

145

* ENOMEM: ustr_size is insufficient for the resulting string. In this

146

* case the value returned in *ustr_len is invalid.

147

*/

148

int utf8_normalizeOptCaseFold(const char *str,

size_t str_len,

bool case_sens,

int32_t *ustr,

int32_t ustr_size,

int32_t *ustr_len);

/*

* utf8_normalizeOptCaseFoldToUTF8

157

*

158

* Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,

159

* as specified by the case_sens parameter, and copy the result to the ustr

160

* buffer:

161

* - "canonical caseless form" (case-folded NFD, as described by definition D145

162

* in chapter 3 of The Unicode Standard); for case-insensitive behavior.

163

* - standard NFD; for case-sensitive behavior (if case_sens = true).

164

*

165

* The input string should be valid UTF-8 that meets the criteria for stream safe

166

* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.

167

* It should not contain ASCII 0x00 or '/'.

168

*

169

* str: The input UTF-8 string (need not be 0 terminated)

170

* str_len: The byte length of the input string (excluding any 0 terminator)

171

* case_sens: False for case-insensitive behavior; generates canonical caseless form.

172

* True for case-sensitive behavior; generates standard NFD.

173

* ustr: A pointer to a buffer for the resulting UTF-8 string.

174

* ustr_size: The capacity of ustr, in bytes.

175

* ustr_len: Pointer to a value that will be filled in with the actual length

176

* in bytes of the string copied to ustr.

177

*

178

* Returns: 0 on success, or

179

* EILSEQ: The input string contains illegal ASCII-range characters

180

* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or

181

* contains codepoints that are non-characters or unassigned in

182

* the version of Unicode currently supported.

183

* ENOMEM: ustr_size is insufficient for the resulting string. In this

184

* case the value returned in *ustr_len is invalid.

185

*/

186

int utf8_normalizeOptCaseFoldToUTF8(const char *str,

size_t str_len,

bool case_sens,

char *ustr,

size_t ustr_size,

size_t *ustr_len);

/*

* utf8_normalizeOptCaseFoldAndMatchSubstring

195

*

196

* Determine whether the normalized UTF32 string derived from a specified UTF-8 string

197

* strA contains another UTF32 string ustrB which has already been normalized, typically

198

* with utf8_normalizeOptCaseFold. The normalization for both strings is one of the following,

199

* as specified by the case_sens parameter:

200

* - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.

201

* - standard NFD; for case-sensitive comparison (if case_sens = true).

202

* On success, sets has_match to true if strA contains ustrB, or false otherwise.

203

*

204

* The input string strA should be valid UTF-8 that meets the criteria for stream safe

205

* text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.

206

* It should not contain ASCII 0x00 or '/'.

207

*

208

* strA: A UTF-8 string (need not be 0 terminated) in which to search for the

209

* substring specified by ustrB.

210

* strA_len: The byte length of strA (excluding any 0 terminator)

211

* ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched

212

* for in the UTF-32 string resulting from converting strA to the normalized

213

* UTF-32 form specified by the case_sens parameter; ustrB must already be

214

* in that form. Normally this will be produced using utf8_normalizeOptCaseFold.

215

* ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).

216

* case_sens: False for case-insensitive matching; compares canonical caseless forms.

217

* True for case-sensitive matching; compares standard NFD forms.

218

* buf: Pointer to caller-supplied working memory for storing the portion of

219

* strA which has been converted to normalized UTF-32.

220

* buf_size: The size of buf.

221

* has_match: On success, set to true if strA (when converted to UTF-32 and normalized

222

* per case_sens) contains ustrB, set to false otherwise.

223

*

224

* Returns: 0 on success, or

225

* EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is

226

* not well-formed stream-safe UTF-8, or contains codepoints that are

227

* non-characters or unassigned in the version of Unicode currently

228

* supported.

229

* Note: The search may terminate early when a match is detected, and

230

* may return 0 and set *has_match=true even if strA is invalid.

231

* ENOMEM: buf_size is insufficient.

232

*/

233

int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,

234

size_t strA_len,

235

const int32_t *ustrB,

int32_t ustrB_len,

bool case_sens,

void *buf,

size_t buf_size,

bool *has_match);

/*

* utf8_normalizeOptCaseFoldGetUVersion

244

*

245

* Get the Unicode and code version currently associated with the normalizeOptCaseFold

246

* functions. The caller allocates the version array and passes it to the function,

247

* which will fill out the array as follows:

248

* version[0] = Unicode major version; for Unicode 6.3.0 this would be 6

249

* version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3

250

* version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0

251

* version[3] = Code revision level; for any given Unicode version, this value starts

252

* at 0 and is incremented for each significant revision to the

253

* normalizeOptCaseFold functions.

254

*/

255

void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);

256

257

#endif /* KERNEL_PRIVATE */

258

259

#endif /* unicode_h */