]>
Commit | Line | Data |
---|---|---|
f427ee49 A |
1 | /* |
2 | * Copyright (c) 2016-2020 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | #ifndef unicode_h | |
25 | #define unicode_h | |
26 | ||
27 | #ifdef KERNEL_PRIVATE | |
28 | ||
29 | #include <sys/cdefs.h> | |
30 | #include <stdbool.h> | |
31 | ||
32 | /* | |
33 | * WARNING - callers that use the following Unicode normalization interface for on-disk | |
34 | * structures should be aware that the implementation will be periodically updated for | |
35 | * the latest Unicode standard version. | |
36 | */ | |
37 | ||
38 | enum { | |
39 | /* Maximum size of the UTF-32 reordering buffer for stream-safe text format */ | |
40 | kNCFStreamSafeBufMax = 32 | |
41 | }; | |
42 | ||
43 | /* | |
44 | * utf8_normalizeOptCaseFoldAndHash | |
45 | * | |
46 | * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms, | |
47 | * as specified by the case_sens parameter, and feed the result incrementally to | |
48 | * the provided hash function callback: | |
49 | * - "canonical caseless form" (case-folded NFD, as described by definition D145 | |
50 | * in chapter 3 of The Unicode Standard); for case-insensitive behavior. | |
51 | * - standard NFD; for case-sensitive behavior (if case_sens = true). | |
52 | * | |
53 | * The input string should be valid UTF-8 that meets the criteria for stream safe | |
54 | * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format. | |
55 | * It should not contain ASCII 0x00 or '/'. | |
56 | * | |
57 | * str: The input UTF-8 string (need not be 0 terminated) | |
58 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
59 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
60 | * True for case-sensitive behavior; generates standard NFD. | |
61 | * hash_func: A pointer to a hashing function that is called incrementally to | |
62 | * hash the normalized/case-folded result. On each call, buf | |
63 | * contains buf_len bytes of data to be added to the hash using | |
64 | * the caller-supplied context (ctx). | |
65 | * hash_ctx: The context for the hash function; passed to hash_func as ctx. | |
66 | * | |
67 | * Returns: 0 on success, or | |
68 | * EILSEQ: The input string contains illegal ASCII-range characters | |
69 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
70 | * contains codepoints that are non-characters or unassigned in | |
71 | * the version of Unicode currently supported. | |
72 | */ | |
73 | int utf8_normalizeOptCaseFoldAndHash(const char *str, | |
74 | size_t str_len, | |
75 | bool case_sens, | |
76 | void (*hash_func)(void *buf, size_t buf_len, void *ctx), | |
77 | void *hash_ctx); | |
78 | ||
79 | /* | |
80 | * utf8_normalizeOptCaseFoldAndCompare | |
81 | * | |
82 | * Determine whether two UTF-8 strings are equal after converting each to one of the | |
83 | * following normalized forms, as specified by the case_sens parameter: | |
84 | * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison. | |
85 | * - standard NFD; for case-sensitive comparison (if case_sens = true). | |
86 | * On success, sets *are_equal to true if the strings are equal, or false if they are not. | |
87 | * | |
88 | * The input strings should be valid UTF-8 that meet the criteria for stream safe | |
89 | * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format. | |
90 | * They should not contain ASCII 0x00 or '/'. | |
91 | * | |
92 | * strA: The first UTF-8 string to be compared (need not be 0 terminated) | |
93 | * strA_len: The byte length of strA (excluding any 0 terminator) | |
94 | * strB: The second UTF-8 string to be compared (need not be 0 terminated) | |
95 | * strB_len: The byte length of strB (excluding any 0 terminator) | |
96 | * case_sens: False for case-insensitive behavior; compares canonical caseless forms. | |
97 | * True for case-sensitive behavior; compares standard NFD forms. | |
98 | * are_equal: On success, set to true if the strings are equal, or set to false | |
99 | * if they are not. | |
100 | * | |
101 | * Returns: 0 on success, or | |
102 | * EILSEQ: One or both of the input strings contains illegal ASCII-range | |
103 | * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, | |
104 | * or contains codepoints that are non-characters or unassigned in | |
105 | * the version of Unicode currently supported. | |
106 | * Note: The comparison may terminate early when a difference is | |
107 | * detected, and may return 0 and set *are_equal=false even | |
108 | * if one or both strings are invalid. | |
109 | */ | |
110 | int utf8_normalizeOptCaseFoldAndCompare(const char *strA, | |
111 | size_t strA_len, | |
112 | const char *strB, | |
113 | size_t strB_len, | |
114 | bool case_sens, | |
115 | bool *are_equal); | |
116 | ||
117 | /* | |
118 | * utf8_normalizeOptCaseFold | |
119 | * | |
120 | * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms, | |
121 | * as specified by the case_sens parameter, and copy the result to the ustr | |
122 | * buffer: | |
123 | * - "canonical caseless form" (case-folded NFD, as described by definition D145 | |
124 | * in chapter 3 of The Unicode Standard); for case-insensitive behavior. | |
125 | * - standard NFD; for case-sensitive behavior (if case_sens = true). | |
126 | * | |
127 | * The input string should be valid UTF-8 that meets the criteria for stream safe | |
128 | * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format. | |
129 | * It should not contain ASCII 0x00 or '/'. | |
130 | * | |
131 | * str: The input UTF-8 string (need not be 0 terminated) | |
132 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
133 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
134 | * True for case-sensitive behavior; generates standard NFD. | |
135 | * ustr: A pointer to the caller's buffer that receives the resulting UTF-32 string. | |
136 | * ustr_size: The capacity of ustr, in UTF-32 units. | |
137 | * ustr_len: Pointer to a value that will be filled in with the actual length | |
138 | * in UTF-32 units of the string copied to ustr. | |
139 | * | |
140 | * Returns: 0 on success, or | |
141 | * EILSEQ: The input string contains illegal ASCII-range characters | |
142 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
143 | * contains codepoints that are non-characters or unassigned in | |
144 | * the version of Unicode currently supported. | |
145 | * ENOMEM: ustr_size is insufficient for the resulting string. In this | |
146 | * case the value returned in *ustr_len is invalid. | |
147 | */ | |
148 | int utf8_normalizeOptCaseFold(const char *str, | |
149 | size_t str_len, | |
150 | bool case_sens, | |
151 | int32_t *ustr, | |
152 | int32_t ustr_size, | |
153 | int32_t *ustr_len); | |
154 | ||
155 | /* | |
156 | * utf8_normalizeOptCaseFoldToUTF8 | |
157 | * | |
158 | * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms, | |
159 | * as specified by the case_sens parameter, and copy the result to the ustr | |
160 | * buffer: | |
161 | * - "canonical caseless form" (case-folded NFD, as described by definition D145 | |
162 | * in chapter 3 of The Unicode Standard); for case-insensitive behavior. | |
163 | * - standard NFD; for case-sensitive behavior (if case_sens = true). | |
164 | * | |
165 | * The input string should be valid UTF-8 that meets the criteria for stream safe | |
166 | * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format. | |
167 | * It should not contain ASCII 0x00 or '/'. | |
168 | * | |
169 | * str: The input UTF-8 string (need not be 0 terminated) | |
170 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
171 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
172 | * True for case-sensitive behavior; generates standard NFD. | |
173 | * ustr: A pointer to the caller's buffer that receives the resulting UTF-8 string. | |
174 | * ustr_size: The capacity of ustr, in bytes. | |
175 | * ustr_len: Pointer to a value that will be filled in with the actual length | |
176 | * in bytes of the string copied to ustr. | |
177 | * | |
178 | * Returns: 0 on success, or | |
179 | * EILSEQ: The input string contains illegal ASCII-range characters | |
180 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
181 | * contains codepoints that are non-characters or unassigned in | |
182 | * the version of Unicode currently supported. | |
183 | * ENOMEM: ustr_size is insufficient for the resulting string. In this | |
184 | * case the value returned in *ustr_len is invalid. | |
185 | */ | |
186 | int utf8_normalizeOptCaseFoldToUTF8(const char *str, | |
187 | size_t str_len, | |
188 | bool case_sens, | |
189 | char *ustr, | |
190 | size_t ustr_size, | |
191 | size_t *ustr_len); | |
192 | ||
193 | /* | |
194 | * utf8_normalizeOptCaseFoldAndMatchSubstring | |
195 | * | |
196 | * Determine whether the normalized UTF-32 string derived from a specified UTF-8 string | |
197 | * strA contains another UTF-32 string ustrB which has already been normalized, typically | |
198 | * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the following, | |
199 | * as specified by the case_sens parameter: | |
200 | * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison. | |
201 | * - standard NFD; for case-sensitive comparison (if case_sens = true). | |
202 | * On success, sets *has_match to true if strA contains ustrB, or false otherwise. | |
203 | * | |
204 | * The input string strA should be valid UTF-8 that meets the criteria for stream safe | |
205 | * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format. | |
206 | * It should not contain ASCII 0x00 or '/'. | |
207 | * | |
208 | * strA: A UTF-8 string (need not be 0 terminated) in which to search for the | |
209 | * substring specified by ustrB. | |
210 | * strA_len: The byte length of strA (excluding any 0 terminator) | |
211 | * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched | |
212 | * for in the UTF-32 string resulting from converting strA to the normalized | |
213 | * UTF-32 form specified by the case_sens parameter; ustrB must already be | |
214 | * in that form. Normally this will be produced using utf8_normalizeOptCaseFold. | |
215 | * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). | |
216 | * case_sens: False for case-insensitive matching; compares canonical caseless forms. | |
217 | * True for case-sensitive matching; compares standard NFD forms. | |
218 | * buf: Pointer to caller-supplied working memory for storing the portion of | |
219 | * strA which has been converted to normalized UTF-32. | |
220 | * buf_size: The size of buf. | |
221 | * has_match: On success, set to true if strA (when converted to UTF-32 and normalized | |
222 | * per case_sens) contains ustrB, set to false otherwise. | |
223 | * | |
224 | * Returns: 0 on success, or | |
225 | * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is | |
226 | * not well-formed stream-safe UTF-8, or contains codepoints that are | |
227 | * non-characters or unassigned in the version of Unicode currently | |
228 | * supported. | |
229 | * Note: The search may terminate early when a match is detected, and | |
230 | * may return 0 and set *has_match=true even if strA is invalid. | |
231 | * ENOMEM: buf_size is insufficient. | |
232 | */ | |
233 | int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA, | |
234 | size_t strA_len, | |
235 | const int32_t *ustrB, | |
236 | int32_t ustrB_len, | |
237 | bool case_sens, | |
238 | void *buf, | |
239 | size_t buf_size, | |
240 | bool *has_match); | |
241 | ||
242 | /* | |
243 | * utf8_normalizeOptCaseFoldGetUVersion | |
244 | * | |
245 | * Get the Unicode and code version currently associated with the utf8_normalizeOptCaseFold | |
246 | * functions. The caller allocates the version array and passes it to the function, | |
247 | * which will fill out the array as follows: | |
248 | * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 | |
249 | * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 | |
250 | * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 | |
251 | * version[3] = Code revision level; for any given Unicode version, this value starts | |
252 | * at 0 and is incremented for each significant revision to the | |
253 | * utf8_normalizeOptCaseFold functions. | |
254 | */ | |
255 | void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]); | |
256 | ||
257 | #endif /* KERNEL_PRIVATE */ | |
258 | ||
259 | #endif /* unicode_h */ |