]>
Commit | Line | Data |
---|---|---|
f427ee49 A |
1 | /* |
2 | * Copyright (C) 2016-2020 Apple, Inc. All rights reserved. | |
3 | * Some portions covered by other copyrights, listed below. | |
4 | *--- | |
5 | * Copyright (C) 2016 and later: Unicode, Inc. and others. | |
6 | * License & terms of use: http://www.unicode.org/copyright.html | |
7 | *--- | |
8 | * Copyright (C) 1999-2015, International Business Machines | |
9 | * Corporation and others. All Rights Reserved. | |
10 | * | |
11 | * add APPLE_OSREFERENCE_LICENSE_HEADER stuff... | |
12 | */ | |
13 | ||
14 | #include <libkern/libkern.h> | |
15 | #include <sys/errno.h> | |
16 | #include <sys/unicode.h> | |
17 | #include "vfs_unicode_data.h" | |
18 | #define STATIC_UNLESS_TEST static | |
19 | ||
20 | enum { | |
21 | /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */ | |
22 | kMaxUTF8BytesPerChar = 4 | |
23 | }; | |
24 | ||
25 | /* local prototypes used by exported functions (and themselves exported for testing) */ | |
26 | STATIC_UNLESS_TEST | |
27 | int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit); | |
28 | STATIC_UNLESS_TEST | |
29 | int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, | |
30 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], | |
31 | uint8_t combClass[kNFCSingleCharDecompMax]); | |
32 | /* local prototypes used by exported functions (not exported for separate testing) */ | |
33 | static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, | |
34 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, | |
35 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, | |
36 | bool* needReorderP, bool* startP); | |
37 | void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen); | |
38 | int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]); | |
39 | ||
40 | /* | |
41 | * utf8_normalizeOptCaseFoldGetUVersion | |
42 | * | |
43 | * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 | |
44 | * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 | |
45 | * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 | |
46 | * version[3] = Code revision level; for any given Unicode version, this value starts | |
47 | * at 0 and is incremented for each significant revision to the | |
48 | * normalizeOptCaseFold functions. | |
49 | */ | |
50 | void | |
51 | utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]) | |
52 | { | |
53 | version[0] = 13; | |
54 | version[1] = 0; | |
55 | version[2] = 0; | |
56 | version[3] = 0; | |
57 | return; | |
58 | } | |
59 | ||
60 | /* | |
61 | * utf8_normalizeOptCaseFoldAndHash | |
62 | * | |
63 | * str: The input UTF-8 string (need not be 0 terminated) | |
64 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
65 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
66 | * True for case-sensitive behavior; generates standard NFD. | |
67 | * hash_func: A pointer to a hashing function to compute the hash of the | |
68 | * normalized/case-folded result. buf contains buf_len bytes | |
69 | * of data to be added to the hash using the caller-supplied | |
70 | * context (ctx). | |
71 | * hash_ctx: The context for the hash function. | |
72 | * | |
73 | * Returns: 0 on success, or | |
74 | * EILSEQ: The input string contains illegal ASCII-range characters | |
75 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
76 | * contains codepoints that are non-characters or unassigned in | |
77 | * the version of Unicode currently supported (Unicode 9.0). | |
78 | */ | |
79 | ||
80 | int | |
81 | utf8_normalizeOptCaseFoldAndHash(const char *str, | |
82 | size_t str_len, | |
83 | bool case_sens, | |
84 | void (*hash_func)(void *buf, size_t buf_len, void *ctx), | |
85 | void *hash_ctx) | |
86 | { | |
87 | const char *strLimit = str + str_len; | |
88 | ||
89 | /* Data for the next pending single-char norm from input; | |
90 | * This will always begin with a base char (combining class 0) | |
91 | * or the first character in the string, which may no be a base */ | |
92 | int32_t unorm[kNFCSingleCharDecompMax]; | |
93 | uint8_t unormcc[kNFCSingleCharDecompMax]; | |
94 | int32_t unormlen = 0; | |
95 | int32_t unormstart = 0; | |
96 | ||
97 | bool start = true; | |
98 | ||
99 | /* main loop: | |
100 | * Each input character may be normalized to a sequence of one or more characters, | |
101 | * some of which may have non-zero combining class. Any sequence of characters | |
102 | * with non-zero combining class resulting from one or more input characters needs | |
103 | * to be accumulated in the main buffer so we can reorder as necessary before | |
104 | * calling the hash function. | |
105 | * | |
106 | * At the beginning of the main loop: The normalization buffer and main buffer are | |
107 | * both empty. | |
108 | * | |
109 | * Each time through the main loop we do the following: | |
110 | * 1. If there are characters available in the normalization result buffer (from the | |
111 | * result of normalizing a previous input character), copy the first character and | |
112 | * any following characters that have non-zero combining class to the main buffer. | |
113 | * 2. If there is nothing left in the normalization buffer, then loop processing | |
114 | * input characters as follows: | |
115 | * a) Get the next input character from UTF8, get its normalized and case-folded | |
116 | * result in the normalization buffer. | |
117 | * b) If the first character in the normalization buffer has combining class 0, | |
118 | * break; we will handle this normalization buffer next time through the main | |
119 | * loop. | |
120 | * c) Else copy the current normalization buffer (which has only combining marks) | |
121 | * to the main buffer, and continue with the loop processing input characters. | |
122 | * 3. At this point the first character in the main buffer may or may not have | |
123 | * combining class 0, but any subsequent characters (up to the the limit for | |
124 | * stream safe text) will be combining characters with nonzero combining class. | |
125 | * Reorder the combining marks if necessary into canonical order. | |
126 | * 4. Call the hash function for each character in the main buffer. | |
127 | * | |
128 | */ | |
129 | do { | |
130 | /* Data for the buffers being built up from input */ | |
131 | int32_t buf[kNCFStreamSafeBufMax]; | |
132 | uint8_t bufcc[kNCFStreamSafeBufMax]; | |
133 | int32_t buflen = 0; | |
134 | bool needReorder = false; | |
135 | int err; | |
136 | ||
137 | err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart, | |
138 | buf, bufcc, &buflen, &needReorder, &start); | |
139 | if (err != 0) { | |
140 | return err; | |
141 | } | |
142 | ||
143 | if (buflen > 0) { | |
144 | /* Now buffer should have all of the combining marks up to the next base char. | |
145 | * Normally it will also start with the last base char encountered (unless the | |
146 | * UTF8 string began with a combining mark). */ | |
147 | /* Now reorder combining marks if necessary. */ | |
148 | if (needReorder) { | |
149 | doReorder(buf, bufcc, buflen); | |
150 | } | |
151 | /* Now write to hash func */ | |
152 | hash_func(buf, buflen * sizeof(buf[0]), hash_ctx); | |
153 | } | |
154 | /* OK so far, top of loop clears buffers to start refilling again */ | |
155 | } while (str < strLimit || unormlen > 0); | |
156 | return 0; | |
157 | } | |
158 | ||
159 | /* | |
160 | * utf8_normalizeOptCaseFoldAndCompare | |
161 | * | |
162 | * strA: A UTF-8 string to be compared (need not be 0 terminated) | |
163 | * strA_len: The byte length of strA (excluding any 0 terminator) | |
164 | * strB: The second UTF-8 string to be compared (need not be 0 terminated) | |
165 | * strB_len: The byte length of strB (excluding any 0 terminator) | |
166 | * case_sens: False for case-insensitive behavior; compares canonical caseless forms. | |
167 | * True for case-sensitive behavior; compares standard NFD forms. | |
168 | * are_equal: On success, set to true if the strings are equal, or set to false | |
169 | * if they are not. | |
170 | * | |
171 | * Returns: 0 on success, or | |
172 | * EILSEQ: One or both of the input strings contains illegal ASCII-range | |
173 | * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, | |
174 | * or contains codepoints that are non-characters or unassigned in | |
175 | * the version of Unicode currently supported (Unicode 9.0). | |
176 | * Note: The comparison may terminate early when a difference is | |
177 | * detected, and may return 0 and set *are_equal=false even | |
178 | * if one or both strings are invalid. | |
179 | */ | |
180 | enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */ | |
181 | ||
182 | int | |
183 | utf8_normalizeOptCaseFoldAndCompare(const char *strA, | |
184 | size_t strA_len, | |
185 | const char *strB, | |
186 | size_t strB_len, | |
187 | bool case_sens, | |
188 | bool *are_equal) | |
189 | { | |
190 | const char *strALimit = strA + strA_len; | |
191 | const char *strBLimit = strB + strB_len; | |
192 | ||
193 | /* Data for the next pending single-char norms from each input; | |
194 | * These will always begin with a base char (combining class 0) | |
195 | * or the first character in the string, which may not be a base */ | |
196 | int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback]; | |
197 | uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback]; | |
198 | int32_t unormAlen = 0, unormBlen = 0; | |
199 | int32_t unormAstart = 0, unormBstart = 0; | |
200 | ||
201 | bool startA = true, startB = true; | |
202 | ||
203 | /* main loop: | |
204 | * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash, | |
205 | * described above. The differences are: | |
206 | * - We keep a normalization buffer and main buffer for each string. | |
207 | * - In the main loop, we do steps 1-3 for each string. | |
208 | * - In step 4, instead of calling the hash function, we compare the two main | |
209 | * buffers; if they are unequal, we return a non-equal result. | |
210 | * - After the end of the main loop, if we still have data for one string but | |
211 | * not the other, return a non-equal result, else return an equal result. | |
212 | */ | |
213 | do { | |
214 | /* Data for the buffers being built up from each input */ | |
215 | int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax]; | |
216 | uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax]; | |
217 | int32_t bufAlen = 0, bufBlen = 0; | |
218 | bool needReorderA = false, needReorderB = false; | |
219 | int err; | |
220 | ||
221 | err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, unormA, unormAcc, &unormAlen, &unormAstart, | |
222 | bufA, bufAcc, &bufAlen, &needReorderA, &startA); | |
223 | if (err != 0) { | |
224 | return err; | |
225 | } | |
226 | err = nextBaseAndAnyMarks(&strB, strBLimit, case_sens, unormB, unormBcc, &unormBlen, &unormBstart, | |
227 | bufB, bufBcc, &bufBlen, &needReorderB, &startB); | |
228 | if (err != 0) { | |
229 | return err; | |
230 | } | |
231 | ||
232 | if (bufAlen > 0 || bufBlen > 0) { | |
233 | /* Now each buffer should have all of the combining marks up to the next base char. | |
234 | * Normally it will also start with the last base char encountered (unless the | |
235 | * UTF8 string began with a combining mark). */ | |
236 | /* Now reorder combining marks if necessary. */ | |
237 | if (needReorderA) { | |
238 | doReorder(bufA, bufAcc, bufAlen); | |
239 | } | |
240 | if (needReorderB) { | |
241 | doReorder(bufB, bufBcc, bufBlen); | |
242 | } | |
243 | /* handle 03B9 pushback */ | |
244 | int32_t idx; | |
245 | if (!case_sens) { | |
246 | if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) { | |
247 | int32_t tailCount = 0; | |
248 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) { | |
249 | tailCount++; | |
250 | bufAlen--; | |
251 | } | |
252 | for (idx = unormAlen; idx > 0; idx--) { | |
253 | unormA[idx - 1 + tailCount] = unormA[idx - 1]; | |
254 | unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1]; | |
255 | } | |
256 | for (idx = 0; idx < tailCount; idx++) { | |
257 | unormA[idx] = 0x03B9; | |
258 | unormAcc[idx] = 0; | |
259 | } | |
260 | unormAlen += tailCount; | |
261 | } | |
262 | if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) { | |
263 | int32_t tailCount = 0; | |
264 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) { | |
265 | tailCount++; | |
266 | bufBlen--; | |
267 | } | |
268 | for (idx = unormBlen; idx > 0; idx--) { | |
269 | unormB[idx - 1 + tailCount] = unormB[idx - 1]; | |
270 | unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1]; | |
271 | } | |
272 | for (idx = 0; idx < tailCount; idx++) { | |
273 | unormB[idx] = 0x03B9; | |
274 | unormBcc[idx] = 0; | |
275 | } | |
276 | unormBlen += tailCount; | |
277 | } | |
278 | } | |
279 | /* Now compare the buffers. */ | |
280 | if (bufAlen != bufBlen || memcmp(bufA, bufB, bufAlen * sizeof(bufA[0])) != 0) { | |
281 | *are_equal = false; | |
282 | return 0; | |
283 | } | |
284 | } | |
285 | /* OK so far, top of loop clears buffers to start refilling again */ | |
286 | } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0)); | |
287 | ||
288 | *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0); | |
289 | return 0; | |
290 | } | |
291 | ||
292 | /* | |
293 | * utf8_normalizeOptCaseFold | |
294 | * | |
295 | * str: The input UTF-8 string (need not be 0 terminated) | |
296 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
297 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
298 | * True for case-sensitive behavior; generates standard NFD. | |
299 | * ustr: A pointer to a buffer for the resulting UTF-32 string. | |
300 | * ustr_size: The capacity of ustr, in UTF-32 units. | |
301 | * ustr_len: Pointer to a value that will be filled in with the actual length | |
302 | * in UTF-32 units of the string copied to ustr. | |
303 | * | |
304 | * Returns: 0 on success, or | |
305 | * EILSEQ: The input string contains illegal ASCII-range characters | |
306 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
307 | * contains codepoints that are non-characters or unassigned in | |
308 | * the version of Unicode currently supported. | |
309 | * ENOMEM: ustr_size is insufficient for the resulting string. In this | |
310 | * case the value returned in *ustr_len is invalid. | |
311 | */ | |
312 | int | |
313 | utf8_normalizeOptCaseFold(const char *str, | |
314 | size_t str_len, | |
315 | bool case_sens, | |
316 | int32_t *ustr, | |
317 | int32_t ustr_size, | |
318 | int32_t *ustr_len) | |
319 | { | |
320 | const char *strLimit = str + str_len; | |
321 | int32_t *ustrCur = ustr; | |
322 | const int32_t *ustrLimit = ustr + ustr_size; | |
323 | ||
324 | /* Data for the next pending single-char norm from input; | |
325 | * This will always begin with a base char (combining class 0) */ | |
326 | int32_t unorm[kNFCSingleCharDecompMax]; | |
327 | uint8_t unormcc[kNFCSingleCharDecompMax]; | |
328 | int32_t unormlen = 0; | |
329 | int32_t unormstart = 0; | |
330 | ||
331 | bool start = true; | |
332 | ||
333 | *ustr_len = 0; | |
334 | do { | |
335 | /* Data for the buffers being built up from input */ | |
336 | int32_t buf[kNCFStreamSafeBufMax]; | |
337 | uint8_t bufcc[kNCFStreamSafeBufMax]; | |
338 | int32_t buflen = 0; | |
339 | bool needReorder = false; | |
340 | int err; | |
341 | ||
342 | err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart, | |
343 | buf, bufcc, &buflen, &needReorder, &start); | |
344 | if (err != 0) { | |
345 | return err; | |
346 | } | |
347 | ||
348 | if (buflen > 0) { | |
349 | if (needReorder) { | |
350 | doReorder(buf, bufcc, buflen); | |
351 | } | |
352 | /* Now copy to output buffer */ | |
353 | int32_t idx; | |
354 | if (ustrCur + buflen > ustrLimit) { | |
355 | return ENOMEM; | |
356 | } | |
357 | for (idx = 0; idx < buflen; idx++) { | |
358 | *ustrCur++ = buf[idx]; | |
359 | } | |
360 | } | |
361 | /* OK so far, top of loop clears buffers to start refilling again */ | |
362 | } while (str < strLimit || unormlen > 0); | |
363 | *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop | |
364 | return 0; | |
365 | } | |
366 | ||
367 | /* | |
368 | * utf8_normalizeOptCaseFoldToUTF8 | |
369 | * (This is similar to normalizeOptCaseFold except that this has a different output | |
370 | * buffer type, and adds conversion to UTF8 while copying to output buffer) | |
371 | * | |
372 | * str: The input UTF-8 string (need not be 0 terminated) | |
373 | * str_len: The byte length of the input string (excluding any 0 terminator) | |
374 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. | |
375 | * True for case-sensitive behavior; generates standard NFD. | |
376 | * ustr: A pointer to a buffer for the resulting UTF-8 string. | |
377 | * ustr_size: The capacity of ustr, in bytes. | |
378 | * ustr_len: Pointer to a value that will be filled in with the actual length | |
379 | * in bytes of the string copied to ustr. | |
380 | * | |
381 | * Returns: 0 on success, or | |
382 | * EILSEQ: The input string contains illegal ASCII-range characters | |
383 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or | |
384 | * contains codepoints that are non-characters or unassigned in | |
385 | * the version of Unicode currently supported. | |
386 | * ENOMEM: ustr_size is insufficient for the resulting string. In this | |
387 | * case the value returned in *ustr_len is invalid. | |
388 | */ | |
389 | int | |
390 | utf8_normalizeOptCaseFoldToUTF8(const char *str, | |
391 | size_t str_len, | |
392 | bool case_sens, | |
393 | char *ustr, | |
394 | size_t ustr_size, | |
395 | size_t *ustr_len) | |
396 | { | |
397 | const char *strLimit = str + str_len; | |
398 | char *ustrCur = ustr; | |
399 | const char *ustrLimit = ustr + ustr_size; | |
400 | ||
401 | /* Data for the next pending single-char norm from input; | |
402 | * This will always begin with a base char (combining class 0) */ | |
403 | int32_t unorm[kNFCSingleCharDecompMax]; | |
404 | uint8_t unormcc[kNFCSingleCharDecompMax]; | |
405 | int32_t unormlen = 0; | |
406 | int32_t unormstart = 0; | |
407 | ||
408 | bool start = true; | |
409 | ||
410 | *ustr_len = 0; | |
411 | do { | |
412 | /* Data for the buffers being built up from input */ | |
413 | int32_t buf[kNCFStreamSafeBufMax]; | |
414 | uint8_t bufcc[kNCFStreamSafeBufMax]; | |
415 | int32_t buflen = 0; | |
416 | bool needReorder = false; | |
417 | int err; | |
418 | ||
419 | err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart, | |
420 | buf, bufcc, &buflen, &needReorder, &start); | |
421 | if (err != 0) { | |
422 | return err; | |
423 | } | |
424 | ||
425 | if (buflen > 0) { | |
426 | uint8_t utf8Bytes[kMaxUTF8BytesPerChar]; | |
427 | int32_t *bufPtr = buf; | |
428 | if (needReorder) { | |
429 | doReorder(buf, bufcc, buflen); | |
430 | } | |
431 | /* Now copy to output buffer */ | |
432 | while (buflen-- > 0) { | |
433 | int32_t idx, utf8Len = u32CharToUTF8Bytes((uint32_t)*bufPtr++, utf8Bytes); | |
434 | if (ustrCur + utf8Len > ustrLimit) { | |
435 | return ENOMEM; | |
436 | } | |
437 | for (idx = 0; idx < utf8Len; idx++) { | |
438 | *ustrCur++ = (char)utf8Bytes[idx]; | |
439 | } | |
440 | } | |
441 | } | |
442 | /* OK so far, top of loop clears buffers to start refilling again */ | |
443 | } while (str < strLimit || unormlen > 0); | |
444 | *ustr_len = ustrCur - ustr; | |
445 | return 0; | |
446 | } | |
447 | ||
448 | /* | |
449 | * utf8_normalizeOptCaseFoldAndMatchSubstring | |
450 | * | |
451 | * strA: A UTF-8 string (need not be 0 terminated) in which to search for the | |
452 | * substring specified by ustrB. | |
453 | * strA_len: The byte length of strA (excluding any 0 terminator) | |
454 | * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched | |
455 | * for in the UTF-32 string resulting from converting strA to the normalized | |
456 | * UTF-32 form specified by the case_sens parameter; ustrB must already be | |
457 | * in that form. | |
458 | * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). | |
459 | * case_sens: False for case-insensitive matching; compares canonical caseless forms. | |
460 | * True for case-sensitive matching; compares standard NFD forms. | |
461 | * buf: Pointer to caller-supplied working memory for storing the portion of | |
462 | * strA which has been converted to normalized UTF-32. | |
463 | * buf_size: The size of buf. | |
464 | * has_match: On success, set to true if strA (when converter to UTF-32 and normalized | |
465 | * per case_sens) contains ustrB, set to false otherwise. | |
466 | * | |
467 | * Returns: 0 on success, or | |
468 | * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is | |
469 | * not well-formed stream-safe UTF-8, or contains codepoints that are | |
470 | * non-characters or unassigned in the version of Unicode currently | |
471 | * supported. | |
472 | * Note: The search may terminate early when a match is detected, and | |
473 | * may return 0 and set *has_match=true even if strA is invalid. | |
474 | * ENOMEM: buf_size is insufficient. | |
475 | */ | |
476 | int | |
477 | utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA, | |
478 | size_t strA_len, | |
479 | const int32_t *ustrB, | |
480 | int32_t ustrB_len, | |
481 | bool case_sens, | |
482 | void *buf, | |
483 | size_t buf_size, | |
484 | bool *has_match) | |
485 | { | |
486 | /* | |
487 | * ustrA represents the current position in the UTF-32 normalized version of strA | |
488 | * at which we want to test for a match; ustrANormEnd is the position beyond that | |
489 | * which is just after the end of what has already been converted from strA to | |
490 | * UTF-32 normalized form. | |
491 | * Each time through the main loop: | |
492 | * - The first task is to make sure we have enough of strA converted to UTF32 | |
493 | * normalized form to test for match with ustrB at the current match position. | |
494 | * If we don't, then convert more of strA to UTF-32 normalized form until we | |
495 | * have enough to compare with ustrB. To do this, run a loop which is like the | |
496 | * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of | |
497 | * calling the hash function, we copy the normalized buffer to ustrANormEnd, | |
498 | * advancing the latter. We keep doing this until we have enough additional | |
499 | * converted to match with ustrB. | |
500 | * - Then we test for match of ustrB at the current ustrA position. If there is | |
501 | * a match we return; otherwise, if there is more strA to convert we advance | |
502 | * ustrA and repeat the main loop, otherwise we return without a match. | |
503 | */ | |
504 | if (ustrB_len == 0) { /* always matches */ | |
505 | *has_match = true; | |
506 | return 0; | |
507 | } | |
508 | *has_match = false; /* initialize return value */ | |
509 | if (ustrB_len > 2 * strA_len) { | |
510 | /* If ustrB is clearly too long to find in strA, don't bother normalizing strA. | |
511 | * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit. | |
512 | * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units. | |
513 | * The maximum expansion from unnormalized UTF-8 byte length to normalized | |
514 | * UTF-32 unit length is thus 2. */ | |
515 | return 0; | |
516 | } | |
517 | ||
518 | const char *strALimit = strA + strA_len; | |
519 | int32_t *ustrA = (int32_t *)buf; | |
520 | const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t)); | |
521 | int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */ | |
522 | ||
523 | /* Data for the next pending single-char norms from each input; | |
524 | * These will always begin with a base char (combining class 0) | |
525 | * or the first character in the string, which may not be a base */ | |
526 | int32_t unormA[kNFCSingleCharDecompMax]; | |
527 | uint8_t unormAcc[kNFCSingleCharDecompMax]; | |
528 | int32_t unormAlen = 0; | |
529 | int32_t unormAstart = 0; | |
530 | ||
531 | bool startA = true; | |
532 | ||
533 | while (true) { | |
534 | /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */ | |
535 | if (ustrANormEnd - ustrA < ustrB_len) { | |
536 | do { | |
537 | /* Data for the buffers being built up from each input */ | |
538 | int32_t bufA[kNCFStreamSafeBufMax]; | |
539 | uint8_t bufAcc[kNCFStreamSafeBufMax]; | |
540 | int32_t bufAlen = 0; | |
541 | bool needReorderA = false; | |
542 | int err; | |
543 | ||
544 | err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, unormA, unormAcc, &unormAlen, &unormAstart, | |
545 | bufA, bufAcc, &bufAlen, &needReorderA, &startA); | |
546 | if (err != 0) { | |
547 | return err; | |
548 | } | |
549 | ||
550 | if (bufAlen > 0) { | |
551 | /* Now each buffer should have all of the combining marks up to the next base char. | |
552 | * Normally it will also start with the last base char encountered (unless the | |
553 | * UTF8 string began with a combining mark). */ | |
554 | /* Now reorder combining marks if necessary. Should be rare, and sequences should | |
555 | * usually be short when does occur => simple bubblesort should be sufficient. */ | |
556 | if (needReorderA) { | |
557 | doReorder(bufA, bufAcc, bufAlen); | |
558 | } | |
559 | /* Now copy to working buffer */ | |
560 | int32_t idx; | |
561 | if (ustrANormEnd + bufAlen > ustrALimit) { | |
562 | return ENOMEM; | |
563 | } | |
564 | for (idx = 0; idx < bufAlen; idx++) { | |
565 | *ustrANormEnd++ = bufA[idx]; | |
566 | } | |
567 | } | |
568 | /* OK so far, top of loop clears buffers to start refilling again */ | |
569 | } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0)); | |
570 | } | |
571 | ||
572 | if (ustrANormEnd - ustrA < ustrB_len) { | |
573 | return 0; /* not enough of strA left for match */ | |
574 | } | |
575 | /* check for match, return if so */ | |
576 | if (memcmp(ustrA, ustrB, ustrB_len * sizeof(ustrB[0])) == 0) { | |
577 | *has_match = true; | |
578 | return 0; | |
579 | } | |
580 | ustrA++; /* advance match position */ | |
581 | } | |
582 | } | |
583 | ||
584 | /* nextBaseAndAnyMarks: | |
585 | * Guts of code to get next bufferful of base character (or first char in string) | |
586 | * and all trailing combining marks. | |
587 | * This is called each time through the main loop of functions above, and does the | |
588 | * following: | |
589 | * 1. If there are characters available in the normalization result buffer (from the | |
590 | * result of normalizing a previous input character), copy the first character and | |
591 | * any following characters that have non-zero combining class to the main buffer. | |
592 | * 2. If there is nothing left in the normalization buffer, then loop processing | |
593 | * input characters as follows: | |
594 | * a) Get the next input character from UTF8, get its normalized and case-folded | |
595 | * result in the normalization buffer. | |
596 | * b) If the first character in the normalization buffer has combining class 0, | |
597 | * break; we will handle this normalization buffer next time through the main | |
598 | * loop. | |
599 | * c) Else copy the current normalization buffer (which has only combining marks) | |
600 | * to the main buffer, and continue with the loop processing input characters. | |
601 | */ | |
602 | ||
603 | static int | |
604 | nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, | |
605 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, | |
606 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, | |
607 | bool* needReorderP, bool* startP) | |
608 | { | |
609 | /* update buffers for str */ | |
610 | if (*unormlenP > 0 && *unormstartP < *unormlenP) { | |
611 | /* unorm begins with a base char; buflen should be 0 */ | |
612 | *needReorderP = false; | |
613 | for (*buflenP = 0; true;) { | |
614 | if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) { | |
615 | *needReorderP = true; | |
616 | } | |
617 | buf[*buflenP] = unorm[*unormstartP]; | |
618 | bufcc[(*buflenP)++] = unormcc[(*unormstartP)++]; | |
619 | if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) { | |
620 | break; | |
621 | } | |
622 | } | |
623 | } | |
624 | if (*unormstartP >= *unormlenP) { | |
625 | *unormstartP = *unormlenP = 0; | |
626 | while (*strP < strLimit) { | |
627 | int32_t idx; | |
628 | uint32_t bytevalue = (uint8_t)*(*strP)++; | |
629 | /* '/' is not produced by NFD decomposition from another character so we can | |
630 | * check for it before normalization */ | |
631 | if (bytevalue == 0 || bytevalue == 0x2F /*'/'*/) { | |
632 | return EILSEQ; | |
633 | } | |
634 | if (bytevalue < 0x80) { | |
635 | unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue; | |
636 | *unormlenP = 1; | |
637 | unormcc[0] = 0; | |
638 | *startP = false; | |
639 | break; | |
640 | } else { | |
641 | int32_t u32char = utf8ToU32Code(bytevalue, strP, strLimit); | |
642 | if (u32char <= 0) { | |
643 | return EILSEQ; | |
644 | } | |
645 | *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, unorm, unormcc); | |
646 | if (*unormlenP <= 0) { | |
647 | return EILSEQ; | |
648 | } | |
649 | if (unormcc[0] == 0 || *startP) { | |
650 | *startP = false; | |
651 | break; | |
652 | } | |
653 | } | |
654 | /* the latest char decomposes to just combining sequence, add to buffer being built */ | |
655 | if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) { | |
656 | return EILSEQ; | |
657 | } | |
658 | for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) { | |
659 | if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) { | |
660 | *needReorderP = true; | |
661 | } | |
662 | buf[*buflenP] = unorm[idx]; | |
663 | bufcc[*buflenP] = unormcc[idx]; | |
664 | } | |
665 | *unormlenP = 0; | |
666 | } | |
667 | } | |
668 | return 0; | |
669 | } | |
670 | ||
671 | /* local prototypes used only by internal functions */ | |
672 | static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]); | |
673 | static int32_t adjustCase(bool case_sens, int32_t uSeqLen, | |
674 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); | |
675 | static uint8_t getCombClassU32Char(int32_t u32char); | |
676 | static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); | |
677 | ||
678 | /* Reorder combining marks if necessary. Should be rare, and sequences should | |
679 | * usually be short when does occur => simple bubblesort should be sufficient. */ | |
680 | void | |
681 | doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen) | |
682 | { | |
683 | int32_t idx, jdx; | |
684 | for (idx = 0; idx < buflen - 1; idx++) { | |
685 | for (jdx = buflen - 1; jdx > idx; jdx--) { | |
686 | if (bufcc[jdx] < bufcc[jdx - 1]) { | |
687 | swapBufCharCCWithPrevious(jdx, buf, bufcc); | |
688 | } | |
689 | } | |
690 | } | |
691 | } | |
692 | /* swap function for bubblesort */ | |
693 | static void | |
694 | swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]) | |
695 | { | |
696 | int32_t bufchar = buf[jdx]; | |
697 | uint8_t bufccval = bufcc[jdx]; | |
698 | buf[jdx] = buf[jdx - 1]; | |
699 | bufcc[jdx] = bufcc[jdx - 1]; | |
700 | buf[jdx - 1] = bufchar; | |
701 | bufcc[jdx - 1] = bufccval; | |
702 | } | |
703 | ||
704 | /* | |
705 | * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes, | |
706 | * and returns the number of UTF8 bytes. | |
707 | * | |
708 | * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h). | |
709 | */ | |
710 | int32_t | |
711 | u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]) | |
712 | { | |
713 | int32_t idx = 0; | |
714 | if (u32char <= 0x7F) { | |
715 | utf8Bytes[idx++] = (uint8_t)u32char; | |
716 | } else { | |
717 | if (u32char <= 0x7FF) { | |
718 | utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0); | |
719 | } else { | |
720 | if (u32char <= 0xFFFF) { | |
721 | utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0); | |
722 | } else { | |
723 | utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0); | |
724 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80); | |
725 | } | |
726 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80); | |
727 | } | |
728 | utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80); | |
729 | } | |
730 | return idx; | |
731 | } | |
732 | ||
733 | /* two macros adapted from ICU's utf8.h */ | |
734 | #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \ | |
735 | ((uint8_t)(leadByte)<0XF0 ? \ | |
736 | ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \ | |
737 | (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0) | |
738 | ||
739 | #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) | |
740 | ||
741 | /* array adapted from ICU's utf_impl.c */ | |
742 | static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 }; | |
743 | ||
744 | /* | |
745 | * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point | |
746 | * | |
747 | * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c); | |
748 | * verified to produce the same results (adusted for the difference in API signature). | |
749 | * | |
750 | * assumes at entry that: | |
751 | * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character | |
752 | * has been read, and its value is in u32char | |
753 | * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly | |
754 | * points to the trail bytes for that UTF8 char. | |
755 | * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer) | |
756 | * | |
757 | * For a valid and complete UTF8 character, the function returns its value and advances | |
758 | * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1 | |
759 | * (and the value in *srcPtr is undefined). | |
760 | * Note that while it does not map to surrogate values (generates an error for malformed | |
761 | * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values | |
762 | * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error. | |
763 | * | |
764 | * equivalences used in adapted ICU code: | |
765 | * UChar = uint16_t | |
766 | * UChar32 = int32_t | |
767 | * | |
768 | * This has been validated against ICU behavior. | |
769 | */ | |
770 | STATIC_UNLESS_TEST | |
771 | int32_t | |
772 | utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit) | |
773 | { | |
774 | const char* src = *srcPtr; | |
775 | uint8_t pt1, pt2; | |
776 | if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) { | |
777 | /* handle U+1000..U+CFFF */ | |
778 | /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */ | |
779 | u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2); | |
780 | src += 2; | |
781 | } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) { | |
782 | /* handle U+0080..U+07FF */ | |
783 | u32char = ((u32char & 0x1F) << 6) | pt1; | |
784 | src++; | |
785 | } else { | |
786 | /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */ | |
787 | uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char); | |
788 | if (src + count <= srcLimit) { | |
789 | uint8_t trail; | |
790 | ||
791 | U8_MASK_LEAD_BYTE_LOC(u32char, count); | |
792 | switch (count) { | |
793 | /* branches 3, 2 fall through to the next one */ | |
794 | case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */ | |
795 | case 5: | |
796 | case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ | |
797 | break; | |
798 | case 3: | |
799 | trail = *src++ - 0X80; | |
800 | u32char = (u32char << 6) | trail; | |
801 | /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */ | |
802 | if (u32char >= 0x110 || trail > 0X3F) { | |
803 | break; | |
804 | } | |
805 | case 2: | |
806 | trail = *src++ - 0X80; | |
807 | u32char = (u32char << 6) | trail; | |
808 | /* | |
809 | * test for a surrogate D800..DFFF: | |
810 | * before the last (u32char<<6), a surrogate is u32char=360..37F | |
811 | */ | |
812 | if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) { | |
813 | break; | |
814 | } | |
815 | case 1: | |
816 | trail = *src++ - 0X80; | |
817 | u32char = (u32char << 6) | trail; | |
818 | if (trail > 0X3F) { | |
819 | break; | |
820 | } | |
821 | /* correct sequence - all trail bytes have (b7..b6)==(10) */ | |
822 | if (u32char >= utf8_minLegal[count]) { | |
823 | *srcPtr = src; | |
824 | return u32char; | |
825 | } | |
826 | /* no default branch to optimize switch() - all values are covered */ | |
827 | } | |
828 | } | |
829 | u32char = -1; | |
830 | } | |
831 | *srcPtr = src; | |
832 | return u32char; | |
833 | } | |
834 | ||
835 | /* | |
836 | * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result | |
837 | * and the combining classes for each resulting char, or indicate it is invalid. | |
838 | * | |
839 | * The normalized and case-folded result might be up to 4 UTF32 characters (current | |
840 | * max, could change in the future). | |
841 | * | |
842 | * u32char - input UTF32 code point | |
843 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only | |
844 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) | |
845 | * to receive the normalize result. | |
846 | * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) | |
847 | * to receive the combining classes for the characters in u32NormFoldBuf. If | |
848 | * the first entry has non-zero combining class, the remaining entries do too. | |
849 | * | |
850 | * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax | |
851 | * is insufficient (though it is assumed to be at least 3), else the length of the | |
852 | * normalized and case-folded result (currently in the range 1..4). | |
853 | * | |
854 | * This has been validated against ICU behavior. | |
855 | * | |
856 | * This function is highly dependent on the structure of the data trie; for details on | |
857 | * that structure, see comments in normalizeCaseFoldData.h | |
858 | */ | |
859 | STATIC_UNLESS_TEST | |
860 | int32_t | |
861 | normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, | |
862 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], | |
863 | uint8_t combClass[kNFCSingleCharDecompMax]) | |
864 | { | |
865 | combClass[0] = 0; | |
866 | /* return hi-range PUA as self, except non-characters */ | |
867 | if (u32char >= kU32HiPUAStart) { | |
868 | if ((u32char & 0xFFFE) == 0xFFFE) { | |
869 | return -1; | |
870 | } | |
871 | u32NormFoldBuf[0] = u32char; | |
872 | return 1; | |
873 | } | |
874 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ | |
875 | /* 0 - 0x313FF; everything in between in currently invalid. */ | |
876 | int32_t u32charLookup = u32char; | |
877 | if (u32charLookup >= kU32LowRangeLimit) { | |
878 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); | |
879 | if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) { | |
880 | return -1; /* in the large range of currently-unassigned code points */ | |
881 | } | |
882 | } | |
883 | /* Now we have u32charLookup either in 0..0x313FF representing u32char itself, | |
884 | * or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in | |
885 | * the trie that identifies unassigneds in this range, or maps others to | |
886 | * decomps or combining class or just self. */ | |
887 | uint16_t trieValue; | |
888 | /* TrieHi */ | |
889 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; | |
890 | if (trieValue == kInvalidCodeFlag) { | |
891 | return -1; | |
892 | } | |
893 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ | |
894 | u32NormFoldBuf[0] = u32char; | |
895 | combClass[0] = trieValue & kFlagValueMask; | |
896 | return 1; | |
897 | } | |
898 | if (trieValue == kHangulMask) { | |
899 | combClass[1] = combClass[2] = 0; | |
900 | return decomposeHangul(u32char, u32NormFoldBuf); | |
901 | } | |
902 | /* TrieMid */ | |
903 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; | |
904 | if (trieValue == kInvalidCodeFlag) { | |
905 | return -1; | |
906 | } | |
907 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { | |
908 | u32NormFoldBuf[0] = u32char; | |
909 | combClass[0] = trieValue & kFlagValueMask; | |
910 | return adjustCase(case_sens, 1, u32NormFoldBuf); | |
911 | } | |
912 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { | |
913 | uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask]; | |
914 | uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask)); | |
915 | if (testBit & invalidMask) { | |
916 | /* invalid */ | |
917 | return -1; | |
918 | } else { | |
919 | /* treat like trieValue == 0 above */ | |
920 | u32NormFoldBuf[0] = u32char; | |
921 | return adjustCase(case_sens, 1, u32NormFoldBuf);; | |
922 | } | |
923 | } | |
924 | if (trieValue == kHangulMask) { | |
925 | combClass[1] = combClass[2] = 0; | |
926 | return decomposeHangul(u32char, u32NormFoldBuf); | |
927 | } | |
928 | /* TrieLo */ | |
929 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask]; | |
930 | if (trieValue == kInvalidCodeFlag) { | |
931 | return -1; | |
932 | } | |
933 | if (trieValue == kHangulMask) { | |
934 | combClass[1] = combClass[2] = 0; | |
935 | return decomposeHangul(u32char, u32NormFoldBuf); | |
936 | } | |
937 | if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) { | |
938 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { | |
939 | u32NormFoldBuf[0] = u32char; | |
940 | combClass[0] = trieValue & kFlagValueMask; | |
941 | } else { | |
942 | u32NormFoldBuf[0] = trieValue; | |
943 | } | |
944 | return adjustCase(case_sens, 1, u32NormFoldBuf);; | |
945 | } | |
946 | const uint16_t* u16SeqPtr = NULL; | |
947 | const int32_t* u32SeqPtr = NULL; | |
948 | int32_t uSeqLen = 0; | |
949 | switch (trieValue & kSpecialsMask) { | |
950 | case kToU16Seq2Mask: | |
951 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { | |
952 | /* don't use the mapping, it is only for case folding */ | |
953 | u32NormFoldBuf[0] = u32char; | |
954 | /* already have combClass[0] = 0 */ | |
955 | return 1; | |
956 | } | |
957 | u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask]; | |
958 | uSeqLen = 2; | |
959 | break; | |
960 | case kToU16Seq3Mask: | |
961 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { | |
962 | /* don't use the mapping, it is only for case folding */ | |
963 | u32NormFoldBuf[0] = u32char; | |
964 | /* already have combClass[0] = 0 */ | |
965 | return 1; | |
966 | } | |
967 | u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask]; | |
968 | uSeqLen = 3; | |
969 | break; | |
970 | case kToU16SeqMiscMask: | |
971 | u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask]; | |
972 | uSeqLen = *u16SeqPtr & kToSeqMiscLenMask; | |
973 | combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift); | |
974 | break; | |
975 | case kToU32CharMask: | |
976 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { | |
977 | /* don't use the mapping, it is only for case folding */ | |
978 | u32NormFoldBuf[0] = u32char; | |
979 | /* already have combClass[0] = 0 */ | |
980 | return 1; | |
981 | } | |
982 | u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask]; | |
983 | uSeqLen = 1; | |
984 | break; | |
985 | case kToU32SeqMiscMask: | |
986 | u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask]; | |
987 | uSeqLen = *u32SeqPtr & kToSeqMiscLenMask; | |
988 | combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift); | |
989 | break; | |
990 | default: | |
991 | return -1; | |
992 | } | |
993 | if (kNFCSingleCharDecompMax < uSeqLen) { | |
994 | return 0; | |
995 | } | |
996 | int32_t idx; | |
997 | for (idx = 0; idx < uSeqLen; idx++) { | |
998 | u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++; | |
999 | if (idx > 0) { | |
1000 | combClass[idx] = getCombClassU32Char(u32NormFoldBuf[idx]); | |
1001 | } | |
1002 | } | |
1003 | return adjustCase(case_sens, uSeqLen, u32NormFoldBuf); | |
1004 | } | |
1005 | ||
1006 | /* | |
1007 | * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding | |
1008 | * | |
1009 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only | |
1010 | * uSeqLen - length of the sequence specified in the u32NormFoldBuf | |
1011 | * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3) | |
1012 | * with normalized result. | |
1013 | * | |
1014 | * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax | |
1015 | * is insufficient (though it is assumed to be at least 3), else the length of the | |
1016 | * normalized and case-folded result (currently in the range 1..4). | |
1017 | * | |
1018 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. | |
1019 | */ | |
1020 | ||
1021 | static int32_t | |
1022 | adjustCase(bool case_sens, int32_t uSeqLen, | |
1023 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) | |
1024 | { | |
1025 | if (!case_sens && uSeqLen > 0) { | |
1026 | if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) { | |
1027 | u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]]; | |
1028 | /* There is one case in which this maps to a character with different combining | |
1029 | * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the | |
1030 | * first or only character in the sequence, we want to keep the original | |
1031 | * combining class, so nothing special to do here. | |
1032 | */ | |
1033 | } | |
1034 | /* The following is the only case where we have a casefolding after the first | |
1035 | * character in the sequence. Don't worry about combining class here. that gets | |
1036 | * set later for characters after the first. | |
1037 | */ | |
1038 | if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) { | |
1039 | u32NormFoldBuf[uSeqLen - 1] = 0x03B9; | |
1040 | } | |
1041 | } | |
1042 | return uSeqLen; | |
1043 | } | |
1044 | ||
1045 | /* | |
1046 | * getCombClassU32Char, map a single character (in UTF32 form) to its combining class. | |
1047 | * | |
1048 | * u32char - input UTF32 code point. This is assumed to be a valid character that does | |
1049 | * not have a decomposition. | |
1050 | * | |
1051 | * returns combining class of the character. | |
1052 | * | |
1053 | * This is only called for characters after the first is a decomposition expansion. In | |
1054 | * this situation, if we encounter U+03B9 (combining class 0), it is only there as the | |
1055 | * case-folding of U+0345 (combining class 240). In this case it is the combining class | |
1056 | * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all. | |
1057 | * | |
1058 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. | |
1059 | */ | |
1060 | static uint8_t | |
1061 | getCombClassU32Char(int32_t u32char) | |
1062 | { | |
1063 | if (u32char >= kU32HiPUAStart) { | |
1064 | return 0; | |
1065 | } | |
1066 | if (u32char == 0x03B9) { | |
1067 | return 240; | |
1068 | } | |
1069 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ | |
1070 | /* 0 - 0x313FF; everything in between in currently invalid. */ | |
1071 | int32_t u32charLookup = u32char; | |
1072 | if (u32charLookup >= kU32LowRangeLimit) { | |
1073 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); | |
1074 | } | |
1075 | /* Now we have u32charLookup either in 0..0x313FF representing u32char itself, | |
1076 | * or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in | |
1077 | * the trie that identifies unassigneds in this range, or maps others to | |
1078 | * decomps or combining class or just self. */ | |
1079 | uint16_t trieValue; | |
1080 | /* TrieHi */ | |
1081 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; | |
1082 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { | |
1083 | return trieValue & kFlagValueMask; | |
1084 | } | |
1085 | /* TrieMid */ | |
1086 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; | |
1087 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ | |
1088 | return trieValue & kFlagValueMask; | |
1089 | } | |
1090 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { | |
1091 | return 0; | |
1092 | } | |
1093 | /* TrieLo */ | |
1094 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask]; | |
1095 | return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0; | |
1096 | } | |
1097 | ||
1098 | /* | |
1099 | * decomposeHangul, map a single UTF32 code point for a composed Hangul | |
1100 | * in the range AC00-D7A3, using algorithmic decomp | |
1101 | * | |
1102 | * The normalized result will be 2 or 3 UTF32 characters. | |
1103 | * | |
1104 | * u32char - input UTF32 code point | |
1105 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) | |
1106 | * to receive the normalize result. | |
1107 | * | |
1108 | * returns the length of the normalized result (2..3). | |
1109 | * | |
1110 | * Adapted from ICU Hangul:decompose in normalizer2impl.h | |
1111 | * | |
1112 | */ | |
1113 | ||
1114 | enum { | |
1115 | HANGUL_BASE=0xAC00, | |
1116 | JAMO_L_BASE=0x1100, /* "lead" jamo */ | |
1117 | JAMO_V_BASE=0x1161, /* "vowel" jamo */ | |
1118 | JAMO_T_BASE=0x11A7, /* "trail" jamo */ | |
1119 | JAMO_L_COUNT=19, | |
1120 | JAMO_V_COUNT=21, | |
1121 | JAMO_T_COUNT=28, | |
1122 | }; | |
1123 | ||
1124 | static int32_t | |
1125 | decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) | |
1126 | { | |
1127 | u32char -= HANGUL_BASE; | |
1128 | int32_t tIndex = u32char % JAMO_T_COUNT; | |
1129 | u32char /= JAMO_T_COUNT; | |
1130 | u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT); | |
1131 | u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT); | |
1132 | if (tIndex == 0) { | |
1133 | return 2; | |
1134 | } | |
1135 | u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex); | |
1136 | return 3; | |
1137 | } |