2 * Copyright (C) 2016-2020 Apple, Inc. All rights reserved.
3 * Some portions covered by other copyrights, listed below.
5 * Copyright (C) 2016 and later: Unicode, Inc. and others.
6 * License & terms of use: http://www.unicode.org/copyright.html
8 * Copyright (C) 1999-2015, International Business Machines
9 * Corporation and others. All Rights Reserved.
11 * add APPLE_OSREFERENCE_LICENSE_HEADER stuff...
14 #include <libkern/libkern.h>
15 #include <sys/errno.h>
16 #include <sys/unicode.h>
17 #include "vfs_unicode_data.h"
18 #define STATIC_UNLESS_TEST static
21 /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */
22 kMaxUTF8BytesPerChar
= 4
25 /* local prototypes used by exported functions (and themselves exported for testing) */
27 int32_t utf8ToU32Code(int32_t u32char
, const char** srcPtr
, const char* srcLimit
);
29 int32_t normalizeOptCaseFoldU32Char(int32_t u32char
, bool case_sens
,
30 int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
],
31 uint8_t combClass
[kNFCSingleCharDecompMax
]);
32 /* local prototypes used by exported functions (not exported for separate testing) */
33 static int nextBaseAndAnyMarks(const char** strP
, const char *strLimit
, bool case_sens
,
34 int32_t* unorm
, uint8_t* unormcc
, int32_t* unormlenP
, int32_t* unormstartP
,
35 int32_t* buf
, uint8_t* bufcc
, int32_t* buflenP
,
36 bool* needReorderP
, bool* startP
);
37 void doReorder(int32_t* buf
, uint8_t* bufcc
, int32_t buflen
);
38 int32_t u32CharToUTF8Bytes(uint32_t u32char
, uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
]);
41 * utf8_normalizeOptCaseFoldGetUVersion
43 * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
44 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
45 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
46 * version[3] = Code revision level; for any given Unicode version, this value starts
47 * at 0 and is incremented for each significant revision to the
48 * normalizeOptCaseFold functions.
51 utf8_normalizeOptCaseFoldGetUVersion(unsigned char version
[4])
61 * utf8_normalizeOptCaseFoldAndHash
63 * str: The input UTF-8 string (need not be 0 terminated)
64 * str_len: The byte length of the input string (excluding any 0 terminator)
65 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
66 * True for case-sensitive behavior; generates standard NFD.
67 * hash_func: A pointer to a hashing function to compute the hash of the
68 * normalized/case-folded result. buf contains buf_len bytes
69 * of data to be added to the hash using the caller-supplied
71 * hash_ctx: The context for the hash function.
73 * Returns: 0 on success, or
74 * EILSEQ: The input string contains illegal ASCII-range characters
75 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
76 * contains codepoints that are non-characters or unassigned in
77 * the version of Unicode currently supported (Unicode 9.0).
81 utf8_normalizeOptCaseFoldAndHash(const char *str
,
84 void (*hash_func
)(void *buf
, size_t buf_len
, void *ctx
),
87 const char *strLimit
= str
+ str_len
;
89 /* Data for the next pending single-char norm from input;
90 * This will always begin with a base char (combining class 0)
91 * or the first character in the string, which may no be a base */
92 int32_t unorm
[kNFCSingleCharDecompMax
];
93 uint8_t unormcc
[kNFCSingleCharDecompMax
];
95 int32_t unormstart
= 0;
100 * Each input character may be normalized to a sequence of one or more characters,
101 * some of which may have non-zero combining class. Any sequence of characters
102 * with non-zero combining class resulting from one or more input characters needs
103 * to be accumulated in the main buffer so we can reorder as necessary before
104 * calling the hash function.
106 * At the beginning of the main loop: The normalization buffer and main buffer are
109 * Each time through the main loop we do the following:
110 * 1. If there are characters available in the normalization result buffer (from the
111 * result of normalizing a previous input character), copy the first character and
112 * any following characters that have non-zero combining class to the main buffer.
113 * 2. If there is nothing left in the normalization buffer, then loop processing
114 * input characters as follows:
115 * a) Get the next input character from UTF8, get its normalized and case-folded
116 * result in the normalization buffer.
117 * b) If the first character in the normalization buffer has combining class 0,
118 * break; we will handle this normalization buffer next time through the main
120 * c) Else copy the current normalization buffer (which has only combining marks)
121 * to the main buffer, and continue with the loop processing input characters.
122 * 3. At this point the first character in the main buffer may or may not have
123 * combining class 0, but any subsequent characters (up to the the limit for
124 * stream safe text) will be combining characters with nonzero combining class.
125 * Reorder the combining marks if necessary into canonical order.
126 * 4. Call the hash function for each character in the main buffer.
130 /* Data for the buffers being built up from input */
131 int32_t buf
[kNCFStreamSafeBufMax
];
132 uint8_t bufcc
[kNCFStreamSafeBufMax
];
134 bool needReorder
= false;
137 err
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
,
138 buf
, bufcc
, &buflen
, &needReorder
, &start
);
144 /* Now buffer should have all of the combining marks up to the next base char.
145 * Normally it will also start with the last base char encountered (unless the
146 * UTF8 string began with a combining mark). */
147 /* Now reorder combining marks if necessary. */
149 doReorder(buf
, bufcc
, buflen
);
151 /* Now write to hash func */
152 hash_func(buf
, buflen
* sizeof(buf
[0]), hash_ctx
);
154 /* OK so far, top of loop clears buffers to start refilling again */
155 } while (str
< strLimit
|| unormlen
> 0);
160 * utf8_normalizeOptCaseFoldAndCompare
162 * strA: A UTF-8 string to be compared (need not be 0 terminated)
163 * strA_len: The byte length of strA (excluding any 0 terminator)
164 * strB: The second UTF-8 string to be compared (need not be 0 terminated)
165 * strB_len: The byte length of strB (excluding any 0 terminator)
166 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
167 * True for case-sensitive behavior; compares standard NFD forms.
168 * are_equal: On success, set to true if the strings are equal, or set to false
171 * Returns: 0 on success, or
172 * EILSEQ: One or both of the input strings contains illegal ASCII-range
173 * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
174 * or contains codepoints that are non-characters or unassigned in
175 * the version of Unicode currently supported (Unicode 9.0).
176 * Note: The comparison may terminate early when a difference is
177 * detected, and may return 0 and set *are_equal=false even
178 * if one or both strings are invalid.
180 enum { kNFCSingleCharDecompMaxPlusPushback
= kNFCSingleCharDecompMax
+ 4 }; /* room for 03B9 pushback(s) */
183 utf8_normalizeOptCaseFoldAndCompare(const char *strA
,
190 const char *strALimit
= strA
+ strA_len
;
191 const char *strBLimit
= strB
+ strB_len
;
193 /* Data for the next pending single-char norms from each input;
194 * These will always begin with a base char (combining class 0)
195 * or the first character in the string, which may not be a base */
196 int32_t unormA
[kNFCSingleCharDecompMaxPlusPushback
], unormB
[kNFCSingleCharDecompMaxPlusPushback
];
197 uint8_t unormAcc
[kNFCSingleCharDecompMaxPlusPushback
], unormBcc
[kNFCSingleCharDecompMaxPlusPushback
];
198 int32_t unormAlen
= 0, unormBlen
= 0;
199 int32_t unormAstart
= 0, unormBstart
= 0;
201 bool startA
= true, startB
= true;
204 * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash,
205 * described above. The differences are:
206 * - We keep a normalization buffer and main buffer for each string.
207 * - In the main loop, we do steps 1-3 for each string.
208 * - In step 4, instead of calling the hash function, we compare the two main
209 * buffers; if they are unequal, we return a non-equal result.
210 * - After the end of the main loop, if we still have data for one string but
211 * not the other, return a non-equal result, else return an equal result.
214 /* Data for the buffers being built up from each input */
215 int32_t bufA
[kNCFStreamSafeBufMax
], bufB
[kNCFStreamSafeBufMax
];
216 uint8_t bufAcc
[kNCFStreamSafeBufMax
], bufBcc
[kNCFStreamSafeBufMax
];
217 int32_t bufAlen
= 0, bufBlen
= 0;
218 bool needReorderA
= false, needReorderB
= false;
221 err
= nextBaseAndAnyMarks(&strA
, strALimit
, case_sens
, unormA
, unormAcc
, &unormAlen
, &unormAstart
,
222 bufA
, bufAcc
, &bufAlen
, &needReorderA
, &startA
);
226 err
= nextBaseAndAnyMarks(&strB
, strBLimit
, case_sens
, unormB
, unormBcc
, &unormBlen
, &unormBstart
,
227 bufB
, bufBcc
, &bufBlen
, &needReorderB
, &startB
);
232 if (bufAlen
> 0 || bufBlen
> 0) {
233 /* Now each buffer should have all of the combining marks up to the next base char.
234 * Normally it will also start with the last base char encountered (unless the
235 * UTF8 string began with a combining mark). */
236 /* Now reorder combining marks if necessary. */
238 doReorder(bufA
, bufAcc
, bufAlen
);
241 doReorder(bufB
, bufBcc
, bufBlen
);
243 /* handle 03B9 pushback */
246 if (bufAlen
> 1 && bufA
[bufAlen
- 1] == 0x03B9 && unormAstart
== 0) {
247 int32_t tailCount
= 0;
248 while (tailCount
< kNFCSingleCharDecompMaxPlusPushback
- unormAlen
&& bufAlen
> 1 && bufA
[bufAlen
- 1] == 0x03B9) {
252 for (idx
= unormAlen
; idx
> 0; idx
--) {
253 unormA
[idx
- 1 + tailCount
] = unormA
[idx
- 1];
254 unormAcc
[idx
- 1 + tailCount
] = unormAcc
[idx
- 1];
256 for (idx
= 0; idx
< tailCount
; idx
++) {
257 unormA
[idx
] = 0x03B9;
260 unormAlen
+= tailCount
;
262 if (bufBlen
> 1 && bufB
[bufBlen
- 1] == 0x03B9 && unormBstart
== 0) {
263 int32_t tailCount
= 0;
264 while (tailCount
< kNFCSingleCharDecompMaxPlusPushback
- unormBlen
&& bufBlen
> 1 && bufB
[bufBlen
- 1] == 0x03B9) {
268 for (idx
= unormBlen
; idx
> 0; idx
--) {
269 unormB
[idx
- 1 + tailCount
] = unormB
[idx
- 1];
270 unormBcc
[idx
- 1 + tailCount
] = unormBcc
[idx
- 1];
272 for (idx
= 0; idx
< tailCount
; idx
++) {
273 unormB
[idx
] = 0x03B9;
276 unormBlen
+= tailCount
;
279 /* Now compare the buffers. */
280 if (bufAlen
!= bufBlen
|| memcmp(bufA
, bufB
, bufAlen
* sizeof(bufA
[0])) != 0) {
285 /* OK so far, top of loop clears buffers to start refilling again */
286 } while ((strA
< strALimit
|| unormAlen
> 0) && (strB
< strBLimit
|| unormBlen
> 0));
288 *are_equal
= (strA
== strALimit
&& unormAlen
== 0 && strB
== strBLimit
&& unormBlen
== 0);
293 * utf8_normalizeOptCaseFold
295 * str: The input UTF-8 string (need not be 0 terminated)
296 * str_len: The byte length of the input string (excluding any 0 terminator)
297 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
298 * True for case-sensitive behavior; generates standard NFD.
299 * ustr: A pointer to a buffer for the resulting UTF-32 string.
300 * ustr_size: The capacity of ustr, in UTF-32 units.
301 * ustr_len: Pointer to a value that will be filled in with the actual length
302 * in UTF-32 units of the string copied to ustr.
304 * Returns: 0 on success, or
305 * EILSEQ: The input string contains illegal ASCII-range characters
306 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
307 * contains codepoints that are non-characters or unassigned in
308 * the version of Unicode currently supported.
309 * ENOMEM: ustr_size is insufficient for the resulting string. In this
310 * case the value returned in *ustr_len is invalid.
313 utf8_normalizeOptCaseFold(const char *str
,
320 const char *strLimit
= str
+ str_len
;
321 int32_t *ustrCur
= ustr
;
322 const int32_t *ustrLimit
= ustr
+ ustr_size
;
324 /* Data for the next pending single-char norm from input;
325 * This will always begin with a base char (combining class 0) */
326 int32_t unorm
[kNFCSingleCharDecompMax
];
327 uint8_t unormcc
[kNFCSingleCharDecompMax
];
328 int32_t unormlen
= 0;
329 int32_t unormstart
= 0;
335 /* Data for the buffers being built up from input */
336 int32_t buf
[kNCFStreamSafeBufMax
];
337 uint8_t bufcc
[kNCFStreamSafeBufMax
];
339 bool needReorder
= false;
342 err
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
,
343 buf
, bufcc
, &buflen
, &needReorder
, &start
);
350 doReorder(buf
, bufcc
, buflen
);
352 /* Now copy to output buffer */
354 if (ustrCur
+ buflen
> ustrLimit
) {
357 for (idx
= 0; idx
< buflen
; idx
++) {
358 *ustrCur
++ = buf
[idx
];
361 /* OK so far, top of loop clears buffers to start refilling again */
362 } while (str
< strLimit
|| unormlen
> 0);
363 *ustr_len
= (uint32_t)(ustrCur
- ustr
); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop
368 * utf8_normalizeOptCaseFoldToUTF8
369 * (This is similar to normalizeOptCaseFold except that this has a different output
370 * buffer type, and adds conversion to UTF8 while copying to output buffer)
372 * str: The input UTF-8 string (need not be 0 terminated)
373 * str_len: The byte length of the input string (excluding any 0 terminator)
374 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
375 * True for case-sensitive behavior; generates standard NFD.
376 * ustr: A pointer to a buffer for the resulting UTF-8 string.
377 * ustr_size: The capacity of ustr, in bytes.
378 * ustr_len: Pointer to a value that will be filled in with the actual length
379 * in bytes of the string copied to ustr.
381 * Returns: 0 on success, or
382 * EILSEQ: The input string contains illegal ASCII-range characters
383 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
384 * contains codepoints that are non-characters or unassigned in
385 * the version of Unicode currently supported.
386 * ENOMEM: ustr_size is insufficient for the resulting string. In this
387 * case the value returned in *ustr_len is invalid.
390 utf8_normalizeOptCaseFoldToUTF8(const char *str
,
397 const char *strLimit
= str
+ str_len
;
398 char *ustrCur
= ustr
;
399 const char *ustrLimit
= ustr
+ ustr_size
;
401 /* Data for the next pending single-char norm from input;
402 * This will always begin with a base char (combining class 0) */
403 int32_t unorm
[kNFCSingleCharDecompMax
];
404 uint8_t unormcc
[kNFCSingleCharDecompMax
];
405 int32_t unormlen
= 0;
406 int32_t unormstart
= 0;
412 /* Data for the buffers being built up from input */
413 int32_t buf
[kNCFStreamSafeBufMax
];
414 uint8_t bufcc
[kNCFStreamSafeBufMax
];
416 bool needReorder
= false;
419 err
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
,
420 buf
, bufcc
, &buflen
, &needReorder
, &start
);
426 uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
];
427 int32_t *bufPtr
= buf
;
429 doReorder(buf
, bufcc
, buflen
);
431 /* Now copy to output buffer */
432 while (buflen
-- > 0) {
433 int32_t idx
, utf8Len
= u32CharToUTF8Bytes((uint32_t)*bufPtr
++, utf8Bytes
);
434 if (ustrCur
+ utf8Len
> ustrLimit
) {
437 for (idx
= 0; idx
< utf8Len
; idx
++) {
438 *ustrCur
++ = (char)utf8Bytes
[idx
];
442 /* OK so far, top of loop clears buffers to start refilling again */
443 } while (str
< strLimit
|| unormlen
> 0);
444 *ustr_len
= ustrCur
- ustr
;
449 * utf8_normalizeOptCaseFoldAndMatchSubstring
451 * strA: A UTF-8 string (need not be 0 terminated) in which to search for the
452 * substring specified by ustrB.
453 * strA_len: The byte length of strA (excluding any 0 terminator)
454 * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched
455 * for in the UTF-32 string resulting from converting strA to the normalized
456 * UTF-32 form specified by the case_sens parameter; ustrB must already be
458 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
459 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
460 * True for case-sensitive matching; compares standard NFD forms.
461 * buf: Pointer to caller-supplied working memory for storing the portion of
462 * strA which has been converted to normalized UTF-32.
463 * buf_size: The size of buf.
464 * has_match: On success, set to true if strA (when converter to UTF-32 and normalized
465 * per case_sens) contains ustrB, set to false otherwise.
467 * Returns: 0 on success, or
468 * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
469 * not well-formed stream-safe UTF-8, or contains codepoints that are
470 * non-characters or unassigned in the version of Unicode currently
472 * Note: The search may terminate early when a match is detected, and
473 * may return 0 and set *has_match=true even if strA is invalid.
474 * ENOMEM: buf_size is insufficient.
477 utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA
,
479 const int32_t *ustrB
,
487 * ustrA represents the current position in the UTF-32 normalized version of strA
488 * at which we want to test for a match; ustrANormEnd is the position beyond that
489 * which is just after the end of what has already been converted from strA to
490 * UTF-32 normalized form.
491 * Each time through the main loop:
492 * - The first task is to make sure we have enough of strA converted to UTF32
493 * normalized form to test for match with ustrB at the current match position.
494 * If we don't, then convert more of strA to UTF-32 normalized form until we
495 * have enough to compare with ustrB. To do this, run a loop which is like the
496 * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of
497 * calling the hash function, we copy the normalized buffer to ustrANormEnd,
498 * advancing the latter. We keep doing this until we have enough additional
499 * converted to match with ustrB.
500 * - Then we test for match of ustrB at the current ustrA position. If there is
501 * a match we return; otherwise, if there is more strA to convert we advance
502 * ustrA and repeat the main loop, otherwise we return without a match.
504 if (ustrB_len
== 0) { /* always matches */
508 *has_match
= false; /* initialize return value */
509 if (ustrB_len
> 2 * strA_len
) {
510 /* If ustrB is clearly too long to find in strA, don't bother normalizing strA.
511 * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit.
512 * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units.
513 * The maximum expansion from unnormalized UTF-8 byte length to normalized
514 * UTF-32 unit length is thus 2. */
518 const char *strALimit
= strA
+ strA_len
;
519 int32_t *ustrA
= (int32_t *)buf
;
520 const int32_t *ustrALimit
= ustrA
+ (buf_size
/ sizeof(int32_t));
521 int32_t *ustrANormEnd
= ustrA
; /* how far we have already normalized in ustrA */
523 /* Data for the next pending single-char norms from each input;
524 * These will always begin with a base char (combining class 0)
525 * or the first character in the string, which may not be a base */
526 int32_t unormA
[kNFCSingleCharDecompMax
];
527 uint8_t unormAcc
[kNFCSingleCharDecompMax
];
528 int32_t unormAlen
= 0;
529 int32_t unormAstart
= 0;
534 /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */
535 if (ustrANormEnd
- ustrA
< ustrB_len
) {
537 /* Data for the buffers being built up from each input */
538 int32_t bufA
[kNCFStreamSafeBufMax
];
539 uint8_t bufAcc
[kNCFStreamSafeBufMax
];
541 bool needReorderA
= false;
544 err
= nextBaseAndAnyMarks(&strA
, strALimit
, case_sens
, unormA
, unormAcc
, &unormAlen
, &unormAstart
,
545 bufA
, bufAcc
, &bufAlen
, &needReorderA
, &startA
);
551 /* Now each buffer should have all of the combining marks up to the next base char.
552 * Normally it will also start with the last base char encountered (unless the
553 * UTF8 string began with a combining mark). */
554 /* Now reorder combining marks if necessary. Should be rare, and sequences should
555 * usually be short when does occur => simple bubblesort should be sufficient. */
557 doReorder(bufA
, bufAcc
, bufAlen
);
559 /* Now copy to working buffer */
561 if (ustrANormEnd
+ bufAlen
> ustrALimit
) {
564 for (idx
= 0; idx
< bufAlen
; idx
++) {
565 *ustrANormEnd
++ = bufA
[idx
];
568 /* OK so far, top of loop clears buffers to start refilling again */
569 } while ((ustrANormEnd
- ustrA
< ustrB_len
) && (strA
< strALimit
|| unormAlen
> 0));
572 if (ustrANormEnd
- ustrA
< ustrB_len
) {
573 return 0; /* not enough of strA left for match */
575 /* check for match, return if so */
576 if (memcmp(ustrA
, ustrB
, ustrB_len
* sizeof(ustrB
[0])) == 0) {
580 ustrA
++; /* advance match position */
584 /* nextBaseAndAnyMarks:
585 * Guts of code to get next bufferful of base character (or first char in string)
586 * and all trailing combining marks.
587 * This is called each time through the main loop of functions above, and does the
589 * 1. If there are characters available in the normalization result buffer (from the
590 * result of normalizing a previous input character), copy the first character and
591 * any following characters that have non-zero combining class to the main buffer.
592 * 2. If there is nothing left in the normalization buffer, then loop processing
593 * input characters as follows:
594 * a) Get the next input character from UTF8, get its normalized and case-folded
595 * result in the normalization buffer.
596 * b) If the first character in the normalization buffer has combining class 0,
597 * break; we will handle this normalization buffer next time through the main
599 * c) Else copy the current normalization buffer (which has only combining marks)
600 * to the main buffer, and continue with the loop processing input characters.
604 nextBaseAndAnyMarks(const char** strP
, const char *strLimit
, bool case_sens
,
605 int32_t* unorm
, uint8_t* unormcc
, int32_t* unormlenP
, int32_t* unormstartP
,
606 int32_t* buf
, uint8_t* bufcc
, int32_t* buflenP
,
607 bool* needReorderP
, bool* startP
)
609 /* update buffers for str */
610 if (*unormlenP
> 0 && *unormstartP
< *unormlenP
) {
611 /* unorm begins with a base char; buflen should be 0 */
612 *needReorderP
= false;
613 for (*buflenP
= 0; true;) {
614 if (*buflenP
> 0 && unormcc
[*unormstartP
] > 0 && unormcc
[*unormstartP
] < bufcc
[(*buflenP
) - 1]) {
615 *needReorderP
= true;
617 buf
[*buflenP
] = unorm
[*unormstartP
];
618 bufcc
[(*buflenP
)++] = unormcc
[(*unormstartP
)++];
619 if (*unormstartP
>= *unormlenP
|| unormcc
[*unormstartP
] == 0) {
624 if (*unormstartP
>= *unormlenP
) {
625 *unormstartP
= *unormlenP
= 0;
626 while (*strP
< strLimit
) {
628 uint32_t bytevalue
= (uint8_t)*(*strP
)++;
629 /* '/' is not produced by NFD decomposition from another character so we can
630 * check for it before normalization */
631 if (bytevalue
== 0 || bytevalue
== 0x2F /*'/'*/) {
634 if (bytevalue
< 0x80) {
635 unorm
[0] = (!case_sens
&& bytevalue
>= 'A' && bytevalue
<= 'Z')? bytevalue
+= 0x20: bytevalue
;
641 int32_t u32char
= utf8ToU32Code(bytevalue
, strP
, strLimit
);
645 *unormlenP
= normalizeOptCaseFoldU32Char(u32char
, case_sens
, unorm
, unormcc
);
646 if (*unormlenP
<= 0) {
649 if (unormcc
[0] == 0 || *startP
) {
654 /* the latest char decomposes to just combining sequence, add to buffer being built */
655 if (*buflenP
+ *unormlenP
> kNCFStreamSafeBufMax
) {
658 for (idx
= 0; idx
< *unormlenP
; idx
++, (*buflenP
)++) {
659 if (*buflenP
> 0 && unormcc
[idx
] > 0 && unormcc
[idx
] < bufcc
[(*buflenP
) - 1]) {
660 *needReorderP
= true;
662 buf
[*buflenP
] = unorm
[idx
];
663 bufcc
[*buflenP
] = unormcc
[idx
];
671 /* local prototypes used only by internal functions */
672 static void swapBufCharCCWithPrevious(int32_t jdx
, int32_t buf
[], uint8_t bufcc
[]);
673 static int32_t adjustCase(bool case_sens
, int32_t uSeqLen
,
674 int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]);
675 static uint8_t getCombClassU32Char(int32_t u32char
);
676 static int32_t decomposeHangul(int32_t u32char
, int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]);
678 /* Reorder combining marks if necessary. Should be rare, and sequences should
679 * usually be short when does occur => simple bubblesort should be sufficient. */
681 doReorder(int32_t* buf
, uint8_t* bufcc
, int32_t buflen
)
684 for (idx
= 0; idx
< buflen
- 1; idx
++) {
685 for (jdx
= buflen
- 1; jdx
> idx
; jdx
--) {
686 if (bufcc
[jdx
] < bufcc
[jdx
- 1]) {
687 swapBufCharCCWithPrevious(jdx
, buf
, bufcc
);
692 /* swap function for bubblesort */
694 swapBufCharCCWithPrevious(int32_t jdx
, int32_t buf
[], uint8_t bufcc
[])
696 int32_t bufchar
= buf
[jdx
];
697 uint8_t bufccval
= bufcc
[jdx
];
698 buf
[jdx
] = buf
[jdx
- 1];
699 bufcc
[jdx
] = bufcc
[jdx
- 1];
700 buf
[jdx
- 1] = bufchar
;
701 bufcc
[jdx
- 1] = bufccval
;
705 * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes,
706 * and returns the number of UTF8 bytes.
708 * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h).
711 u32CharToUTF8Bytes(uint32_t u32char
, uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
])
714 if (u32char
<= 0x7F) {
715 utf8Bytes
[idx
++] = (uint8_t)u32char
;
717 if (u32char
<= 0x7FF) {
718 utf8Bytes
[idx
++] = (uint8_t)((u32char
>> 6) | 0xC0);
720 if (u32char
<= 0xFFFF) {
721 utf8Bytes
[idx
++] = (uint8_t)((u32char
>> 12) | 0xE0);
723 utf8Bytes
[idx
++] = (uint8_t)((u32char
>> 18) | 0xF0);
724 utf8Bytes
[idx
++] = (uint8_t)(((u32char
>> 12) & 0x3F) | 0x80);
726 utf8Bytes
[idx
++] = (uint8_t)(((u32char
>> 6) & 0x3F) | 0x80);
728 utf8Bytes
[idx
++] = (uint8_t)((u32char
& 0x3F) | 0x80);
733 /* two macros adapted from ICU's utf8.h */
734 #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \
735 ((uint8_t)(leadByte)<0XF0 ? \
736 ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \
737 (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0)
739 #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
741 /* array adapted from ICU's utf_impl.c */
742 static const int32_t utf8_minLegal
[4] = { 0, 0X80, 0x800, 0x10000 };
745 * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point
747 * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c);
748 * verified to produce the same results (adusted for the difference in API signature).
750 * assumes at entry that:
751 * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character
752 * has been read, and its value is in u32char
753 * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly
754 * points to the trail bytes for that UTF8 char.
755 * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer)
757 * For a valid and complete UTF8 character, the function returns its value and advances
758 * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1
759 * (and the value in *srcPtr is undefined).
760 * Note that while it does not map to surrogate values (generates an error for malformed
761 * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values
762 * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error.
764 * equivalences used in adapted ICU code:
768 * This has been validated against ICU behavior.
772 utf8ToU32Code(int32_t u32char
, const char** srcPtr
, const char* srcLimit
)
774 const char* src
= *srcPtr
;
776 if (0xE0 < u32char
&& u32char
<= 0xEC && src
+ 1 < srcLimit
&& (pt1
= (uint8_t)(src
[0] - 0x80)) <= 0x3F && (pt2
= (uint8_t)(src
[1] - 0x80)) <= 0x3F) {
777 /* handle U+1000..U+CFFF */
778 /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */
779 u32char
= (uint16_t)((u32char
<< 12) | (pt1
<< 6) | pt2
);
781 } else if (u32char
< 0xE0 && u32char
>= 0xC2 && src
< srcLimit
&& (pt1
= (uint8_t)(src
[0] - 0x80)) <= 0x3F) {
782 /* handle U+0080..U+07FF */
783 u32char
= ((u32char
& 0x1F) << 6) | pt1
;
786 /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */
787 uint8_t count
= U8_COUNT_TRAIL_BYTES_LOC(u32char
);
788 if (src
+ count
<= srcLimit
) {
791 U8_MASK_LEAD_BYTE_LOC(u32char
, count
);
793 /* branches 3, 2 fall through to the next one */
794 case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */
796 case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
799 trail
= *src
++ - 0X80;
800 u32char
= (u32char
<< 6) | trail
;
801 /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */
802 if (u32char
>= 0x110 || trail
> 0X3F) {
806 trail
= *src
++ - 0X80;
807 u32char
= (u32char
<< 6) | trail
;
809 * test for a surrogate D800..DFFF:
810 * before the last (u32char<<6), a surrogate is u32char=360..37F
812 if (((u32char
& 0xFFE0) == 0x360) || trail
> 0X3F) {
816 trail
= *src
++ - 0X80;
817 u32char
= (u32char
<< 6) | trail
;
821 /* correct sequence - all trail bytes have (b7..b6)==(10) */
822 if (u32char
>= utf8_minLegal
[count
]) {
826 /* no default branch to optimize switch() - all values are covered */
836 * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result
837 * and the combining classes for each resulting char, or indicate it is invalid.
839 * The normalized and case-folded result might be up to 4 UTF32 characters (current
840 * max, could change in the future).
842 * u32char - input UTF32 code point
843 * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
844 * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
845 * to receive the normalize result.
846 * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
847 * to receive the combining classes for the characters in u32NormFoldBuf. If
848 * the first entry has non-zero combining class, the remaining entries do too.
850 * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
851 * is insufficient (though it is assumed to be at least 3), else the length of the
852 * normalized and case-folded result (currently in the range 1..4).
854 * This has been validated against ICU behavior.
856 * This function is highly dependent on the structure of the data trie; for details on
857 * that structure, see comments in normalizeCaseFoldData.h
861 normalizeOptCaseFoldU32Char(int32_t u32char
, bool case_sens
,
862 int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
],
863 uint8_t combClass
[kNFCSingleCharDecompMax
])
866 /* return hi-range PUA as self, except non-characters */
867 if (u32char
>= kU32HiPUAStart
) {
868 if ((u32char
& 0xFFFE) == 0xFFFE) {
871 u32NormFoldBuf
[0] = u32char
;
874 /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
875 /* 0 - 0x313FF; everything in between in currently invalid. */
876 int32_t u32charLookup
= u32char
;
877 if (u32charLookup
>= kU32LowRangeLimit
) {
878 u32charLookup
-= (kU32HiRangeStart
- kU32LowRangeLimit
);
879 if (u32charLookup
< kU32LowRangeLimit
|| u32charLookup
>= (kU32LowRangeLimit
+ kU32HiRangeLen
)) {
880 return -1; /* in the large range of currently-unassigned code points */
883 /* Now we have u32charLookup either in 0..0x313FF representing u32char itself,
884 * or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in
885 * the trie that identifies unassigneds in this range, or maps others to
886 * decomps or combining class or just self. */
889 trieValue
= nfTrieHi
[u32charLookup
>> kNFTrieHiShift
];
890 if (trieValue
== kInvalidCodeFlag
) {
893 if (trieValue
== 0 || (trieValue
& kFlagTestMask
) == kCombClassFlag
) { /* return self; */
894 u32NormFoldBuf
[0] = u32char
;
895 combClass
[0] = trieValue
& kFlagValueMask
;
898 if (trieValue
== kHangulMask
) {
899 combClass
[1] = combClass
[2] = 0;
900 return decomposeHangul(u32char
, u32NormFoldBuf
);
903 trieValue
= nfTrieMid
[trieValue
& kNextIndexValueMask
][(u32charLookup
>> kNFTrieMidShift
) & kNFTrieMidMask
];
904 if (trieValue
== kInvalidCodeFlag
) {
907 if (trieValue
== 0 || (trieValue
& kFlagTestMask
) == kCombClassFlag
) {
908 u32NormFoldBuf
[0] = u32char
;
909 combClass
[0] = trieValue
& kFlagValueMask
;
910 return adjustCase(case_sens
, 1, u32NormFoldBuf
);
912 if ((trieValue
& kFlagTestMask
) == kInvMaskFlag
) {
913 uint16_t invalidMask
= nfU16InvMasks
[trieValue
& kFlagValueMask
];
914 uint16_t testBit
= (uint16_t)(1 << (u32charLookup
& kNFTrieLoMask
));
915 if (testBit
& invalidMask
) {
919 /* treat like trieValue == 0 above */
920 u32NormFoldBuf
[0] = u32char
;
921 return adjustCase(case_sens
, 1, u32NormFoldBuf
);;
924 if (trieValue
== kHangulMask
) {
925 combClass
[1] = combClass
[2] = 0;
926 return decomposeHangul(u32char
, u32NormFoldBuf
);
929 trieValue
= nfTrieLo
[trieValue
& kNextIndexValueMask
][u32charLookup
& kNFTrieLoMask
];
930 if (trieValue
== kInvalidCodeFlag
) {
933 if (trieValue
== kHangulMask
) {
934 combClass
[1] = combClass
[2] = 0;
935 return decomposeHangul(u32char
, u32NormFoldBuf
);
937 if (trieValue
< kToU16Seq2Mask
|| trieValue
> kSpecialsEnd
) {
938 if (trieValue
== 0 || (trieValue
& kFlagTestMask
) == kCombClassFlag
) {
939 u32NormFoldBuf
[0] = u32char
;
940 combClass
[0] = trieValue
& kFlagValueMask
;
942 u32NormFoldBuf
[0] = trieValue
;
944 return adjustCase(case_sens
, 1, u32NormFoldBuf
);;
946 const uint16_t* u16SeqPtr
= NULL
;
947 const int32_t* u32SeqPtr
= NULL
;
949 switch (trieValue
& kSpecialsMask
) {
951 if (case_sens
&& (trieValue
& kToSeqCaseFoldMask
)) {
952 /* don't use the mapping, it is only for case folding */
953 u32NormFoldBuf
[0] = u32char
;
954 /* already have combClass[0] = 0 */
957 u16SeqPtr
= nfU16Seq2
[trieValue
& kToSeqIndexMask
];
961 if (case_sens
&& (trieValue
& kToSeqCaseFoldMask
)) {
962 /* don't use the mapping, it is only for case folding */
963 u32NormFoldBuf
[0] = u32char
;
964 /* already have combClass[0] = 0 */
967 u16SeqPtr
= nfU16Seq3
[trieValue
& kToSeqIndexMask
];
970 case kToU16SeqMiscMask
:
971 u16SeqPtr
= &nfU16SeqMisc
[trieValue
& kToSeqMiscIndexMask
];
972 uSeqLen
= *u16SeqPtr
& kToSeqMiscLenMask
;
973 combClass
[0] = (uint8_t)(*u16SeqPtr
++ >> kToSeqMiscCCShift
);
976 if (case_sens
&& (trieValue
& kToSeqCaseFoldMask
)) {
977 /* don't use the mapping, it is only for case folding */
978 u32NormFoldBuf
[0] = u32char
;
979 /* already have combClass[0] = 0 */
982 u32SeqPtr
= &nfU32Char
[trieValue
& kToSeqIndexMask
];
985 case kToU32SeqMiscMask
:
986 u32SeqPtr
= &nfU32SeqMisc
[trieValue
& kToSeqMiscIndexMask
];
987 uSeqLen
= *u32SeqPtr
& kToSeqMiscLenMask
;
988 combClass
[0] = (uint8_t)(*u32SeqPtr
++ >> kToSeqMiscCCShift
);
993 if (kNFCSingleCharDecompMax
< uSeqLen
) {
997 for (idx
= 0; idx
< uSeqLen
; idx
++) {
998 u32NormFoldBuf
[idx
] = (u16SeqPtr
)? *u16SeqPtr
++: *u32SeqPtr
++;
1000 combClass
[idx
] = getCombClassU32Char(u32NormFoldBuf
[idx
]);
1003 return adjustCase(case_sens
, uSeqLen
, u32NormFoldBuf
);
1007 * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding
1009 * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
1010 * uSeqLen - length of the sequence specified in the u32NormFoldBuf
1011 * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1012 * with normalized result.
1014 * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
1015 * is insufficient (though it is assumed to be at least 3), else the length of the
1016 * normalized and case-folded result (currently in the range 1..4).
1018 * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1022 adjustCase(bool case_sens
, int32_t uSeqLen
,
1023 int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
])
1025 if (!case_sens
&& uSeqLen
> 0) {
1026 if (u32NormFoldBuf
[0] < kSimpleCaseFoldLimit
) {
1027 u32NormFoldBuf
[0] = nfBasicCF
[u32NormFoldBuf
[0]];
1028 /* There is one case in which this maps to a character with different combining
1029 * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the
1030 * first or only character in the sequence, we want to keep the original
1031 * combining class, so nothing special to do here.
1034 /* The following is the only case where we have a casefolding after the first
1035 * character in the sequence. Don't worry about combining class here. that gets
1036 * set later for characters after the first.
1038 if (uSeqLen
> 1 && u32NormFoldBuf
[uSeqLen
- 1] == 0x0345) {
1039 u32NormFoldBuf
[uSeqLen
- 1] = 0x03B9;
1046 * getCombClassU32Char, map a single character (in UTF32 form) to its combining class.
1048 * u32char - input UTF32 code point. This is assumed to be a valid character that does
1049 * not have a decomposition.
1051 * returns combining class of the character.
1053 * This is only called for characters after the first is a decomposition expansion. In
1054 * this situation, if we encounter U+03B9 (combining class 0), it is only there as the
1055 * case-folding of U+0345 (combining class 240). In this case it is the combining class
1056 * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all.
1058 * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1061 getCombClassU32Char(int32_t u32char
)
1063 if (u32char
>= kU32HiPUAStart
) {
1066 if (u32char
== 0x03B9) {
1069 /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
1070 /* 0 - 0x313FF; everything in between in currently invalid. */
1071 int32_t u32charLookup
= u32char
;
1072 if (u32charLookup
>= kU32LowRangeLimit
) {
1073 u32charLookup
-= (kU32HiRangeStart
- kU32LowRangeLimit
);
1075 /* Now we have u32charLookup either in 0..0x313FF representing u32char itself,
1076 * or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in
1077 * the trie that identifies unassigneds in this range, or maps others to
1078 * decomps or combining class or just self. */
1081 trieValue
= nfTrieHi
[u32charLookup
>> kNFTrieHiShift
];
1082 if (trieValue
== 0 || (trieValue
& kFlagTestMask
) == kCombClassFlag
) {
1083 return trieValue
& kFlagValueMask
;
1086 trieValue
= nfTrieMid
[trieValue
& kNextIndexValueMask
][(u32charLookup
>> kNFTrieMidShift
) & kNFTrieMidMask
];
1087 if (trieValue
== 0 || (trieValue
& kFlagTestMask
) == kCombClassFlag
) { /* return self; */
1088 return trieValue
& kFlagValueMask
;
1090 if ((trieValue
& kFlagTestMask
) == kInvMaskFlag
) {
1094 trieValue
= nfTrieLo
[trieValue
& kNextIndexValueMask
][u32charLookup
& kNFTrieMidMask
];
1095 return ((trieValue
& kFlagTestMask
) == kCombClassFlag
)? (trieValue
& kFlagValueMask
): 0;
1099 * decomposeHangul, map a single UTF32 code point for a composed Hangul
1100 * in the range AC00-D7A3, using algorithmic decomp
1102 * The normalized result will be 2 or 3 UTF32 characters.
1104 * u32char - input UTF32 code point
1105 * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1106 * to receive the normalize result.
1108 * returns the length of the normalized result (2..3).
1110 * Adapted from ICU Hangul:decompose in normalizer2impl.h
1116 JAMO_L_BASE
=0x1100, /* "lead" jamo */
1117 JAMO_V_BASE
=0x1161, /* "vowel" jamo */
1118 JAMO_T_BASE
=0x11A7, /* "trail" jamo */
1125 decomposeHangul(int32_t u32char
, int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
])
1127 u32char
-= HANGUL_BASE
;
1128 int32_t tIndex
= u32char
% JAMO_T_COUNT
;
1129 u32char
/= JAMO_T_COUNT
;
1130 u32NormFoldBuf
[0] = (uint16_t)(JAMO_L_BASE
+ u32char
/ JAMO_V_COUNT
);
1131 u32NormFoldBuf
[1] = (uint16_t)(JAMO_V_BASE
+ u32char
% JAMO_V_COUNT
);
1135 u32NormFoldBuf
[2] = (uint16_t)(JAMO_T_BASE
+ tIndex
);