2  * Copyright (C) 2016-2020 Apple, Inc. All rights reserved. 
   3  * Some portions covered by other copyrights, listed below. 
   5  * Copyright (C) 2016 and later: Unicode, Inc. and others. 
   6  * License & terms of use: http://www.unicode.org/copyright.html 
   8  * Copyright (C) 1999-2015, International Business Machines 
   9  * Corporation and others.  All Rights Reserved. 
  11  * add APPLE_OSREFERENCE_LICENSE_HEADER stuff... 
  14 #include <libkern/libkern.h> 
  15 #include <sys/errno.h> 
  16 #include <sys/unicode.h> 
  17 #include "vfs_unicode_data.h" 
  18 #define STATIC_UNLESS_TEST static 
  21         /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */ 
  22         kMaxUTF8BytesPerChar 
= 4 
  25 /* local prototypes used by exported functions (and themselves exported for testing) */ 
  27 int32_t utf8ToU32Code(int32_t u32char
, const char** srcPtr
, const char* srcLimit
); 
  29 int32_t normalizeOptCaseFoldU32Char(int32_t u32char
, bool case_sens
, 
  30     int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
], 
  31     uint8_t combClass
[kNFCSingleCharDecompMax
]); 
  32 /* local prototypes used by exported functions (not exported for separate testing) */ 
  33 static int nextBaseAndAnyMarks(const char** strP
, const char *strLimit
, bool case_sens
, 
  34     int32_t* unorm
, uint8_t* unormcc
, int32_t* unormlenP
, int32_t* unormstartP
, 
  35     int32_t* buf
, uint8_t* bufcc
, int32_t* buflenP
, 
  36     bool* needReorderP
, bool* startP
); 
  37 void doReorder(int32_t* buf
, uint8_t* bufcc
, int32_t buflen
); 
  38 int32_t u32CharToUTF8Bytes(uint32_t u32char
, uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
]); 
  41  * utf8_normalizeOptCaseFoldGetUVersion 
  43  * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 
  44  * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 
  45  * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 
  46  * version[3] = Code revision level; for any given Unicode version, this value starts 
  47  *              at 0 and is incremented for each significant revision to the 
  48  *              normalizeOptCaseFold functions. 
  51 utf8_normalizeOptCaseFoldGetUVersion(unsigned char version
[4]) 
  61  * utf8_normalizeOptCaseFoldAndHash 
  63  * str:       The input UTF-8 string (need not be 0 terminated) 
  64  * str_len:   The byte length of the input string (excluding any 0 terminator) 
  65  * case_sens: False for case-insensitive behavior; generates canonical caseless form. 
  66  *            True for case-sensitive behavior; generates standard NFD. 
  67  * hash_func: A pointer to a hashing function to compute the hash of the 
  68  *            normalized/case-folded result. buf contains buf_len bytes 
  69  *            of data to be added to the hash using the caller-supplied 
  71  * hash_ctx:  The context for the hash function. 
  73  * Returns: 0 on success, or 
  74  *          EILSEQ: The input string contains illegal ASCII-range characters 
  75  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or 
  76  *                  contains codepoints that are non-characters or unassigned in 
  77  *                  the version of Unicode currently supported (Unicode 9.0). 
  81 utf8_normalizeOptCaseFoldAndHash(const char *str
, 
  84     void      (*hash_func
)(void *buf
, size_t buf_len
, void *ctx
), 
  87         const char *strLimit 
= str 
+ str_len
; 
  89         /* Data for the next pending single-char norm from input; 
  90          *  This will always begin with a base char (combining class 0) 
  91          *  or the first character in the string, which may no be a base */ 
  92         int32_t unorm
[kNFCSingleCharDecompMax
]; 
  93         uint8_t unormcc
[kNFCSingleCharDecompMax
]; 
  95         int32_t unormstart 
= 0; 
 100          * Each input character may be normalized to a sequence of one or more characters, 
 101          * some of which may have non-zero combining class. Any sequence of characters 
 102          * with non-zero combining class resulting from one or more input characters needs 
 103          * to be accumulated in the main buffer so we can reorder as necessary before 
 104          * calling the hash function. 
 106          * At the beginning of the main loop: The normalization buffer and main buffer are 
 109          * Each time through the main loop we do the following: 
 110          * 1. If there are characters available in the normalization result buffer (from the 
 111          *    result of normalizing a previous input character), copy the first character and 
 112          *    any following characters that have non-zero combining class to the main buffer. 
 113          * 2. If there is nothing left in the normalization buffer, then loop processing 
 114          *    input characters as follows: 
 115          *   a) Get the next input character from UTF8, get its normalized and case-folded 
 116          *      result in the normalization buffer. 
 117          *   b) If the first character in the normalization buffer has combining class 0, 
 118          *      break; we will handle this normalization buffer next time through the main 
 120          *   c) Else copy the current normalization buffer (which has only combining marks) 
 121          *      to the main buffer, and continue with the loop processing input characters. 
 122          * 3. At this point the first character in the main buffer may or may not have 
 123          *    combining class 0, but any subsequent characters (up to the the limit for 
 124          *    stream safe text) will be combining characters with nonzero combining class. 
 125          *    Reorder the combining marks if necessary into canonical order. 
 126          * 4. Call the hash function for each character in the main buffer. 
 130                 /* Data for the buffers being built up from input */ 
 131                 int32_t buf
[kNCFStreamSafeBufMax
]; 
 132                 uint8_t bufcc
[kNCFStreamSafeBufMax
]; 
 134                 bool needReorder 
= false; 
 137                 err 
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
, 
 138                     buf
, bufcc
, &buflen
, &needReorder
, &start
); 
 144                         /* Now buffer should have all of the combining marks up to the next base char. 
 145                          * Normally it will also start with the last base char encountered (unless the 
 146                          * UTF8 string began with a combining mark). */ 
 147                         /* Now reorder combining marks if necessary. */ 
 149                                 doReorder(buf
, bufcc
, buflen
); 
 151                         /* Now write to hash func */ 
 152                         hash_func(buf
, buflen 
* sizeof(buf
[0]), hash_ctx
); 
 154                 /* OK so far, top of loop clears buffers to start refilling again */ 
 155         } while (str 
< strLimit 
|| unormlen 
> 0); 
 160  * utf8_normalizeOptCaseFoldAndCompare 
 162  * strA:      A UTF-8 string to be compared (need not be 0 terminated) 
 163  * strA_len:  The byte length of strA (excluding any 0 terminator) 
 164  * strB:      The second UTF-8 string to be compared (need not be 0 terminated) 
 165  * strB_len:  The byte length of strB (excluding any 0 terminator) 
 166  * case_sens: False for case-insensitive behavior; compares canonical caseless forms. 
 167  *            True for case-sensitive behavior; compares standard NFD forms. 
 168  * are_equal: On success, set to true if the strings are equal, or set to false 
 171  * Returns: 0 on success, or 
 172  *          EILSEQ: One or both of the input strings contains illegal ASCII-range 
 173  *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, 
 174  *                  or contains codepoints that are non-characters or unassigned in 
 175  *                  the version of Unicode currently supported (Unicode 9.0). 
 176  *                  Note: The comparison may terminate early when a difference is 
 177  *                        detected, and may return 0 and set *are_equal=false even 
 178  *                        if one or both strings are invalid. 
 180 enum { kNFCSingleCharDecompMaxPlusPushback 
= kNFCSingleCharDecompMax 
+ 4 }; /* room for 03B9 pushback(s) */ 
 183 utf8_normalizeOptCaseFoldAndCompare(const char *strA
, 
 190         const char *strALimit 
= strA 
+ strA_len
; 
 191         const char *strBLimit 
= strB 
+ strB_len
; 
 193         /* Data for the next pending single-char norms from each input; 
 194          *  These will always begin with a base char (combining class 0) 
 195          *  or the first character in the string, which may not be a base */ 
 196         int32_t unormA
[kNFCSingleCharDecompMaxPlusPushback
], unormB
[kNFCSingleCharDecompMaxPlusPushback
]; 
 197         uint8_t unormAcc
[kNFCSingleCharDecompMaxPlusPushback
], unormBcc
[kNFCSingleCharDecompMaxPlusPushback
]; 
 198         int32_t unormAlen 
= 0, unormBlen 
= 0; 
 199         int32_t unormAstart 
= 0, unormBstart 
= 0; 
 201         bool startA 
= true, startB 
= true; 
 204          * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash, 
 205          * described above. The differences are: 
 206          * - We keep a normalization buffer and main buffer for each string. 
 207          * - In the main loop, we do steps 1-3 for each string. 
 208          * - In step 4, instead of calling the hash function, we compare the two main 
 209          *   buffers; if they are unequal, we return a non-equal result. 
 210          * - After the end of the main loop, if we still have data for one string but 
 211          *   not the other, return a non-equal result, else return an equal result. 
 214                 /* Data for the buffers being built up from each input */ 
 215                 int32_t bufA
[kNCFStreamSafeBufMax
], bufB
[kNCFStreamSafeBufMax
]; 
 216                 uint8_t bufAcc
[kNCFStreamSafeBufMax
], bufBcc
[kNCFStreamSafeBufMax
]; 
 217                 int32_t bufAlen 
= 0, bufBlen 
= 0; 
 218                 bool needReorderA 
= false, needReorderB 
= false; 
 221                 err 
= nextBaseAndAnyMarks(&strA
, strALimit
, case_sens
, unormA
, unormAcc
, &unormAlen
, &unormAstart
, 
 222                     bufA
, bufAcc
, &bufAlen
, &needReorderA
, &startA
); 
 226                 err 
= nextBaseAndAnyMarks(&strB
, strBLimit
, case_sens
, unormB
, unormBcc
, &unormBlen
, &unormBstart
, 
 227                     bufB
, bufBcc
, &bufBlen
, &needReorderB
, &startB
); 
 232                 if (bufAlen 
> 0 || bufBlen 
> 0) { 
 233                         /* Now each buffer should have all of the combining marks up to the next base char. 
 234                          * Normally it will also start with the last base char encountered (unless the 
 235                          * UTF8 string began with a combining mark). */ 
 236                         /* Now reorder combining marks if necessary. */ 
 238                                 doReorder(bufA
, bufAcc
, bufAlen
); 
 241                                 doReorder(bufB
, bufBcc
, bufBlen
); 
 243                         /* handle 03B9 pushback */ 
 246                                 if (bufAlen 
> 1 && bufA
[bufAlen 
- 1] == 0x03B9 && unormAstart 
== 0) { 
 247                                         int32_t tailCount 
= 0; 
 248                                         while (tailCount 
< kNFCSingleCharDecompMaxPlusPushback 
- unormAlen 
&& bufAlen 
> 1 && bufA
[bufAlen 
- 1] == 0x03B9) { 
 252                                         for (idx 
= unormAlen
; idx 
> 0; idx
--) { 
 253                                                 unormA
[idx 
- 1 + tailCount
] = unormA
[idx 
- 1]; 
 254                                                 unormAcc
[idx 
- 1 + tailCount
] = unormAcc
[idx 
- 1]; 
 256                                         for (idx 
= 0; idx 
< tailCount
; idx
++) { 
 257                                                 unormA
[idx
] = 0x03B9; 
 260                                         unormAlen 
+= tailCount
; 
 262                                 if (bufBlen 
> 1 && bufB
[bufBlen 
- 1] == 0x03B9 && unormBstart 
== 0) { 
 263                                         int32_t tailCount 
= 0; 
 264                                         while (tailCount 
< kNFCSingleCharDecompMaxPlusPushback 
- unormBlen 
&& bufBlen 
> 1 && bufB
[bufBlen 
- 1] == 0x03B9) { 
 268                                         for (idx 
= unormBlen
; idx 
> 0; idx
--) { 
 269                                                 unormB
[idx 
- 1 + tailCount
] = unormB
[idx 
- 1]; 
 270                                                 unormBcc
[idx 
- 1 + tailCount
] = unormBcc
[idx 
- 1]; 
 272                                         for (idx 
= 0; idx 
< tailCount
; idx
++) { 
 273                                                 unormB
[idx
] = 0x03B9; 
 276                                         unormBlen 
+= tailCount
; 
 279                         /* Now compare the buffers. */ 
 280                         if (bufAlen 
!= bufBlen 
|| memcmp(bufA
, bufB
, bufAlen 
* sizeof(bufA
[0])) != 0) { 
 285                 /* OK so far, top of loop clears buffers to start refilling again */ 
 286         } while ((strA 
< strALimit 
|| unormAlen 
> 0) && (strB 
< strBLimit 
|| unormBlen 
> 0)); 
 288         *are_equal 
= (strA 
== strALimit 
&& unormAlen 
== 0 && strB 
== strBLimit 
&& unormBlen 
== 0); 
 293  * utf8_normalizeOptCaseFold 
 295  * str:       The input UTF-8 string (need not be 0 terminated) 
 296  * str_len:   The byte length of the input string (excluding any 0 terminator) 
 297  * case_sens: False for case-insensitive behavior; generates canonical caseless form. 
 298  *            True for case-sensitive behavior; generates standard NFD. 
 299  * ustr:      A pointer to a buffer for the resulting UTF-32 string. 
 300  * ustr_size: The capacity of ustr, in UTF-32 units. 
 301  * ustr_len:  Pointer to a value that will be filled in with the actual length 
 302  *            in UTF-32 units of the string copied to ustr. 
 304  * Returns: 0 on success, or 
 305  *          EILSEQ: The input string contains illegal ASCII-range characters 
 306  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or 
 307  *                  contains codepoints that are non-characters or unassigned in 
 308  *                  the version of Unicode currently supported. 
 309  *          ENOMEM: ustr_size is insufficient for the resulting string. In this 
 310  *                  case the value returned in *ustr_len is invalid. 
 313 utf8_normalizeOptCaseFold(const char *str
, 
 320         const char *strLimit 
= str 
+ str_len
; 
 321         int32_t *ustrCur 
= ustr
; 
 322         const int32_t *ustrLimit 
= ustr 
+ ustr_size
; 
 324         /* Data for the next pending single-char norm from input; 
 325          *  This will always begin with a base char (combining class 0) */ 
 326         int32_t unorm
[kNFCSingleCharDecompMax
]; 
 327         uint8_t unormcc
[kNFCSingleCharDecompMax
]; 
 328         int32_t unormlen 
= 0; 
 329         int32_t unormstart 
= 0; 
 335                 /* Data for the buffers being built up from input */ 
 336                 int32_t buf
[kNCFStreamSafeBufMax
]; 
 337                 uint8_t bufcc
[kNCFStreamSafeBufMax
]; 
 339                 bool needReorder 
= false; 
 342                 err 
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
, 
 343                     buf
, bufcc
, &buflen
, &needReorder
, &start
); 
 350                                 doReorder(buf
, bufcc
, buflen
); 
 352                         /* Now copy to output buffer */ 
 354                         if (ustrCur 
+ buflen 
> ustrLimit
) { 
 357                         for (idx 
= 0; idx 
< buflen
; idx
++) { 
 358                                 *ustrCur
++ = buf
[idx
]; 
 361                 /* OK so far, top of loop clears buffers to start refilling again */ 
 362         } while (str 
< strLimit 
|| unormlen 
> 0); 
 363         *ustr_len 
= (uint32_t)(ustrCur 
- ustr
); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop 
 368  * utf8_normalizeOptCaseFoldToUTF8 
 369  * (This is similar to normalizeOptCaseFold except that this has a different output 
 370  * buffer type, and adds conversion to UTF8 while copying to output buffer) 
 372  * str:       The input UTF-8 string (need not be 0 terminated) 
 373  * str_len:   The byte length of the input string (excluding any 0 terminator) 
 374  * case_sens: False for case-insensitive behavior; generates canonical caseless form. 
 375  *            True for case-sensitive behavior; generates standard NFD. 
 376  * ustr:      A pointer to a buffer for the resulting UTF-8 string. 
 377  * ustr_size: The capacity of ustr, in bytes. 
 378  * ustr_len:  Pointer to a value that will be filled in with the actual length 
 379  *            in bytes of the string copied to ustr. 
 381  * Returns: 0 on success, or 
 382  *          EILSEQ: The input string contains illegal ASCII-range characters 
 383  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or 
 384  *                  contains codepoints that are non-characters or unassigned in 
 385  *                  the version of Unicode currently supported. 
 386  *          ENOMEM: ustr_size is insufficient for the resulting string. In this 
 387  *                  case the value returned in *ustr_len is invalid. 
 390 utf8_normalizeOptCaseFoldToUTF8(const char *str
, 
 397         const char *strLimit 
= str 
+ str_len
; 
 398         char *ustrCur 
= ustr
; 
 399         const char *ustrLimit 
= ustr 
+ ustr_size
; 
 401         /* Data for the next pending single-char norm from input; 
 402          *  This will always begin with a base char (combining class 0) */ 
 403         int32_t unorm
[kNFCSingleCharDecompMax
]; 
 404         uint8_t unormcc
[kNFCSingleCharDecompMax
]; 
 405         int32_t unormlen 
= 0; 
 406         int32_t unormstart 
= 0; 
 412                 /* Data for the buffers being built up from input */ 
 413                 int32_t buf
[kNCFStreamSafeBufMax
]; 
 414                 uint8_t bufcc
[kNCFStreamSafeBufMax
]; 
 416                 bool needReorder 
= false; 
 419                 err 
= nextBaseAndAnyMarks(&str
, strLimit
, case_sens
, unorm
, unormcc
, &unormlen
, &unormstart
, 
 420                     buf
, bufcc
, &buflen
, &needReorder
, &start
); 
 426                         uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
]; 
 427                         int32_t *bufPtr 
= buf
; 
 429                                 doReorder(buf
, bufcc
, buflen
); 
 431                         /* Now copy to output buffer */ 
 432                         while (buflen
-- > 0) { 
 433                                 int32_t idx
, utf8Len 
= u32CharToUTF8Bytes((uint32_t)*bufPtr
++, utf8Bytes
); 
 434                                 if (ustrCur 
+ utf8Len 
> ustrLimit
) { 
 437                                 for (idx 
= 0; idx 
< utf8Len
; idx
++) { 
 438                                         *ustrCur
++ = (char)utf8Bytes
[idx
]; 
 442                 /* OK so far, top of loop clears buffers to start refilling again */ 
 443         } while (str 
< strLimit 
|| unormlen 
> 0); 
 444         *ustr_len 
= ustrCur 
- ustr
; 
 449  * utf8_normalizeOptCaseFoldAndMatchSubstring 
 451  * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the 
 452  *            substring specified by ustrB. 
 453  * strA_len:  The byte length of strA (excluding any 0 terminator) 
 454  * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched 
 455  *            for in the UTF-32 string resulting from converting strA to the normalized 
 456  *            UTF-32 form specified by the case_sens parameter; ustrB must already be 
 458  * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). 
 459  * case_sens: False for case-insensitive matching; compares canonical caseless forms. 
 460  *            True for case-sensitive matching; compares standard NFD forms. 
 461  * buf:       Pointer to caller-supplied working memory for storing the portion of 
 462  *            strA which has been converted to normalized UTF-32. 
 463  * buf_size:  The size of buf. 
 464  * has_match: On success, set to true if strA (when converter to UTF-32 and normalized 
 465  *            per case_sens) contains ustrB, set to false otherwise. 
 467  * Returns: 0 on success, or 
 468  *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is 
 469  *                  not well-formed stream-safe UTF-8, or contains codepoints that are 
 470  *                  non-characters or unassigned in the version of Unicode currently 
 472  *                  Note: The search may terminate early when a match is detected, and 
 473  *                        may return 0 and set *has_match=true even if strA is invalid. 
 474  *          ENOMEM: buf_size is insufficient. 
 477 utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA
, 
 479     const int32_t *ustrB
, 
 487          * ustrA represents the current position in the UTF-32 normalized version of strA 
 488          * at which we want to test for a match; ustrANormEnd is the position beyond that 
 489          * which is just after the end of what has already been converted from strA to 
 490          * UTF-32 normalized form. 
 491          * Each time through the main loop: 
 492          * - The first task is to make sure we have enough of strA converted to UTF32 
 493          *   normalized form to test for match with ustrB at the current match position. 
 494          *   If we don't, then convert more of strA to UTF-32 normalized form until we 
 495          *   have enough to compare with ustrB. To do this, run a loop which is like the 
 496          *   main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of 
 497          *   calling the hash function, we copy the normalized buffer to ustrANormEnd, 
 498          *   advancing the latter. We keep doing this until we have enough additional 
 499          *   converted to match with ustrB. 
 500          * - Then we test for match of ustrB at the current ustrA position. If there is 
 501          *   a match we return; otherwise, if there is more strA to convert we advance 
 502          *   ustrA  and repeat the main loop, otherwise we return without a match. 
 504         if (ustrB_len 
== 0) { /* always matches */ 
 508         *has_match 
= false; /* initialize return value */ 
 509         if (ustrB_len 
> 2 * strA_len
) { 
 510                 /* If ustrB is clearly too long to find in strA, don't bother normalizing strA. 
 511                  * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit. 
 512                  * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units. 
 513                  * The maximum expansion from unnormalized UTF-8 byte length to normalized 
 514                  *  UTF-32 unit length is thus 2. */ 
 518         const char *strALimit 
= strA 
+ strA_len
; 
 519         int32_t *ustrA 
= (int32_t *)buf
; 
 520         const int32_t *ustrALimit 
= ustrA 
+ (buf_size 
/ sizeof(int32_t)); 
 521         int32_t *ustrANormEnd 
= ustrA
; /* how far we have already normalized in ustrA */ 
 523         /* Data for the next pending single-char norms from each input; 
 524          *  These will always begin with a base char (combining class 0) 
 525          *  or the first character in the string, which may not be a base */ 
 526         int32_t unormA
[kNFCSingleCharDecompMax
]; 
 527         uint8_t unormAcc
[kNFCSingleCharDecompMax
]; 
 528         int32_t unormAlen 
= 0; 
 529         int32_t unormAstart 
= 0; 
 534                 /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */ 
 535                 if (ustrANormEnd 
- ustrA 
< ustrB_len
) { 
 537                                 /* Data for the buffers being built up from each input */ 
 538                                 int32_t bufA
[kNCFStreamSafeBufMax
]; 
 539                                 uint8_t bufAcc
[kNCFStreamSafeBufMax
]; 
 541                                 bool needReorderA 
= false; 
 544                                 err 
= nextBaseAndAnyMarks(&strA
, strALimit
, case_sens
, unormA
, unormAcc
, &unormAlen
, &unormAstart
, 
 545                                     bufA
, bufAcc
, &bufAlen
, &needReorderA
, &startA
); 
 551                                         /* Now each buffer should have all of the combining marks up to the next base char. 
 552                                          * Normally it will also start with the last base char encountered (unless the 
 553                                          * UTF8 string began with a combining mark). */ 
 554                                         /* Now reorder combining marks if necessary. Should be rare, and sequences should 
 555                                          * usually be short when does occur => simple bubblesort should be sufficient. */ 
 557                                                 doReorder(bufA
, bufAcc
, bufAlen
); 
 559                                         /* Now copy to working buffer */ 
 561                                         if (ustrANormEnd 
+ bufAlen 
> ustrALimit
) { 
 564                                         for (idx 
= 0; idx 
< bufAlen
; idx
++) { 
 565                                                 *ustrANormEnd
++ = bufA
[idx
]; 
 568                                 /* OK so far, top of loop clears buffers to start refilling again */ 
 569                         } while ((ustrANormEnd 
- ustrA 
< ustrB_len
) && (strA 
< strALimit 
|| unormAlen 
> 0)); 
 572                 if (ustrANormEnd 
- ustrA 
< ustrB_len
) { 
 573                         return 0; /* not enough of strA left for match */ 
 575                 /* check for match, return if so */ 
 576                 if (memcmp(ustrA
, ustrB
, ustrB_len 
* sizeof(ustrB
[0])) == 0) { 
 580                 ustrA
++; /* advance match position */ 
 584 /* nextBaseAndAnyMarks: 
 585  * Guts of code to get next bufferful of base character (or first char in string) 
 586  * and all trailing combining marks. 
 587  * This is called each time through the main loop of functions above, and does the 
 589  * 1. If there are characters available in the normalization result buffer (from the 
 590  *    result of normalizing a previous input character), copy the first character and 
 591  *    any following characters that have non-zero combining class to the main buffer. 
 592  * 2. If there is nothing left in the normalization buffer, then loop processing 
 593  *    input characters as follows: 
 594  *   a) Get the next input character from UTF8, get its normalized and case-folded 
 595  *      result in the normalization buffer. 
 596  *   b) If the first character in the normalization buffer has combining class 0, 
 597  *      break; we will handle this normalization buffer next time through the main 
 599  *   c) Else copy the current normalization buffer (which has only combining marks) 
 600  *      to the main buffer, and continue with the loop processing input characters. 
 604 nextBaseAndAnyMarks(const char** strP
, const char *strLimit
, bool case_sens
, 
 605     int32_t* unorm
, uint8_t* unormcc
, int32_t* unormlenP
, int32_t* unormstartP
, 
 606     int32_t* buf
, uint8_t* bufcc
, int32_t* buflenP
, 
 607     bool* needReorderP
, bool* startP
) 
 609         /* update buffers for str */ 
 610         if (*unormlenP 
> 0 && *unormstartP 
< *unormlenP
) { 
 611                 /* unorm begins with a base char; buflen should be 0 */ 
 612                 *needReorderP 
= false; 
 613                 for (*buflenP 
= 0; true;) { 
 614                         if (*buflenP 
> 0 && unormcc
[*unormstartP
] > 0 && unormcc
[*unormstartP
] < bufcc
[(*buflenP
) - 1]) { 
 615                                 *needReorderP 
= true; 
 617                         buf
[*buflenP
] = unorm
[*unormstartP
]; 
 618                         bufcc
[(*buflenP
)++] = unormcc
[(*unormstartP
)++]; 
 619                         if (*unormstartP 
>= *unormlenP 
|| unormcc
[*unormstartP
] == 0) { 
 624         if (*unormstartP 
>= *unormlenP
) { 
 625                 *unormstartP 
= *unormlenP 
= 0; 
 626                 while (*strP 
< strLimit
) { 
 628                         uint32_t bytevalue 
= (uint8_t)*(*strP
)++; 
 629                         /* '/' is not produced by NFD decomposition from another character so we can 
 630                          * check for it before normalization */ 
 631                         if (bytevalue 
== 0 || bytevalue 
== 0x2F /*'/'*/) { 
 634                         if (bytevalue 
< 0x80) { 
 635                                 unorm
[0] = (!case_sens 
&& bytevalue 
>= 'A' && bytevalue 
<= 'Z')? bytevalue 
+= 0x20: bytevalue
; 
 641                                 int32_t u32char 
= utf8ToU32Code(bytevalue
, strP
, strLimit
); 
 645                                 *unormlenP 
= normalizeOptCaseFoldU32Char(u32char
, case_sens
, unorm
, unormcc
); 
 646                                 if (*unormlenP 
<= 0) { 
 649                                 if (unormcc
[0] == 0 || *startP
) { 
 654                         /* the latest char decomposes to just combining sequence, add to buffer being built */ 
 655                         if (*buflenP 
+ *unormlenP 
> kNCFStreamSafeBufMax
) { 
 658                         for (idx 
= 0; idx 
< *unormlenP
; idx
++, (*buflenP
)++) { 
 659                                 if (*buflenP 
> 0 && unormcc
[idx
] > 0 && unormcc
[idx
] < bufcc
[(*buflenP
) - 1]) { 
 660                                         *needReorderP 
= true; 
 662                                 buf
[*buflenP
] = unorm
[idx
]; 
 663                                 bufcc
[*buflenP
] = unormcc
[idx
]; 
 671 /*  local prototypes used only by internal functions */ 
 672 static void swapBufCharCCWithPrevious(int32_t jdx
, int32_t buf
[], uint8_t bufcc
[]); 
 673 static int32_t adjustCase(bool case_sens
, int32_t uSeqLen
, 
 674     int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]); 
 675 static uint8_t getCombClassU32Char(int32_t u32char
); 
 676 static int32_t decomposeHangul(int32_t u32char
, int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]); 
 678 /* Reorder combining marks if necessary. Should be rare, and sequences should 
 679  * usually be short when does occur => simple bubblesort should be sufficient. */ 
 681 doReorder(int32_t* buf
, uint8_t* bufcc
, int32_t buflen
) 
 684         for (idx 
= 0; idx 
< buflen 
- 1; idx
++) { 
 685                 for (jdx 
= buflen 
- 1; jdx 
> idx
; jdx
--) { 
 686                         if (bufcc
[jdx
] < bufcc
[jdx 
- 1]) { 
 687                                 swapBufCharCCWithPrevious(jdx
, buf
, bufcc
); 
 692 /*  swap function for bubblesort */ 
 694 swapBufCharCCWithPrevious(int32_t jdx
, int32_t buf
[], uint8_t bufcc
[]) 
 696         int32_t bufchar 
= buf
[jdx
]; 
 697         uint8_t bufccval 
= bufcc
[jdx
]; 
 698         buf
[jdx
] = buf
[jdx 
- 1]; 
 699         bufcc
[jdx
] = bufcc
[jdx 
- 1]; 
 700         buf
[jdx 
- 1] = bufchar
; 
 701         bufcc
[jdx 
- 1] = bufccval
; 
 705  * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes, 
 706  * and returns the number of UTF8 bytes. 
 708  * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h). 
 711 u32CharToUTF8Bytes(uint32_t u32char
, uint8_t utf8Bytes
[kMaxUTF8BytesPerChar
]) 
 714         if (u32char 
<= 0x7F) { 
 715                 utf8Bytes
[idx
++] = (uint8_t)u32char
; 
 717                 if (u32char 
<= 0x7FF) { 
 718                         utf8Bytes
[idx
++] = (uint8_t)((u32char 
>> 6) | 0xC0); 
 720                         if (u32char 
<= 0xFFFF) { 
 721                                 utf8Bytes
[idx
++] = (uint8_t)((u32char 
>> 12) | 0xE0); 
 723                                 utf8Bytes
[idx
++] = (uint8_t)((u32char 
>> 18) | 0xF0); 
 724                                 utf8Bytes
[idx
++] = (uint8_t)(((u32char 
>> 12) & 0x3F) | 0x80); 
 726                         utf8Bytes
[idx
++] = (uint8_t)(((u32char 
>> 6) & 0x3F) | 0x80); 
 728                 utf8Bytes
[idx
++] = (uint8_t)((u32char 
& 0x3F) | 0x80); 
 733 /* two macros adapted from ICU's utf8.h */ 
 734 #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \ 
 735 ((uint8_t)(leadByte)<0XF0 ? \ 
 736 ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \ 
 737 (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0) 
 739 #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 
 741 /* array adapted from ICU's utf_impl.c */ 
 742 static const int32_t utf8_minLegal
[4] = { 0, 0X80, 0x800, 0x10000 }; 
 745  * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point 
 747  * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c); 
 748  * verified to produce the same results (adusted for the difference in API signature). 
 750  * assumes at entry that: 
 751  * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character 
 752  *    has been read, and its value is in u32char 
 753  * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly 
 754  *    points to the trail bytes for that UTF8 char. 
 755  * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer) 
 757  * For a valid and complete UTF8 character, the function returns its value and advances 
 758  * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1 
 759  * (and the value in *srcPtr is undefined). 
 760  * Note that while it does not map to surrogate values (generates an error for malformed 
 761  * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values 
 762  * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error. 
 764  * equivalences used in adapted ICU code: 
 768  * This has been validated against ICU behavior. 
 772 utf8ToU32Code(int32_t u32char
, const char** srcPtr
, const char* srcLimit
) 
 774         const char* src 
= *srcPtr
; 
 776         if (0xE0 < u32char 
&& u32char 
<= 0xEC && src 
+ 1 < srcLimit 
&& (pt1 
= (uint8_t)(src
[0] - 0x80)) <= 0x3F && (pt2 
= (uint8_t)(src
[1] - 0x80)) <= 0x3F) { 
 777                 /* handle U+1000..U+CFFF */ 
 778                 /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */ 
 779                 u32char 
= (uint16_t)((u32char 
<< 12) | (pt1 
<< 6) | pt2
); 
 781         } else if (u32char 
< 0xE0 && u32char 
>= 0xC2 && src 
< srcLimit 
&& (pt1 
= (uint8_t)(src
[0] - 0x80)) <= 0x3F) { 
 782                 /* handle U+0080..U+07FF */ 
 783                 u32char 
= ((u32char 
& 0x1F) << 6) | pt1
; 
 786                 /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */ 
 787                 uint8_t count 
= U8_COUNT_TRAIL_BYTES_LOC(u32char
); 
 788                 if (src 
+ count 
<= srcLimit
) { 
 791                         U8_MASK_LEAD_BYTE_LOC(u32char
, count
); 
 793                         /* branches 3, 2 fall through to the next one */ 
 794                         case 0:         /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */ 
 796                         case 4:          /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 
 799                                 trail 
= *src
++ - 0X80; 
 800                                 u32char 
= (u32char 
<< 6) | trail
; 
 801                                 /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */ 
 802                                 if (u32char 
>= 0x110 || trail 
> 0X3F) { 
 806                                 trail 
= *src
++ - 0X80; 
 807                                 u32char 
= (u32char 
<< 6) | trail
; 
 809                                  * test for a surrogate D800..DFFF: 
 810                                  * before the last (u32char<<6), a surrogate is u32char=360..37F 
 812                                 if (((u32char 
& 0xFFE0) == 0x360) || trail 
> 0X3F) { 
 816                                 trail 
= *src
++ - 0X80; 
 817                                 u32char 
= (u32char 
<< 6) | trail
; 
 821                                 /* correct sequence - all trail bytes have (b7..b6)==(10) */ 
 822                                 if (u32char 
>= utf8_minLegal
[count
]) { 
 826                                 /* no default branch to optimize switch()  - all values are covered */ 
 836  * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result 
 837  * and the combining classes for each resulting char, or indicate it is invalid. 
 839  * The normalized and case-folded result might be up to 4 UTF32 characters (current 
 840  * max, could change in the future). 
 842  * u32char - input UTF32 code point 
 843  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only 
 844  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) 
 845  *          to receive the normalize result. 
 846  * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) 
 847  *          to receive the combining classes for the characters in u32NormFoldBuf. If 
 848  *          the first entry has non-zero combining class, the remaining entries do too. 
 850  * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax 
 851  * is insufficient (though it is assumed to be at least 3), else the length of the 
 852  * normalized and case-folded result (currently in the range 1..4). 
 854  * This has been validated against ICU behavior. 
 856  * This function is highly dependent on the structure of the data trie; for details on 
 857  * that structure, see comments in normalizeCaseFoldData.h 
 861 normalizeOptCaseFoldU32Char(int32_t u32char
, bool case_sens
, 
 862     int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
], 
 863     uint8_t combClass
[kNFCSingleCharDecompMax
]) 
 866         /*  return hi-range PUA as self, except non-characters */ 
 867         if (u32char 
>= kU32HiPUAStart
) { 
 868                 if ((u32char 
& 0xFFFE) == 0xFFFE) { 
 871                 u32NormFoldBuf
[0] = u32char
; 
 874         /*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ 
 875         /*  0 - 0x313FF; everything in between in currently invalid. */ 
 876         int32_t u32charLookup 
= u32char
; 
 877         if (u32charLookup 
>= kU32LowRangeLimit
) { 
 878                 u32charLookup 
-= (kU32HiRangeStart 
- kU32LowRangeLimit
); 
 879                 if (u32charLookup 
< kU32LowRangeLimit 
|| u32charLookup 
>= (kU32LowRangeLimit 
+ kU32HiRangeLen
)) { 
 880                         return -1; /* in the large range of currently-unassigned code points */ 
 883         /* Now we have u32charLookup either in 0..0x313FF representing u32char itself, 
 884          *  or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in 
 885          *  the trie that identifies unassigneds in this range, or maps others to 
 886          *  decomps or combining class or just self. */ 
 889         trieValue 
= nfTrieHi
[u32charLookup 
>> kNFTrieHiShift
]; 
 890         if (trieValue 
== kInvalidCodeFlag
) { 
 893         if (trieValue 
== 0 || (trieValue 
& kFlagTestMask
) == kCombClassFlag
) { /*  return self; */ 
 894                 u32NormFoldBuf
[0] = u32char
; 
 895                 combClass
[0] = trieValue 
& kFlagValueMask
; 
 898         if (trieValue 
== kHangulMask
) { 
 899                 combClass
[1] = combClass
[2] = 0; 
 900                 return decomposeHangul(u32char
, u32NormFoldBuf
); 
 903         trieValue 
= nfTrieMid
[trieValue 
& kNextIndexValueMask
][(u32charLookup 
>> kNFTrieMidShift
) & kNFTrieMidMask
]; 
 904         if (trieValue 
== kInvalidCodeFlag
) { 
 907         if (trieValue 
== 0 || (trieValue 
& kFlagTestMask
) == kCombClassFlag
) { 
 908                 u32NormFoldBuf
[0] = u32char
; 
 909                 combClass
[0] = trieValue 
& kFlagValueMask
; 
 910                 return adjustCase(case_sens
, 1, u32NormFoldBuf
); 
 912         if ((trieValue 
& kFlagTestMask
) == kInvMaskFlag
) { 
 913                 uint16_t invalidMask 
= nfU16InvMasks
[trieValue 
& kFlagValueMask
]; 
 914                 uint16_t testBit 
= (uint16_t)(1 << (u32charLookup 
& kNFTrieLoMask
)); 
 915                 if (testBit 
& invalidMask
) { 
 919                         /* treat like trieValue == 0 above */ 
 920                         u32NormFoldBuf
[0] = u32char
; 
 921                         return adjustCase(case_sens
, 1, u32NormFoldBuf
);; 
 924         if (trieValue 
== kHangulMask
) { 
 925                 combClass
[1] = combClass
[2] = 0; 
 926                 return decomposeHangul(u32char
, u32NormFoldBuf
); 
 929         trieValue 
= nfTrieLo
[trieValue 
& kNextIndexValueMask
][u32charLookup 
& kNFTrieLoMask
]; 
 930         if (trieValue 
== kInvalidCodeFlag
) { 
 933         if (trieValue 
== kHangulMask
) { 
 934                 combClass
[1] = combClass
[2] = 0; 
 935                 return decomposeHangul(u32char
, u32NormFoldBuf
); 
 937         if (trieValue 
< kToU16Seq2Mask 
|| trieValue 
> kSpecialsEnd
) { 
 938                 if (trieValue 
== 0 || (trieValue 
& kFlagTestMask
) == kCombClassFlag
) { 
 939                         u32NormFoldBuf
[0] = u32char
; 
 940                         combClass
[0] = trieValue 
& kFlagValueMask
; 
 942                         u32NormFoldBuf
[0] = trieValue
; 
 944                 return adjustCase(case_sens
, 1, u32NormFoldBuf
);; 
 946         const uint16_t* u16SeqPtr 
= NULL
; 
 947         const int32_t*  u32SeqPtr 
= NULL
; 
 949         switch (trieValue 
& kSpecialsMask
) { 
 951                 if (case_sens 
&& (trieValue 
& kToSeqCaseFoldMask
)) { 
 952                         /* don't use the mapping, it is only for case folding */ 
 953                         u32NormFoldBuf
[0] = u32char
; 
 954                         /* already have combClass[0] = 0 */ 
 957                 u16SeqPtr 
= nfU16Seq2
[trieValue 
& kToSeqIndexMask
]; 
 961                 if (case_sens 
&& (trieValue 
& kToSeqCaseFoldMask
)) { 
 962                         /* don't use the mapping, it is only for case folding */ 
 963                         u32NormFoldBuf
[0] = u32char
; 
 964                         /* already have combClass[0] = 0 */ 
 967                 u16SeqPtr 
= nfU16Seq3
[trieValue 
& kToSeqIndexMask
]; 
 970         case kToU16SeqMiscMask
: 
 971                 u16SeqPtr 
= &nfU16SeqMisc
[trieValue 
& kToSeqMiscIndexMask
]; 
 972                 uSeqLen 
= *u16SeqPtr 
& kToSeqMiscLenMask
; 
 973                 combClass
[0] = (uint8_t)(*u16SeqPtr
++ >> kToSeqMiscCCShift
); 
 976                 if (case_sens 
&& (trieValue 
& kToSeqCaseFoldMask
)) { 
 977                         /* don't use the mapping, it is only for case folding */ 
 978                         u32NormFoldBuf
[0] = u32char
; 
 979                         /* already have combClass[0] = 0 */ 
 982                 u32SeqPtr 
= &nfU32Char
[trieValue 
& kToSeqIndexMask
]; 
 985         case kToU32SeqMiscMask
: 
 986                 u32SeqPtr 
= &nfU32SeqMisc
[trieValue 
& kToSeqMiscIndexMask
]; 
 987                 uSeqLen 
= *u32SeqPtr 
& kToSeqMiscLenMask
; 
 988                 combClass
[0] = (uint8_t)(*u32SeqPtr
++ >> kToSeqMiscCCShift
); 
 993         if (kNFCSingleCharDecompMax 
< uSeqLen
) { 
 997         for (idx 
= 0; idx 
< uSeqLen
; idx
++) { 
 998                 u32NormFoldBuf
[idx
] = (u16SeqPtr
)? *u16SeqPtr
++: *u32SeqPtr
++; 
1000                         combClass
[idx
] = getCombClassU32Char(u32NormFoldBuf
[idx
]); 
1003         return adjustCase(case_sens
, uSeqLen
, u32NormFoldBuf
); 
1007  * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding 
1009  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only 
1010  * uSeqLen - length of the sequence specified in the u32NormFoldBuf 
1011  * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3) 
1012  *          with normalized result. 
1014  * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax 
1015  * is insufficient (though it is assumed to be at least 3), else the length of the 
1016  * normalized and case-folded result (currently in the range 1..4). 
1018  * This function is a reduced version of normalizeOptCaseFoldU32Char above. 
1022 adjustCase(bool case_sens
, int32_t uSeqLen
, 
1023     int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]) 
1025         if (!case_sens 
&& uSeqLen 
> 0) { 
1026                 if (u32NormFoldBuf
[0] < kSimpleCaseFoldLimit
) { 
1027                         u32NormFoldBuf
[0] = nfBasicCF
[u32NormFoldBuf
[0]]; 
1028                         /* There is one case in which this maps to a character with different combining 
1029                          * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the 
1030                          * first or only character in the sequence, we want to keep the original 
1031                          * combining class, so nothing special to do here. 
1034                 /* The following is the only case where we have a casefolding after the first 
1035                  * character in the sequence. Don't worry about combining class here. that gets 
1036                  * set later for characters after the first. 
1038                 if (uSeqLen 
> 1 && u32NormFoldBuf
[uSeqLen 
- 1] == 0x0345) { 
1039                         u32NormFoldBuf
[uSeqLen 
- 1] = 0x03B9; 
1046  * getCombClassU32Char, map a single character (in UTF32 form) to its combining class. 
1048  * u32char - input UTF32 code point. This is assumed to be a valid character that does 
1049  * not have a decomposition. 
1051  * returns combining class of the character. 
1053  * This is only called for characters after the first is a decomposition expansion. In 
1054  * this situation, if we encounter U+03B9 (combining class 0), it is only there as the 
1055  * case-folding of U+0345 (combining class 240). In this case it is the combining class 
1056  * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all. 
1058  * This function is a reduced version of normalizeOptCaseFoldU32Char above. 
1061 getCombClassU32Char(int32_t u32char
) 
1063         if (u32char 
>= kU32HiPUAStart
) { 
1066         if (u32char 
== 0x03B9) { 
1069         /*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ 
1070         /*  0 - 0x313FF; everything in between in currently invalid. */ 
1071         int32_t u32charLookup 
= u32char
; 
1072         if (u32charLookup 
>= kU32LowRangeLimit
) { 
1073                 u32charLookup 
-= (kU32HiRangeStart 
- kU32LowRangeLimit
); 
1075         /* Now we have u32charLookup either in 0..0x313FF representing u32char itself, 
1076          *  or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in 
1077          *  the trie that identifies unassigneds in this range, or maps others to 
1078          *  decomps or combining class or just self. */ 
1081         trieValue 
= nfTrieHi
[u32charLookup 
>> kNFTrieHiShift
]; 
1082         if (trieValue 
== 0 || (trieValue 
& kFlagTestMask
) == kCombClassFlag
) { 
1083                 return trieValue 
& kFlagValueMask
; 
1086         trieValue 
= nfTrieMid
[trieValue 
& kNextIndexValueMask
][(u32charLookup 
>> kNFTrieMidShift
) & kNFTrieMidMask
]; 
1087         if (trieValue 
== 0 || (trieValue 
& kFlagTestMask
) == kCombClassFlag
) { /*  return self; */ 
1088                 return trieValue 
& kFlagValueMask
; 
1090         if ((trieValue 
& kFlagTestMask
) == kInvMaskFlag
) { 
1094         trieValue 
= nfTrieLo
[trieValue 
& kNextIndexValueMask
][u32charLookup 
& kNFTrieMidMask
]; 
1095         return ((trieValue 
& kFlagTestMask
) == kCombClassFlag
)? (trieValue 
& kFlagValueMask
): 0; 
1099  * decomposeHangul, map a single UTF32 code point for a composed Hangul 
1100  * in the range AC00-D7A3, using algorithmic decomp 
1102  * The normalized result will be 2 or 3 UTF32 characters. 
1104  * u32char - input UTF32 code point 
1105  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) 
1106  *          to receive the normalize result. 
1108  * returns the length of the normalized result (2..3). 
1110  * Adapted from ICU Hangul:decompose in normalizer2impl.h 
1116         JAMO_L_BASE
=0x1100,     /* "lead" jamo */ 
1117         JAMO_V_BASE
=0x1161,     /* "vowel" jamo */ 
1118         JAMO_T_BASE
=0x11A7,     /* "trail" jamo */ 
1125 decomposeHangul(int32_t u32char
, int32_t u32NormFoldBuf
[kNFCSingleCharDecompMax
]) 
1127         u32char 
-= HANGUL_BASE
; 
1128         int32_t tIndex 
= u32char 
% JAMO_T_COUNT
; 
1129         u32char 
/= JAMO_T_COUNT
; 
1130         u32NormFoldBuf
[0] = (uint16_t)(JAMO_L_BASE 
+ u32char 
/ JAMO_V_COUNT
); 
1131         u32NormFoldBuf
[1] = (uint16_t)(JAMO_V_BASE 
+ u32char 
% JAMO_V_COUNT
); 
1135         u32NormFoldBuf
[2] = (uint16_t)(JAMO_T_BASE 
+ tIndex
);