bsd/vfs/vfs_unicode.c

   1 /*
   2  * Copyright (C) 2016-2020 Apple, Inc. All rights reserved.
   3  * Some portions covered by other copyrights, listed below.
   4  *---
   5  * Copyright (C) 2016 and later: Unicode, Inc. and others.
   6  * License & terms of use: http://www.unicode.org/copyright.html
   7  *---
   8  * Copyright (C) 1999-2015, International Business Machines
   9  * Corporation and others.  All Rights Reserved.
  10  *
  11  * add APPLE_OSREFERENCE_LICENSE_HEADER stuff...
  12  */
  13
  14 #include <libkern/libkern.h>
  15 #include <sys/errno.h>
  16 #include <sys/unicode.h>
  17 #include "vfs_unicode_data.h"
  18 #define STATIC_UNLESS_TEST static
  19
  20 enum {
  21         /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */
  22         kMaxUTF8BytesPerChar = 4
  23 };
  24
  25 /* local prototypes used by exported functions (and themselves exported for testing) */
  26 STATIC_UNLESS_TEST
  27 int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit);
  28 STATIC_UNLESS_TEST
  29 int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
  30     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
  31     uint8_t combClass[kNFCSingleCharDecompMax]);
  32 /* local prototypes used by exported functions (not exported for separate testing) */
  33 static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens,
  34     int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
  35     int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
  36     bool* needReorderP, bool* startP);
  37 void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen);
  38 int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]);
  39
  40 /*
  41  * utf8_normalizeOptCaseFoldGetUVersion
  42  *
  43  * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
  44  * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
  45  * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
  46  * version[3] = Code revision level; for any given Unicode version, this value starts
  47  *              at 0 and is incremented for each significant revision to the
  48  *              normalizeOptCaseFold functions.
  49  */
  50 void
  51 utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4])
  52 {
  53         version[0] = 13;
  54         version[1] = 0;
  55         version[2] = 0;
  56         version[3] = 0;
  57         return;
  58 }
  59
  60 /*
  61  * utf8_normalizeOptCaseFoldAndHash
  62  *
  63  * str:       The input UTF-8 string (need not be 0 terminated)
  64  * str_len:   The byte length of the input string (excluding any 0 terminator)
  65  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
  66  *            True for case-sensitive behavior; generates standard NFD.
  67  * hash_func: A pointer to a hashing function to compute the hash of the
  68  *            normalized/case-folded result. buf contains buf_len bytes
  69  *            of data to be added to the hash using the caller-supplied
  70  *            context (ctx).
  71  * hash_ctx:  The context for the hash function.
  72  *
  73  * Returns: 0 on success, or
  74  *          EILSEQ: The input string contains illegal ASCII-range characters
  75  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
  76  *                  contains codepoints that are non-characters or unassigned in
  77  *                  the version of Unicode currently supported (Unicode 9.0).
  78  */
  79
  80 int
  81 utf8_normalizeOptCaseFoldAndHash(const char *str,
  82     size_t      str_len,
  83     bool        case_sens,
  84     void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
  85     void       *hash_ctx)
  86 {
  87         const char *strLimit = str + str_len;
  88
  89         /* Data for the next pending single-char norm from input;
  90          *  This will always begin with a base char (combining class 0)
  91          *  or the first character in the string, which may no be a base */
  92         int32_t unorm[kNFCSingleCharDecompMax];
  93         uint8_t unormcc[kNFCSingleCharDecompMax];
  94         int32_t unormlen = 0;
  95         int32_t unormstart = 0;
  96
  97         bool start = true;
  98
  99         /* main loop:
 100          * Each input character may be normalized to a sequence of one or more characters,
 101          * some of which may have non-zero combining class. Any sequence of characters
 102          * with non-zero combining class resulting from one or more input characters needs
 103          * to be accumulated in the main buffer so we can reorder as necessary before
 104          * calling the hash function.
 105          *
 106          * At the beginning of the main loop: The normalization buffer and main buffer are
 107          * both empty.
 108          *
 109          * Each time through the main loop we do the following:
 110          * 1. If there are characters available in the normalization result buffer (from the
 111          *    result of normalizing a previous input character), copy the first character and
 112          *    any following characters that have non-zero combining class to the main buffer.
 113          * 2. If there is nothing left in the normalization buffer, then loop processing
 114          *    input characters as follows:
 115          *   a) Get the next input character from UTF8, get its normalized and case-folded
 116          *      result in the normalization buffer.
 117          *   b) If the first character in the normalization buffer has combining class 0,
 118          *      break; we will handle this normalization buffer next time through the main
 119          *      loop.
 120          *   c) Else copy the current normalization buffer (which has only combining marks)
 121          *      to the main buffer, and continue with the loop processing input characters.
 122          * 3. At this point the first character in the main buffer may or may not have
 123          *    combining class 0, but any subsequent characters (up to the the limit for
 124          *    stream safe text) will be combining characters with nonzero combining class.
 125          *    Reorder the combining marks if necessary into canonical order.
 126          * 4. Call the hash function for each character in the main buffer.
 127          *
 128          */
 129         do {
 130                 /* Data for the buffers being built up from input */
 131                 int32_t buf[kNCFStreamSafeBufMax];
 132                 uint8_t bufcc[kNCFStreamSafeBufMax];
 133                 int32_t buflen = 0;
 134                 bool needReorder = false;
 135                 int err;
 136
 137                 err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart,
 138                     buf, bufcc, &buflen, &needReorder, &start);
 139                 if (err != 0) {
 140                         return err;
 141                 }
 142
 143                 if (buflen > 0) {
 144                         /* Now buffer should have all of the combining marks up to the next base char.
 145                          * Normally it will also start with the last base char encountered (unless the
 146                          * UTF8 string began with a combining mark). */
 147                         /* Now reorder combining marks if necessary. */
 148                         if (needReorder) {
 149                                 doReorder(buf, bufcc, buflen);
 150                         }
 151                         /* Now write to hash func */
 152                         hash_func(buf, buflen * sizeof(buf[0]), hash_ctx);
 153                 }
 154                 /* OK so far, top of loop clears buffers to start refilling again */
 155         } while (str < strLimit || unormlen > 0);
 156         return 0;
 157 }
 158
 159 /*
 160  * utf8_normalizeOptCaseFoldAndCompare
 161  *
 162  * strA:      A UTF-8 string to be compared (need not be 0 terminated)
 163  * strA_len:  The byte length of strA (excluding any 0 terminator)
 164  * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
 165  * strB_len:  The byte length of strB (excluding any 0 terminator)
 166  * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
 167  *            True for case-sensitive behavior; compares standard NFD forms.
 168  * are_equal: On success, set to true if the strings are equal, or set to false
 169  *            if they are not.
 170  *
 171  * Returns: 0 on success, or
 172  *          EILSEQ: One or both of the input strings contains illegal ASCII-range
 173  *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
 174  *                  or contains codepoints that are non-characters or unassigned in
 175  *                  the version of Unicode currently supported (Unicode 9.0).
 176  *                  Note: The comparison may terminate early when a difference is
 177  *                        detected, and may return 0 and set *are_equal=false even
 178  *                        if one or both strings are invalid.
 179  */
 180 enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */
 181
 182 int
 183 utf8_normalizeOptCaseFoldAndCompare(const char *strA,
 184     size_t      strA_len,
 185     const char *strB,
 186     size_t      strB_len,
 187     bool        case_sens,
 188     bool       *are_equal)
 189 {
 190         const char *strALimit = strA + strA_len;
 191         const char *strBLimit = strB + strB_len;
 192
 193         /* Data for the next pending single-char norms from each input;
 194          *  These will always begin with a base char (combining class 0)
 195          *  or the first character in the string, which may not be a base */
 196         int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback];
 197         uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback];
 198         int32_t unormAlen = 0, unormBlen = 0;
 199         int32_t unormAstart = 0, unormBstart = 0;
 200
 201         bool startA = true, startB = true;
 202
 203         /* main loop:
 204          * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash,
 205          * described above. The differences are:
 206          * - We keep a normalization buffer and main buffer for each string.
 207          * - In the main loop, we do steps 1-3 for each string.
 208          * - In step 4, instead of calling the hash function, we compare the two main
 209          *   buffers; if they are unequal, we return a non-equal result.
 210          * - After the end of the main loop, if we still have data for one string but
 211          *   not the other, return a non-equal result, else return an equal result.
 212          */
 213         do {
 214                 /* Data for the buffers being built up from each input */
 215                 int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax];
 216                 uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax];
 217                 int32_t bufAlen = 0, bufBlen = 0;
 218                 bool needReorderA = false, needReorderB = false;
 219                 int err;
 220
 221                 err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, unormA, unormAcc, &unormAlen, &unormAstart,
 222                     bufA, bufAcc, &bufAlen, &needReorderA, &startA);
 223                 if (err != 0) {
 224                         return err;
 225                 }
 226                 err = nextBaseAndAnyMarks(&strB, strBLimit, case_sens, unormB, unormBcc, &unormBlen, &unormBstart,
 227                     bufB, bufBcc, &bufBlen, &needReorderB, &startB);
 228                 if (err != 0) {
 229                         return err;
 230                 }
 231
 232                 if (bufAlen > 0 || bufBlen > 0) {
 233                         /* Now each buffer should have all of the combining marks up to the next base char.
 234                          * Normally it will also start with the last base char encountered (unless the
 235                          * UTF8 string began with a combining mark). */
 236                         /* Now reorder combining marks if necessary. */
 237                         if (needReorderA) {
 238                                 doReorder(bufA, bufAcc, bufAlen);
 239                         }
 240                         if (needReorderB) {
 241                                 doReorder(bufB, bufBcc, bufBlen);
 242                         }
 243                         /* handle 03B9 pushback */
 244                         int32_t idx;
 245                         if (!case_sens) {
 246                                 if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) {
 247                                         int32_t tailCount = 0;
 248                                         while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) {
 249                                                 tailCount++;
 250                                                 bufAlen--;
 251                                         }
 252                                         for (idx = unormAlen; idx > 0; idx--) {
 253                                                 unormA[idx - 1 + tailCount] = unormA[idx - 1];
 254                                                 unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1];
 255                                         }
 256                                         for (idx = 0; idx < tailCount; idx++) {
 257                                                 unormA[idx] = 0x03B9;
 258                                                 unormAcc[idx] = 0;
 259                                         }
 260                                         unormAlen += tailCount;
 261                                 }
 262                                 if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) {
 263                                         int32_t tailCount = 0;
 264                                         while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) {
 265                                                 tailCount++;
 266                                                 bufBlen--;
 267                                         }
 268                                         for (idx = unormBlen; idx > 0; idx--) {
 269                                                 unormB[idx - 1 + tailCount] = unormB[idx - 1];
 270                                                 unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1];
 271                                         }
 272                                         for (idx = 0; idx < tailCount; idx++) {
 273                                                 unormB[idx] = 0x03B9;
 274                                                 unormBcc[idx] = 0;
 275                                         }
 276                                         unormBlen += tailCount;
 277                                 }
 278                         }
 279                         /* Now compare the buffers. */
 280                         if (bufAlen != bufBlen || memcmp(bufA, bufB, bufAlen * sizeof(bufA[0])) != 0) {
 281                                 *are_equal = false;
 282                                 return 0;
 283                         }
 284                 }
 285                 /* OK so far, top of loop clears buffers to start refilling again */
 286         } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0));
 287
 288         *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0);
 289         return 0;
 290 }
 291
 292 /*
 293  * utf8_normalizeOptCaseFold
 294  *
 295  * str:       The input UTF-8 string (need not be 0 terminated)
 296  * str_len:   The byte length of the input string (excluding any 0 terminator)
 297  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 298  *            True for case-sensitive behavior; generates standard NFD.
 299  * ustr:      A pointer to a buffer for the resulting UTF-32 string.
 300  * ustr_size: The capacity of ustr, in UTF-32 units.
 301  * ustr_len:  Pointer to a value that will be filled in with the actual length
 302  *            in UTF-32 units of the string copied to ustr.
 303  *
 304  * Returns: 0 on success, or
 305  *          EILSEQ: The input string contains illegal ASCII-range characters
 306  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 307  *                  contains codepoints that are non-characters or unassigned in
 308  *                  the version of Unicode currently supported.
 309  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 310  *                  case the value returned in *ustr_len is invalid.
 311  */
 312 int
 313 utf8_normalizeOptCaseFold(const char *str,
 314     size_t      str_len,
 315     bool        case_sens,
 316     int32_t    *ustr,
 317     int32_t     ustr_size,
 318     int32_t    *ustr_len)
 319 {
 320         const char *strLimit = str + str_len;
 321         int32_t *ustrCur = ustr;
 322         const int32_t *ustrLimit = ustr + ustr_size;
 323
 324         /* Data for the next pending single-char norm from input;
 325          *  This will always begin with a base char (combining class 0) */
 326         int32_t unorm[kNFCSingleCharDecompMax];
 327         uint8_t unormcc[kNFCSingleCharDecompMax];
 328         int32_t unormlen = 0;
 329         int32_t unormstart = 0;
 330
 331         bool start = true;
 332
 333         *ustr_len = 0;
 334         do {
 335                 /* Data for the buffers being built up from input */
 336                 int32_t buf[kNCFStreamSafeBufMax];
 337                 uint8_t bufcc[kNCFStreamSafeBufMax];
 338                 int32_t buflen = 0;
 339                 bool needReorder = false;
 340                 int err;
 341
 342                 err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart,
 343                     buf, bufcc, &buflen, &needReorder, &start);
 344                 if (err != 0) {
 345                         return err;
 346                 }
 347
 348                 if (buflen > 0) {
 349                         if (needReorder) {
 350                                 doReorder(buf, bufcc, buflen);
 351                         }
 352                         /* Now copy to output buffer */
 353                         int32_t idx;
 354                         if (ustrCur + buflen > ustrLimit) {
 355                                 return ENOMEM;
 356                         }
 357                         for (idx = 0; idx < buflen; idx++) {
 358                                 *ustrCur++ = buf[idx];
 359                         }
 360                 }
 361                 /* OK so far, top of loop clears buffers to start refilling again */
 362         } while (str < strLimit || unormlen > 0);
 363         *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop
 364         return 0;
 365 }
 366
 367 /*
 368  * utf8_normalizeOptCaseFoldToUTF8
 369  * (This is similar to normalizeOptCaseFold except that this has a different output
 370  * buffer type, and adds conversion to UTF8 while copying to output buffer)
 371  *
 372  * str:       The input UTF-8 string (need not be 0 terminated)
 373  * str_len:   The byte length of the input string (excluding any 0 terminator)
 374  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 375  *            True for case-sensitive behavior; generates standard NFD.
 376  * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 377  * ustr_size: The capacity of ustr, in bytes.
 378  * ustr_len:  Pointer to a value that will be filled in with the actual length
 379  *            in bytes of the string copied to ustr.
 380  *
 381  * Returns: 0 on success, or
 382  *          EILSEQ: The input string contains illegal ASCII-range characters
 383  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 384  *                  contains codepoints that are non-characters or unassigned in
 385  *                  the version of Unicode currently supported.
 386  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 387  *                  case the value returned in *ustr_len is invalid.
 388  */
 389 int
 390 utf8_normalizeOptCaseFoldToUTF8(const char *str,
 391     size_t      str_len,
 392     bool        case_sens,
 393     char       *ustr,
 394     size_t      ustr_size,
 395     size_t     *ustr_len)
 396 {
 397         const char *strLimit = str + str_len;
 398         char *ustrCur = ustr;
 399         const char *ustrLimit = ustr + ustr_size;
 400
 401         /* Data for the next pending single-char norm from input;
 402          *  This will always begin with a base char (combining class 0) */
 403         int32_t unorm[kNFCSingleCharDecompMax];
 404         uint8_t unormcc[kNFCSingleCharDecompMax];
 405         int32_t unormlen = 0;
 406         int32_t unormstart = 0;
 407
 408         bool start = true;
 409
 410         *ustr_len = 0;
 411         do {
 412                 /* Data for the buffers being built up from input */
 413                 int32_t buf[kNCFStreamSafeBufMax];
 414                 uint8_t bufcc[kNCFStreamSafeBufMax];
 415                 int32_t buflen = 0;
 416                 bool needReorder = false;
 417                 int err;
 418
 419                 err = nextBaseAndAnyMarks(&str, strLimit, case_sens, unorm, unormcc, &unormlen, &unormstart,
 420                     buf, bufcc, &buflen, &needReorder, &start);
 421                 if (err != 0) {
 422                         return err;
 423                 }
 424
 425                 if (buflen > 0) {
 426                         uint8_t utf8Bytes[kMaxUTF8BytesPerChar];
 427                         int32_t *bufPtr = buf;
 428                         if (needReorder) {
 429                                 doReorder(buf, bufcc, buflen);
 430                         }
 431                         /* Now copy to output buffer */
 432                         while (buflen-- > 0) {
 433                                 int32_t idx, utf8Len = u32CharToUTF8Bytes((uint32_t)*bufPtr++, utf8Bytes);
 434                                 if (ustrCur + utf8Len > ustrLimit) {
 435                                         return ENOMEM;
 436                                 }
 437                                 for (idx = 0; idx < utf8Len; idx++) {
 438                                         *ustrCur++ = (char)utf8Bytes[idx];
 439                                 }
 440                         }
 441                 }
 442                 /* OK so far, top of loop clears buffers to start refilling again */
 443         } while (str < strLimit || unormlen > 0);
 444         *ustr_len = ustrCur - ustr;
 445         return 0;
 446 }
 447
 448 /*
 449  * utf8_normalizeOptCaseFoldAndMatchSubstring
 450  *
 451  * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
 452  *            substring specified by ustrB.
 453  * strA_len:  The byte length of strA (excluding any 0 terminator)
 454  * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
 455  *            for in the UTF-32 string resulting from converting strA to the normalized
 456  *            UTF-32 form specified by the case_sens parameter; ustrB must already be
 457  *            in that form.
 458  * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
 459  * case_sens: False for case-insensitive matching; compares canonical caseless forms.
 460  *            True for case-sensitive matching; compares standard NFD forms.
 461  * buf:       Pointer to caller-supplied working memory for storing the portion of
 462  *            strA which has been converted to normalized UTF-32.
 463  * buf_size:  The size of buf.
 464  * has_match: On success, set to true if strA (when converter to UTF-32 and normalized
 465  *            per case_sens) contains ustrB, set to false otherwise.
 466  *
 467  * Returns: 0 on success, or
 468  *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
 469  *                  not well-formed stream-safe UTF-8, or contains codepoints that are
 470  *                  non-characters or unassigned in the version of Unicode currently
 471  *                  supported.
 472  *                  Note: The search may terminate early when a match is detected, and
 473  *                        may return 0 and set *has_match=true even if strA is invalid.
 474  *          ENOMEM: buf_size is insufficient.
 475  */
 476 int
 477 utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA,
 478     size_t         strA_len,
 479     const int32_t *ustrB,
 480     int32_t        ustrB_len,
 481     bool           case_sens,
 482     void          *buf,
 483     size_t         buf_size,
 484     bool          *has_match)
 485 {
 486         /*
 487          * ustrA represents the current position in the UTF-32 normalized version of strA
 488          * at which we want to test for a match; ustrANormEnd is the position beyond that
 489          * which is just after the end of what has already been converted from strA to
 490          * UTF-32 normalized form.
 491          * Each time through the main loop:
 492          * - The first task is to make sure we have enough of strA converted to UTF32
 493          *   normalized form to test for match with ustrB at the current match position.
 494          *   If we don't, then convert more of strA to UTF-32 normalized form until we
 495          *   have enough to compare with ustrB. To do this, run a loop which is like the
 496          *   main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of
 497          *   calling the hash function, we copy the normalized buffer to ustrANormEnd,
 498          *   advancing the latter. We keep doing this until we have enough additional
 499          *   converted to match with ustrB.
 500          * - Then we test for match of ustrB at the current ustrA position. If there is
 501          *   a match we return; otherwise, if there is more strA to convert we advance
 502          *   ustrA  and repeat the main loop, otherwise we return without a match.
 503          */
 504         if (ustrB_len == 0) { /* always matches */
 505                 *has_match = true;
 506                 return 0;
 507         }
 508         *has_match = false; /* initialize return value */
 509         if (ustrB_len > 2 * strA_len) {
 510                 /* If ustrB is clearly too long to find in strA, don't bother normalizing strA.
 511                  * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit.
 512                  * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units.
 513                  * The maximum expansion from unnormalized UTF-8 byte length to normalized
 514                  *  UTF-32 unit length is thus 2. */
 515                 return 0;
 516         }
 517
 518         const char *strALimit = strA + strA_len;
 519         int32_t *ustrA = (int32_t *)buf;
 520         const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t));
 521         int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */
 522
 523         /* Data for the next pending single-char norms from each input;
 524          *  These will always begin with a base char (combining class 0)
 525          *  or the first character in the string, which may not be a base */
 526         int32_t unormA[kNFCSingleCharDecompMax];
 527         uint8_t unormAcc[kNFCSingleCharDecompMax];
 528         int32_t unormAlen = 0;
 529         int32_t unormAstart = 0;
 530
 531         bool startA = true;
 532
 533         while (true) {
 534                 /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */
 535                 if (ustrANormEnd - ustrA < ustrB_len) {
 536                         do {
 537                                 /* Data for the buffers being built up from each input */
 538                                 int32_t bufA[kNCFStreamSafeBufMax];
 539                                 uint8_t bufAcc[kNCFStreamSafeBufMax];
 540                                 int32_t bufAlen = 0;
 541                                 bool needReorderA = false;
 542                                 int err;
 543
 544                                 err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, unormA, unormAcc, &unormAlen, &unormAstart,
 545                                     bufA, bufAcc, &bufAlen, &needReorderA, &startA);
 546                                 if (err != 0) {
 547                                         return err;
 548                                 }
 549
 550                                 if (bufAlen > 0) {
 551                                         /* Now each buffer should have all of the combining marks up to the next base char.
 552                                          * Normally it will also start with the last base char encountered (unless the
 553                                          * UTF8 string began with a combining mark). */
 554                                         /* Now reorder combining marks if necessary. Should be rare, and sequences should
 555                                          * usually be short when does occur => simple bubblesort should be sufficient. */
 556                                         if (needReorderA) {
 557                                                 doReorder(bufA, bufAcc, bufAlen);
 558                                         }
 559                                         /* Now copy to working buffer */
 560                                         int32_t idx;
 561                                         if (ustrANormEnd + bufAlen > ustrALimit) {
 562                                                 return ENOMEM;
 563                                         }
 564                                         for (idx = 0; idx < bufAlen; idx++) {
 565                                                 *ustrANormEnd++ = bufA[idx];
 566                                         }
 567                                 }
 568                                 /* OK so far, top of loop clears buffers to start refilling again */
 569                         } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0));
 570                 }
 571
 572                 if (ustrANormEnd - ustrA < ustrB_len) {
 573                         return 0; /* not enough of strA left for match */
 574                 }
 575                 /* check for match, return if so */
 576                 if (memcmp(ustrA, ustrB, ustrB_len * sizeof(ustrB[0])) == 0) {
 577                         *has_match = true;
 578                         return 0;
 579                 }
 580                 ustrA++; /* advance match position */
 581         }
 582 }
 583
 584 /* nextBaseAndAnyMarks:
 585  * Guts of code to get next bufferful of base character (or first char in string)
 586  * and all trailing combining marks.
 587  * This is called each time through the main loop of functions above, and does the
 588  * following:
 589  * 1. If there are characters available in the normalization result buffer (from the
 590  *    result of normalizing a previous input character), copy the first character and
 591  *    any following characters that have non-zero combining class to the main buffer.
 592  * 2. If there is nothing left in the normalization buffer, then loop processing
 593  *    input characters as follows:
 594  *   a) Get the next input character from UTF8, get its normalized and case-folded
 595  *      result in the normalization buffer.
 596  *   b) If the first character in the normalization buffer has combining class 0,
 597  *      break; we will handle this normalization buffer next time through the main
 598  *      loop.
 599  *   c) Else copy the current normalization buffer (which has only combining marks)
 600  *      to the main buffer, and continue with the loop processing input characters.
 601  */
 602
 603 static int
 604 nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens,
 605     int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
 606     int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
 607     bool* needReorderP, bool* startP)
 608 {
 609         /* update buffers for str */
 610         if (*unormlenP > 0 && *unormstartP < *unormlenP) {
 611                 /* unorm begins with a base char; buflen should be 0 */
 612                 *needReorderP = false;
 613                 for (*buflenP = 0; true;) {
 614                         if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) {
 615                                 *needReorderP = true;
 616                         }
 617                         buf[*buflenP] = unorm[*unormstartP];
 618                         bufcc[(*buflenP)++] = unormcc[(*unormstartP)++];
 619                         if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) {
 620                                 break;
 621                         }
 622                 }
 623         }
 624         if (*unormstartP >= *unormlenP) {
 625                 *unormstartP = *unormlenP = 0;
 626                 while (*strP < strLimit) {
 627                         int32_t idx;
 628                         uint32_t bytevalue = (uint8_t)*(*strP)++;
 629                         /* '/' is not produced by NFD decomposition from another character so we can
 630                          * check for it before normalization */
 631                         if (bytevalue == 0 || bytevalue == 0x2F /*'/'*/) {
 632                                 return EILSEQ;
 633                         }
 634                         if (bytevalue < 0x80) {
 635                                 unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue;
 636                                 *unormlenP = 1;
 637                                 unormcc[0] = 0;
 638                                 *startP = false;
 639                                 break;
 640                         } else {
 641                                 int32_t u32char = utf8ToU32Code(bytevalue, strP, strLimit);
 642                                 if (u32char <= 0) {
 643                                         return EILSEQ;
 644                                 }
 645                                 *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, unorm, unormcc);
 646                                 if (*unormlenP <= 0) {
 647                                         return EILSEQ;
 648                                 }
 649                                 if (unormcc[0] == 0 || *startP) {
 650                                         *startP = false;
 651                                         break;
 652                                 }
 653                         }
 654                         /* the latest char decomposes to just combining sequence, add to buffer being built */
 655                         if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) {
 656                                 return EILSEQ;
 657                         }
 658                         for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) {
 659                                 if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) {
 660                                         *needReorderP = true;
 661                                 }
 662                                 buf[*buflenP] = unorm[idx];
 663                                 bufcc[*buflenP] = unormcc[idx];
 664                         }
 665                         *unormlenP = 0;
 666                 }
 667         }
 668         return 0;
 669 }
 670
 671 /*  local prototypes used only by internal functions */
 672 static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]);
 673 static int32_t adjustCase(bool case_sens, int32_t uSeqLen,
 674     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
 675 static uint8_t getCombClassU32Char(int32_t u32char);
 676 static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
 677
 678 /* Reorder combining marks if necessary. Should be rare, and sequences should
 679  * usually be short when does occur => simple bubblesort should be sufficient. */
 680 void
 681 doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen)
 682 {
 683         int32_t idx, jdx;
 684         for (idx = 0; idx < buflen - 1; idx++) {
 685                 for (jdx = buflen - 1; jdx > idx; jdx--) {
 686                         if (bufcc[jdx] < bufcc[jdx - 1]) {
 687                                 swapBufCharCCWithPrevious(jdx, buf, bufcc);
 688                         }
 689                 }
 690         }
 691 }
 692 /*  swap function for bubblesort */
 693 static void
 694 swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[])
 695 {
 696         int32_t bufchar = buf[jdx];
 697         uint8_t bufccval = bufcc[jdx];
 698         buf[jdx] = buf[jdx - 1];
 699         bufcc[jdx] = bufcc[jdx - 1];
 700         buf[jdx - 1] = bufchar;
 701         bufcc[jdx - 1] = bufccval;
 702 }
 703
 704 /*
 705  * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes,
 706  * and returns the number of UTF8 bytes.
 707  *
 708  * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h).
 709  */
 710 int32_t
 711 u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar])
 712 {
 713         int32_t idx = 0;
 714         if (u32char <= 0x7F) {
 715                 utf8Bytes[idx++] = (uint8_t)u32char;
 716         } else {
 717                 if (u32char <= 0x7FF) {
 718                         utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0);
 719                 } else {
 720                         if (u32char <= 0xFFFF) {
 721                                 utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0);
 722                         } else {
 723                                 utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0);
 724                                 utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80);
 725                         }
 726                         utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80);
 727                 }
 728                 utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80);
 729         }
 730         return idx;
 731 }
 732
 733 /* two macros adapted from ICU's utf8.h */
 734 #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \
 735 ((uint8_t)(leadByte)<0XF0 ? \
 736 ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \
 737 (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0)
 738
 739 #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
 740
 741 /* array adapted from ICU's utf_impl.c */
 742 static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 };
 743
 744 /*
 745  * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point
 746  *
 747  * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c);
 748  * verified to produce the same results (adusted for the difference in API signature).
 749  *
 750  * assumes at entry that:
 751  * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character
 752  *    has been read, and its value is in u32char
 753  * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly
 754  *    points to the trail bytes for that UTF8 char.
 755  * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer)
 756  *
 757  * For a valid and complete UTF8 character, the function returns its value and advances
 758  * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1
 759  * (and the value in *srcPtr is undefined).
 760  * Note that while it does not map to surrogate values (generates an error for malformed
 761  * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values
 762  * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error.
 763  *
 764  * equivalences used in adapted ICU code:
 765  * UChar = uint16_t
 766  * UChar32 = int32_t
 767  *
 768  * This has been validated against ICU behavior.
 769  */
 770 STATIC_UNLESS_TEST
 771 int32_t
 772 utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit)
 773 {
 774         const char* src = *srcPtr;
 775         uint8_t pt1, pt2;
 776         if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) {
 777                 /* handle U+1000..U+CFFF */
 778                 /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */
 779                 u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2);
 780                 src += 2;
 781         } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) {
 782                 /* handle U+0080..U+07FF */
 783                 u32char = ((u32char & 0x1F) << 6) | pt1;
 784                 src++;
 785         } else {
 786                 /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */
 787                 uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char);
 788                 if (src + count <= srcLimit) {
 789                         uint8_t trail;
 790
 791                         U8_MASK_LEAD_BYTE_LOC(u32char, count);
 792                         switch (count) {
 793                         /* branches 3, 2 fall through to the next one */
 794                         case 0:         /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */
 795                         case 5:
 796                         case 4:          /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
 797                                 break;
 798                         case 3:
 799                                 trail = *src++ - 0X80;
 800                                 u32char = (u32char << 6) | trail;
 801                                 /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */
 802                                 if (u32char >= 0x110 || trail > 0X3F) {
 803                                         break;
 804                                 }
 805                         case 2:
 806                                 trail = *src++ - 0X80;
 807                                 u32char = (u32char << 6) | trail;
 808                                 /*
 809                                  * test for a surrogate D800..DFFF:
 810                                  * before the last (u32char<<6), a surrogate is u32char=360..37F
 811                                  */
 812                                 if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) {
 813                                         break;
 814                                 }
 815                         case 1:
 816                                 trail = *src++ - 0X80;
 817                                 u32char = (u32char << 6) | trail;
 818                                 if (trail > 0X3F) {
 819                                         break;
 820                                 }
 821                                 /* correct sequence - all trail bytes have (b7..b6)==(10) */
 822                                 if (u32char >= utf8_minLegal[count]) {
 823                                         *srcPtr = src;
 824                                         return u32char;
 825                                 }
 826                                 /* no default branch to optimize switch()  - all values are covered */
 827                         }
 828                 }
 829                 u32char = -1;
 830         }
 831         *srcPtr = src;
 832         return u32char;
 833 }
 834
 835 /*
 836  * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result
 837  * and the combining classes for each resulting char, or indicate it is invalid.
 838  *
 839  * The normalized and case-folded result might be up to 4 UTF32 characters (current
 840  * max, could change in the future).
 841  *
 842  * u32char - input UTF32 code point
 843  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
 844  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
 845  *          to receive the normalize result.
 846  * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
 847  *          to receive the combining classes for the characters in u32NormFoldBuf. If
 848  *          the first entry has non-zero combining class, the remaining entries do too.
 849  *
 850  * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
 851  * is insufficient (though it is assumed to be at least 3), else the length of the
 852  * normalized and case-folded result (currently in the range 1..4).
 853  *
 854  * This has been validated against ICU behavior.
 855  *
 856  * This function is highly dependent on the structure of the data trie; for details on
 857  * that structure, see comments in normalizeCaseFoldData.h
 858  */
 859 STATIC_UNLESS_TEST
 860 int32_t
 861 normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
 862     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
 863     uint8_t combClass[kNFCSingleCharDecompMax])
 864 {
 865         combClass[0] = 0;
 866         /*  return hi-range PUA as self, except non-characters */
 867         if (u32char >= kU32HiPUAStart) {
 868                 if ((u32char & 0xFFFE) == 0xFFFE) {
 869                         return -1;
 870                 }
 871                 u32NormFoldBuf[0] = u32char;
 872                 return 1;
 873         }
 874         /*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
 875         /*  0 - 0x313FF; everything in between in currently invalid. */
 876         int32_t u32charLookup = u32char;
 877         if (u32charLookup >= kU32LowRangeLimit) {
 878                 u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
 879                 if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) {
 880                         return -1; /* in the large range of currently-unassigned code points */
 881                 }
 882         }
 883         /* Now we have u32charLookup either in 0..0x313FF representing u32char itself,
 884          *  or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in
 885          *  the trie that identifies unassigneds in this range, or maps others to
 886          *  decomps or combining class or just self. */
 887         uint16_t trieValue;
 888         /*  TrieHi */
 889         trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
 890         if (trieValue == kInvalidCodeFlag) {
 891                 return -1;
 892         }
 893         if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /*  return self; */
 894                 u32NormFoldBuf[0] = u32char;
 895                 combClass[0] = trieValue & kFlagValueMask;
 896                 return 1;
 897         }
 898         if (trieValue == kHangulMask) {
 899                 combClass[1] = combClass[2] = 0;
 900                 return decomposeHangul(u32char, u32NormFoldBuf);
 901         }
 902         /*  TrieMid */
 903         trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
 904         if (trieValue == kInvalidCodeFlag) {
 905                 return -1;
 906         }
 907         if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
 908                 u32NormFoldBuf[0] = u32char;
 909                 combClass[0] = trieValue & kFlagValueMask;
 910                 return adjustCase(case_sens, 1, u32NormFoldBuf);
 911         }
 912         if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
 913                 uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask];
 914                 uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask));
 915                 if (testBit & invalidMask) {
 916                         /* invalid */
 917                         return -1;
 918                 } else {
 919                         /* treat like trieValue == 0 above */
 920                         u32NormFoldBuf[0] = u32char;
 921                         return adjustCase(case_sens, 1, u32NormFoldBuf);;
 922                 }
 923         }
 924         if (trieValue == kHangulMask) {
 925                 combClass[1] = combClass[2] = 0;
 926                 return decomposeHangul(u32char, u32NormFoldBuf);
 927         }
 928         /*  TrieLo */
 929         trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask];
 930         if (trieValue == kInvalidCodeFlag) {
 931                 return -1;
 932         }
 933         if (trieValue == kHangulMask) {
 934                 combClass[1] = combClass[2] = 0;
 935                 return decomposeHangul(u32char, u32NormFoldBuf);
 936         }
 937         if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) {
 938                 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
 939                         u32NormFoldBuf[0] = u32char;
 940                         combClass[0] = trieValue & kFlagValueMask;
 941                 } else {
 942                         u32NormFoldBuf[0] = trieValue;
 943                 }
 944                 return adjustCase(case_sens, 1, u32NormFoldBuf);;
 945         }
 946         const uint16_t* u16SeqPtr = NULL;
 947         const int32_t*  u32SeqPtr = NULL;
 948         int32_t         uSeqLen = 0;
 949         switch (trieValue & kSpecialsMask) {
 950         case kToU16Seq2Mask:
 951                 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
 952                         /* don't use the mapping, it is only for case folding */
 953                         u32NormFoldBuf[0] = u32char;
 954                         /* already have combClass[0] = 0 */
 955                         return 1;
 956                 }
 957                 u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask];
 958                 uSeqLen = 2;
 959                 break;
 960         case kToU16Seq3Mask:
 961                 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
 962                         /* don't use the mapping, it is only for case folding */
 963                         u32NormFoldBuf[0] = u32char;
 964                         /* already have combClass[0] = 0 */
 965                         return 1;
 966                 }
 967                 u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask];
 968                 uSeqLen = 3;
 969                 break;
 970         case kToU16SeqMiscMask:
 971                 u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask];
 972                 uSeqLen = *u16SeqPtr & kToSeqMiscLenMask;
 973                 combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift);
 974                 break;
 975         case kToU32CharMask:
 976                 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
 977                         /* don't use the mapping, it is only for case folding */
 978                         u32NormFoldBuf[0] = u32char;
 979                         /* already have combClass[0] = 0 */
 980                         return 1;
 981                 }
 982                 u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask];
 983                 uSeqLen = 1;
 984                 break;
 985         case kToU32SeqMiscMask:
 986                 u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask];
 987                 uSeqLen = *u32SeqPtr & kToSeqMiscLenMask;
 988                 combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift);
 989                 break;
 990         default:
 991                 return -1;
 992         }
 993         if (kNFCSingleCharDecompMax < uSeqLen) {
 994                 return 0;
 995         }
 996         int32_t idx;
 997         for (idx = 0; idx < uSeqLen; idx++) {
 998                 u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++;
 999                 if (idx > 0) {
1000                         combClass[idx] = getCombClassU32Char(u32NormFoldBuf[idx]);
1001                 }
1002         }
1003         return adjustCase(case_sens, uSeqLen, u32NormFoldBuf);
1004 }
1005
1006 /*
1007  * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding
1008  *
1009  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
1010  * uSeqLen - length of the sequence specified in the u32NormFoldBuf
1011  * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1012  *          with normalized result.
1013  *
1014  * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
1015  * is insufficient (though it is assumed to be at least 3), else the length of the
1016  * normalized and case-folded result (currently in the range 1..4).
1017  *
1018  * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1019  */
1020
1021 static int32_t
1022 adjustCase(bool case_sens, int32_t uSeqLen,
1023     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1024 {
1025         if (!case_sens && uSeqLen > 0) {
1026                 if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) {
1027                         u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]];
1028                         /* There is one case in which this maps to a character with different combining
1029                          * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the
1030                          * first or only character in the sequence, we want to keep the original
1031                          * combining class, so nothing special to do here.
1032                          */
1033                 }
1034                 /* The following is the only case where we have a casefolding after the first
1035                  * character in the sequence. Don't worry about combining class here. that gets
1036                  * set later for characters after the first.
1037                  */
1038                 if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) {
1039                         u32NormFoldBuf[uSeqLen - 1] = 0x03B9;
1040                 }
1041         }
1042         return uSeqLen;
1043 }
1044
1045 /*
1046  * getCombClassU32Char, map a single character (in UTF32 form) to its combining class.
1047  *
1048  * u32char - input UTF32 code point. This is assumed to be a valid character that does
1049  * not have a decomposition.
1050  *
1051  * returns combining class of the character.
1052  *
1053  * This is only called for characters after the first is a decomposition expansion. In
1054  * this situation, if we encounter U+03B9 (combining class 0), it is only there as the
1055  * case-folding of U+0345 (combining class 240). In this case it is the combining class
1056  * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all.
1057  *
1058  * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1059  */
1060 static uint8_t
1061 getCombClassU32Char(int32_t u32char)
1062 {
1063         if (u32char >= kU32HiPUAStart) {
1064                 return 0;
1065         }
1066         if (u32char == 0x03B9) {
1067                 return 240;
1068         }
1069         /*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
1070         /*  0 - 0x313FF; everything in between in currently invalid. */
1071         int32_t u32charLookup = u32char;
1072         if (u32charLookup >= kU32LowRangeLimit) {
1073                 u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
1074         }
1075         /* Now we have u32charLookup either in 0..0x313FF representing u32char itself,
1076          *  or in 0x31400..0x315FF representing u32char 0xE0000..0xE01FF; look it up in
1077          *  the trie that identifies unassigneds in this range, or maps others to
1078          *  decomps or combining class or just self. */
1079         uint16_t trieValue;
1080         /*  TrieHi */
1081         trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
1082         if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
1083                 return trieValue & kFlagValueMask;
1084         }
1085         /*  TrieMid */
1086         trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
1087         if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /*  return self; */
1088                 return trieValue & kFlagValueMask;
1089         }
1090         if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
1091                 return 0;
1092         }
1093         /*  TrieLo */
1094         trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask];
1095         return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0;
1096 }
1097
1098 /*
1099  * decomposeHangul, map a single UTF32 code point for a composed Hangul
1100  * in the range AC00-D7A3, using algorithmic decomp
1101  *
1102  * The normalized result will be 2 or 3 UTF32 characters.
1103  *
1104  * u32char - input UTF32 code point
1105  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1106  *          to receive the normalize result.
1107  *
1108  * returns the length of the normalized result (2..3).
1109  *
1110  * Adapted from ICU Hangul:decompose in normalizer2impl.h
1111  *
1112  */
1113
1114 enum {
1115         HANGUL_BASE=0xAC00,
1116         JAMO_L_BASE=0x1100,     /* "lead" jamo */
1117         JAMO_V_BASE=0x1161,     /* "vowel" jamo */
1118         JAMO_T_BASE=0x11A7,     /* "trail" jamo */
1119         JAMO_L_COUNT=19,
1120         JAMO_V_COUNT=21,
1121         JAMO_T_COUNT=28,
1122 };
1123
1124 static int32_t
1125 decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1126 {
1127         u32char -= HANGUL_BASE;
1128         int32_t tIndex = u32char % JAMO_T_COUNT;
1129         u32char /= JAMO_T_COUNT;
1130         u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT);
1131         u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT);
1132         if (tIndex == 0) {
1133                 return 2;
1134         }
1135         u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex);
1136         return 3;
1137 }