+/* The following code (inside NEW_XATTR) was ported from apfs. */
+#if NEW_XATTR
+/*
+ * This is the same as fext2cluster_check(), except that a failed translation
+ * is asserted against instead of being returned to the caller.
+ *
+ * This is useful for places which hold the invariant that `fext' is already
+ * representable when translated, and which additionally want to assert that
+ * this is the case.
+ */
+static uint64_t
+fext2cluster(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize)
+{
+ uint64_t off;
+ const bool ok = fext2cluster_check(fext, fs_bsize, &off);
+ xattr_assert(ok);
+ return off;
+}
+
+/*
+ * Translate `fext' to a cluster layer virtual offset, set via `out', that is
+ * suitable for IO to the single xattr vnode.
+ *
+ * For any particular file extent, this will map to a unique logical offset
+ * that may be used to key into the ubc. The returned value has the property
+ * that file extents on adjacent physical blocks always map to different
+ * page-sized multiples. Internally, this just multiplies by the
+ * larger of the pagesize and the blocksize (see xattr_cluster_scale() for the
+ * derivation). For further details on the importance of this in the overall
+ * scheme of xattr IO, see uio_set_fext().
+ *
+ * This returns true if `fext' is representable as a cluster layer virtual
+ * offset. It may return false for corrupted file extents:
+ * - extents with an invalid (zero) start block
+ * - extents that likely extend beyond the size of the underlying drive
+ *
+ * The second point should not happen under normal circumstances even for large
+ * drives. Large drives (by the logic in hfs_newfs()) are automatically
+ * formatted to use large block sizes: drives sized >= 16TB use 16kiB fs
+ * blocksize, at which point the virtual offset computed is equal to the device
+ * offset (even on a 16kiB pagesize system).
+ * So as long as a drive does not exceed 2^63 bytes in capacity (which is the
+ * precision of `off_t'), the internal multiplication should not overflow.
+ */
+static bool
+fext2cluster_check(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize,
+ uint64_t *out)
+{
+ const uint64_t pbn = fext->startBlock;
+
+ // the xattr has an invalid start block
+ if (!pbn) {
+ return false;
+ }
+
+ // scale pbn
+ uint64_t off;
+ if (__builtin_mul_overflow(pbn, xattr_cluster_scale(fs_bsize), &off)) {
+ return false;
+ }
+
+ *out = off;
+
+ // the whole fext should be in range; widen to 64 bits before multiplying
+ // so the uint32_t product cannot silently wrap
+ if (__builtin_add_overflow(off, (uint64_t)fext->blockCount * fs_bsize,
+ &off)) {
+ return false;
+ }
+
+ // don't exceed signed 64bit precision
+ if (off > INT64_MAX) {
+ return false;
+ }
+
+ return true;
+}
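+
+/*
+ * An illustrative caller of the check above (a sketch, not code from this
+ * file; the error value is hypothetical):
+ *
+ * uint64_t off;
+ * if (!fext2cluster_check(fext, fs_bsize, &off)) {
+ * return EILSEQ; // corrupt on-disk extent
+ * }
+ * // `off' may now be used to key into the single xattr vnode's ubc
+ */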
+
+/*
+ * Return the scale factor for translating xattr physical block numbers into
+ * a logical offset into the single xattr vnode's ubc.
+ *
+ * Translated blocks have two key requirements:
+ * - They must not overlap.
+ * Otherwise two different blocks' contents will collide.
+ * - They must not fall within the same page.
+ * Otherwise reading a whole page may pull in multiple xattrs' blocks, but
+ * decrypt them all with only one xattr's key.
+ *
+ * A table of all possible configurations:
+ * pagesize, fs blocksize, scale
+ * 4k, 4k, 4k
+ * 4k, 8k, 8k
+ * ...
+ * 4k, 64k, 64k
+ * 16k, 4k, 16k
+ * 16k, 8k, 16k
+ * 16k, 16k, 16k
+ * 16k, 32k, 32k
+ * 16k, 64k, 64k
+ *
+ * This may be expressed as
+ * scale = max(pagesize, fs blocksize)
+ */
+static uint32_t
+xattr_cluster_scale(uint32_t fs_bsize)
+{
+ return MAX(PAGE_SIZE, fs_bsize);
+}
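+
+/*
+ * A worked example of the scaling (numbers are illustrative): with a 16kiB
+ * pagesize and a 4kiB fs blocksize, the scale is 16kiB, so a fext with
+ * startBlock 6 translates to virtual offset 6 * 16384 = 96kiB. The adjacent
+ * physical block 7 maps to 112kiB, a different page, so the two can never be
+ * cached in the same 16kiB page.
+ */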
+
+/*
+ * See off_in_range().
+ */
+static bool
+off_in_range2(uint64_t off, uint64_t start, uint64_t len)
+{
+ return (start <= off) && (off < (start + len));
+}
+
+/*
+ * Return true if `off' returned from fext2cluster() was produced from
+ * `fext'.
+ */
+static bool
+cluster_off_in_fext(uint64_t off, const HFSPlusExtentDescriptor *fext,
+ uint32_t fs_bsize)
+{
+ return off_in_range2(off, fext2cluster(fext, fs_bsize),
+ (uint64_t)fext->blockCount * fs_bsize);
+}
+
+/*
+ * Allocate space to save a file extent reference for xattr IO.
+ *
+ * This provides a mechanism to communicate file extents from top level, stream
+ * based xattr IO functions down into lower level IO functions
+ * (_blockmap() and _strategy()).
+ *
+ * This doesn't really return a file extent; it returns a reference into
+ * `info->xattr_fexts' which may later be pointed at a file extent.
+ * Alternatively this could just return an integral index, but then we'd
+ * need some way to signal failure.
+ *
+ * Note: the returned reference cannot be assigned directly; it must be set
+ * via xattr_fext_set() to correctly synchronize with a racing call to
+ * xattr_fext_find().
+ *
+ * This call will not block; it will return NULL if no free slots are
+ * available. On success, the slot must eventually be released with a call to
+ * xattr_fext_free().
+ *
+ * In terms of the implementation, this is a basic zone allocator for
+ * thread-local storage in disguise. It supports at most one file extent per
+ * thread, for up to 64 concurrent threads.
+ * For further details, see the documentation for each field above the
+ * definition of `xattr_io_info_t'.
+ */
+static const HFSPlusExtentDescriptor **
+xattr_fext_alloc(xattr_io_info_t *info)
+{
+ const HFSPlusExtentDescriptor **ret;
+ size_t i;
+
+ // search for the first free bit
+ lck_spin_lock(&info->lock);
+ if (!xbm_find_free(info, &i)) {
+ // no free fexts
+ ret = NULL;
+ goto out;
+ }
+
+ // mark that position as allocated
+ xbm_set_used(info, i);
+ ret = &info->xattr_fexts[i];
+ xattr_assert(!*ret);
+
+out:
+ lck_spin_unlock(&info->lock);
+ return ret;
+}
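+
+/*
+ * A sketch of the intended slot lifecycle (hypothetical caller; IO and error
+ * handling elided):
+ *
+ * const HFSPlusExtentDescriptor **xfext = xattr_fext_alloc(info);
+ * if (!xfext) {
+ * return ENOMEM; // all slots are in use
+ * }
+ * xattr_fext_set(info, xfext, fext);
+ * // ... issue IO; _strategy() may find `fext' via hfs_xattr_fext_find() ...
+ * xattr_fext_free(info, xfext);
+ */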
+
+/*
+ * Free the memory associated with an xattr fext reference returned from
+ * xattr_fext_alloc().
+ * This simply amounts to clearing the bit within `info->free_bm' that
+ * corresponds to `xfext'. While not strictly necessary, we also clear out the
+ * xattr fext itself to hold the invariant that a clear bit within the free
+ * bitmap has a corresponding NULL fext reference.
+ */
+static void
+xattr_fext_free(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+ lck_spin_lock(&info->lock);
+ const size_t i = xattr_fext_index(info, xfext);
+ xbm_clear_used(info, i);
+ info->xattr_fexts[i] = NULL;
+ lck_spin_unlock(&info->lock);
+}
+
+/*
+ * Given an allocated xattr fext from xattr_fext_alloc(), assign it to
+ * reference `fext'. A copy of this fext may be returned by a subsequent call to
+ * xattr_fext_find().
+ *
+ * This may be called multiple times for the same value of `xfext'.
+ * `fext' will be borrowed until a subsequent call to xattr_fext_set() for a
+ * different file extent or xattr_fext_free() for `xfext'. It must have a
+ * lifetime that spans from when it is first set until it is cleared by either
+ * xattr_fext_free() or xattr_fext_clear().
+ *
+ * Note: `fext' may be introspected by other threads via xattr_fext_find()
+ * (and in terms of getxattr(), two threads may use each other's file extents
+ * if they race to read the same xattr).
+ */
+static void
+xattr_fext_set(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext,
+ const HFSPlusExtentDescriptor *fext)
+{
+ xattr_assert(xbm_valid_index(info, xattr_fext_index(info, xfext)));
+ lck_spin_lock(&info->lock);
+ *xfext = fext;
+ lck_spin_unlock(&info->lock);
+}
+
+/*
+ * Given a cluster layer virtual offset, attempt to look up a file extent set
+ * via a previous call to xattr_fext_set().
+ *
+ * If such a fext is found, its value is copied to `out' and true is returned.
+ * Note: `out' should reference wired memory: it will be stored to while a spin
+ * lock is held; accesses must not fault.
+ *
+ * On success, `*off_out' will contain the "unvirtualized" offset: the device
+ * byte offset corresponding to the cluster layer virtual offset `off'.
+ */
+bool
+hfs_xattr_fext_find(xattr_io_info_t *info, uint32_t fs_bsize, uint64_t off,
+ HFSPlusExtentDescriptor *out, uint64_t *off_out)
+{
+ bool found = false;
+ lck_spin_lock(&info->lock);
+
+ // search through all in-use fexts
+ xbm_iter_t iter = xbm_make_iter(info->free_bm, xbm_size(info));
+ while (xbm_iter_next(&iter)) {
+ const HFSPlusExtentDescriptor *fext = info->xattr_fexts[xbm_iter_peek(&iter)];
+ if (!fext || !cluster_off_in_fext(off, fext, fs_bsize)) {
+ continue;
+ }
+ // `off' intersects; return `fext'
+ *out = *fext;
+ found = true;
+ break;
+ }
+
+ lck_spin_unlock(&info->lock);
+
+ if (found) {
+ *off_out = ((uint64_t)out->startBlock * fs_bsize) +
+ off - fext2cluster(out, fs_bsize);
+ }
+
+ return found;
+}
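+
+/*
+ * A worked example of the "unvirtualization" above (illustrative numbers):
+ * with a 16kiB pagesize, a 4kiB fs blocksize, and a matching fext of
+ * { .startBlock = 4, .blockCount = 2 }, fext2cluster() yields
+ * 4 * 16384 = 65536. A lookup at virtual offset 69632 then computes
+ * *off_out = 4 * 4096 + (69632 - 65536) = 20480,
+ * i.e. the device byte offset of pbn 5, the second block of the extent.
+ */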
+
+/*
+ * Given an allocated xattr fext, clear its reference to any `fext' passed to a
+ * previous call to xattr_fext_set().
+ *
+ * This will end the lifetime of such a fext and prevent xattr_fext_find() from
+ * taking a reference to it. From here, its backing memory can be deallocated.
+ *
+ * Unlike xattr_fext_free(), `xfext' will remain allocated and may be passed
+ * to xattr_fext_set() again.
+ */
+static void
+xattr_fext_clear(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+ xattr_assert(xbm_valid_index(info, xattr_fext_index(info, xfext)));
+ lck_spin_lock(&info->lock);
+ *xfext = NULL;
+ lck_spin_unlock(&info->lock);
+}
+
+/*
+ * For an xattr file extent, `xfext', returned from a previous call to
+ * xattr_fext_alloc(), return its index within `info->xattr_fexts'.
+ */
+static size_t
+xattr_fext_index(const xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+ xattr_assert((info->xattr_fexts <= xfext) &&
+ (xfext < &info->xattr_fexts[xbm_size(info)]));
+ return ((uintptr_t)xfext - (uintptr_t)info->xattr_fexts) / sizeof(*xfext);
+}
+
+static void
+bitmap_set_range(uint64_t *bm, int64_t index, int64_t count)
+{
+ int dstshift0, dstshift1;
+ uint64_t dstmask0, dstmask1;
+ int64_t bmi;
+
+ dstshift0 = index % 64;
+ dstshift1 = 64 - (index % 64);
+ dstmask0 = ~0ULL << (index % 64);
+ dstmask1 = (dstshift1 == 64) ? 0ULL : (~0ULL >> (64 - (index % 64)));
+
+ bmi = index / 64;
+ while (count >= 64) {
+ bm[bmi] = (bm[bmi] & ~dstmask0) | ((~0ULL << dstshift0) & dstmask0);
+ if (dstmask1)
+ bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((~0ULL >> dstshift1) & dstmask1);
+ bmi++;
+ count -= 64;
+ }
+ if (count) {
+ // adjust dstmask to cover just the bits remaining
+ dstmask0 = ((1ULL << count) - 1) << (index % 64);
+ dstmask1 = (dstshift1 == 64) ? 0ULL : (((1ULL << count) - 1) >> (64 - (index % 64)));
+ bm[bmi] = (bm[bmi] & ~dstmask0) | ((~0ULL << dstshift0) & dstmask0);
+ if ((count > (64 - dstshift0)) && dstmask1)
+ bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((~0ULL >> dstshift1) & dstmask1);
+ }
+}
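+
+/*
+ * For example (illustrative), bitmap_set_range(bm, 62, 4) sets bits 62-63 of
+ * bm[0] via `dstmask0' and bits 0-1 of bm[1] via `dstmask1'; the two masks
+ * exist precisely so that a range may straddle a 64-bit word boundary.
+ * bitmap_clear_range() below performs the same walk with a zero source,
+ * clearing the masked bits instead of setting them.
+ */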
+
+static void
+bitmap_clear_range(uint64_t *bm, int64_t index, int64_t count)
+{
+ int dstshift0, dstshift1;
+ uint64_t dstmask0, dstmask1;
+ int64_t bmi;
+
+ dstshift0 = index % 64;
+ dstshift1 = 64 - (index % 64);
+ dstmask0 = ~0ULL << (index % 64);
+ dstmask1 = (dstshift1 == 64) ? 0ULL : (~0ULL >> (64 - (index % 64)));
+
+ bmi = index / 64;
+ while (count >= 64) {
+ bm[bmi] = (bm[bmi] & ~dstmask0) | ((0ULL << dstshift0) & dstmask0);
+ if (dstmask1)
+ bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((0ULL >> dstshift1) & dstmask1);
+ bmi++;
+ count -= 64;
+ }
+ if (count) {
+ // adjust dstmask to cover just the bits remaining
+ dstmask0 = ((1ULL << count) - 1) << (index % 64);
+ dstmask1 = (dstshift1 == 64) ? 0ULL : (((1ULL << count) - 1) >> (64 - (index % 64)));
+ bm[bmi] = (bm[bmi] & ~dstmask0) | ((0ULL << dstshift0) & dstmask0);
+ if ((count > (64 - dstshift0)) && dstmask1)
+ bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((0ULL >> dstshift1) & dstmask1);
+ }
+}
+
+static int
+ctzll(uint64_t val)
+{
+ return (val == 0) ? 64 : __builtin_ctzll(val);
+}
+
+// search forward through the range for the first bit equal to `bit' (0 or 1);
+// return 1 and set `*clear_index' to its index if found, otherwise return 0
+static int
+bitmap_range_find_first(int bit, const uint64_t *bm, int64_t index, int64_t count, int64_t *clear_index)
+{
+ int64_t bmi, check_count;
+ uint64_t val;
+ int pos;
+
+ bmi = index / 64;
+ while (count > 0) {
+ check_count = 64 - (index % 64);
+ if (check_count > count)
+ check_count = count;
+ val = bm[bmi] >> (index % 64);
+ if (!bit)
+ val = ~val;
+ pos = ctzll(val);
+ if (pos < check_count) {
+ *clear_index = index + pos;
+ return 1;
+ }
+ index += check_count;
+ count -= check_count;
+ bmi++;
+ }
+ return 0;
+}
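+
+/*
+ * For example (illustrative): with bm[0] == 0x30 (bits 4 and 5 set),
+ * bitmap_range_find_first(1, bm, 0, 64, &i) returns 1 with i == 4, while
+ * bitmap_range_find_first(1, bm, 6, 58, &i) returns 0 since no set bit
+ * remains in [6, 64). Searching for a clear bit inverts each word first so
+ * that ctzll() serves both polarities.
+ */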
+
+/*
+ * Search for the first free (clear) bit within `info's free xattr fext bitmap.
+ * Return false if no bits are clear.
+ * If such a bit is found, its index is written to `*out' and true is returned.
+ */
+static bool
+xbm_find_free(const xattr_io_info_t *info, size_t *out)
+{
+ return bm_find(info->free_bm, 0, xbm_size(info), false, out);
+}
+
+/*
+ * Set the bit at index `i' within `info's underlying free xattr fext bitmap.
+ *
+ * info->lock must be held.
+ * It only makes sense to operate on one bit at a time, so this wraps
+ * bitmap_set_range().
+ */
+static void
+xbm_set_used(xattr_io_info_t *info, size_t i)
+{
+ xattr_assert(xbm_valid_index(info, i));
+ bitmap_set_range(info->free_bm, i, 1);
+}
+
+/*
+ * Clear the bit at index `i' within `info's underlying free xattr fext bitmap.
+ * This is the opposite of xbm_set_used().
+ */
+static void
+xbm_clear_used(xattr_io_info_t *info, size_t i)
+{
+ xattr_assert(xbm_valid_index(info, i));
+ bitmap_clear_range(info->free_bm, i, 1);
+}
+
+/*
+ * Return whether the given bitmap index is a valid index into `info's free
+ * bitmap.
+ */
+static bool
+xbm_valid_index(const xattr_io_info_t *info, int64_t i)
+{
+ return bm_valid_index(i, xbm_size(info));
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(A) (sizeof(A) / sizeof(A[0]))
+#endif
+
+/*
+ * Return the total number of *bits* in `info's free xattr fext bitmap.
+ */
+static size_t
+xbm_size(const xattr_io_info_t *info)
+{
+ // one bit per xattr fext
+ return ARRAY_SIZE(info->xattr_fexts);
+}
+
+/*
+ * Factory for an iterator over all the set (true value) bits in `bitmap'.
+ * `len' is the length in *bits* of `bitmap'.
+ *
+ * The iterator is created in an uninitialized state, a call to xbm_iter_next()
+ * is required to find the first set bit (this is different from
+ * make_fext_iterator()). As a consequence of this, an iterator may iterate
+ * zero times if no bits within `bitmap' are set. After each successful call to
+ * xbm_iter_next(), xbm_iter_peek() will return the zero-based index of each
+ * set bit.
+ *
+ * The intended use is along the lines of:
+ * uint64_t *bm = ...; // a bitmap 123 bits (two uint64_ts) long
+ * xbm_iterator_t iter = xbm_make_iter(bm, 123);
+ * while(xbm_iter_next(&iter)) {
+ * size_t i = xbm_iter_peek(&iter);
+ * printf("index %lu is set\n", i);
+ * }
+ *
+ * In terms of the iterator internals, we hold the invariant that a valid
+ * iterator always has `i' in [0, len). A valid iterator is one for which
+ * xbm_iter_peek() will return an index in range of `bitmap'. To bootstrap the
+ * first iteration, `i' is set to SIZE_MAX; further details are in
+ * xbm_iter_next().
+ */
+static xbm_iter_t
+xbm_make_iter(const uint64_t *bitmap, size_t len)
+{
+ xattr_assert(len);
+ return (xbm_iter_t) {
+ .bitmap = bitmap,
+ .i = SIZE_MAX, // This will overflow-to-zero on the first call to xbm_iter_next()
+ .len = len
+ };
+}
+
+/*
+ * Advance `iter' to internally reference the next set bit within the bitmap.
+ * If there are no more set bits, this returns false.
+ *
+ * On success, the index of the next set bit can be retrieved by
+ * xbm_iter_peek().
+ *
+ * Internally, this searches for the first bit set at bit index >= iter->i+1.
+ * For a new iterator, the value of `i' is initialized to SIZE_MAX so `i'+1
+ * will unsigned integer overflow (which is well defined) to zero.
+ */
+static bool
+xbm_iter_next(xbm_iter_t *iter)
+{
+ size_t i;
+ // find the next set bit > `i'; if `i'+1 == len the bitmap is exhausted and
+ // bm_find()'s `from' precondition would not hold, so short-circuit
+ const bool found = (iter->i + 1 < iter->len) &&
+ bm_find(iter->bitmap, iter->i + 1, iter->len, true, &i);
+
+ // if no bit is found, invalidate the iterator by setting i=len
+ iter->i = found ? i : iter->len;
+ return found;
+}
+
+/*
+ * Return the index of the set bit found by a successful call to
+ * xbm_iter_next().
+ */
+static size_t
+xbm_iter_peek(const xbm_iter_t *iter)
+{
+ xattr_assert(iter->i < iter->len);
+ return iter->i;
+}
+
+/*
+ * Search bitmap `bm' for the first bit equal to `value' in the bit index
+ * range [from, len). Whether such a bit exists is returned and if
+ * that's the case, its bit index is written via `out'.
+ *
+ * This is just a fancy wrapper around bitmap_range_find_first().
+ */
+static bool
+bm_find(const uint64_t *bm, size_t from, size_t len, bool value, size_t *out)
+{
+ xattr_assert(bm_valid_index(from, len));
+
+ // search for `value' in [from, len); bitmap_range_find_first() takes a
+ // *count* of bits rather than an end index, so pass `len - from'
+ int64_t i;
+ if (!bitmap_range_find_first(value, bm, from, len - from, &i)) {
+ return false;
+ }
+
+ // some bit found; check the returned index is valid
+ xattr_assert(bm_valid_index(i, len));
+ *out = (size_t)i;
+ return true;
+}
+
+/*
+ * Return true if `i' is a valid index into a bitmap of `len' bits.
+ *
+ * The underlying bitmap_ routines operate on `int64_t' indices. This helper
+ * exists mainly to convert safely between those and `size_t'.
+ */
+static bool
+bm_valid_index(int64_t i, size_t len)
+{
+ return (i >= 0) && ((uint64_t)i < len);
+}
+
+/*
+ * Virtualize `uio' offset to target xattr `fext' before a call to
+ * cluster_xattr().
+ *
+ * The computation of the IO offset is somewhat subtle. The reason for this
+ * fact largely has to do with how exactly the single xattr vnode
+ * (hfsmp->hfs_attrdata_vp) caches data for multiple xattrs. First,
+ * some discussion on the motivation for the single xattr vnode design. At the
+ * top level, xattr apis are quite different from normal file data apis. Some
+ * key properties are:
+ * - xattr IO apis do not support editing or random reads
+ * - xattrs may not be mmapped
+ * - any one file may have an arbitrary number of xattrs
+ * To contrast with a normal file, each file has a corresponding vnode which in
+ * turn has its own private ubc. The only way in which xattrs are actually like
+ * files is that they have the same size limits and, in terms of their
+ * implementation, use the same on-disk structures.
+ * The result of this is that it would be too high overhead to have one vnode
+ * per xattr; instead, we maintain a disk block-type cache for xattr data.
+ * This cache is implemented as the ubc of a virtual file known as the single
+ * xattr vnode. Reads and writes are serviced by the cluster layer. The cluster
+ * layer operates in units of the vm pagesize. On a system for which
+ * pagesize > fs blocksize, the naïve approach of using an identity mapping
+ * between ubc logical offset and device offset poses a challenge.
+ * Consider the following scenario:
+ * - 16k vm pagesize
+ * - 4k fs blocksize
+ *
+ * On disk, we have two xattrs that reside on adjacent blocks:
+ * xattr A xattr B
+ * [aaaa|aaaa|bbbb|bbbb]
+ * pbn 4 5 6 7 8
+ *
+ * Suppose we want to just read xattr A -- pbn 4 and 5 -- the cluster layer can
+ * issue just an 8k IO, but it will store it in memory as a whole page, so we
+ * would result in
+ *
+ * xattr A xattr B
+ * in memory:
+ * [aaaa|aaaa|0000|0000]
+ *
+ * on disk:
+ * [aaaa|aaaa|bbbb|bbbb]
+ * pbn 4 5 6 7 8
+ *
+ * A subsequent read for pbn 6 or 7, as a part of xattr B, will find the page
+ * already cached in the ubc and erroneously return zeros.
+ * Instead, we could have the cluster layer issue the full 16k IO, but then we
+ * run into encryption issues on per-file (really per-xattr) key volumes:
+ *
+ * xattr A xattr B
+ * in memory:
+ * [aaaa|aaaa|O!#W|JF%R]
+ *
+ * on disk:
+ * [asdf|ghjk|ZXCV|BNM,] encrypted
+ * [aaaa|aaaa|bbbb|bbbb] unencrypted
+ * pbn 4 5 6 7 8
+ *
+ * In this case, the issue is that we have the crypto state available to read
+ * xattr A, but we do not have the crypto state for xattr B, so we would
+ * incorrectly decrypt pbn 6, 7 in the same IO.
+ *
+ * The solution to this is to use a scaled mapping between ubc logical offset
+ * and device offset. In this case, we use
+ * logical offset = physical block number * pagesize
+ * and the result looks like
+ *
+ * xattr A xattr B
+ * in memory:
+ * [aaaa|aaaa|0000|0000] ... [bbbb|bbbb|0000|0000]
+ * 64k 96k
+ *
+ * on disk:
+ * [asdf|ghjk|ZXCV|BNM,] encrypted
+ * [aaaa|aaaa|bbbb|bbbb] unencrypted
+ * pbn 4 5 6 7 8
+ *
+ * In memory, xattr A occupies the virtual range of [64k, 96k), but its
+ * contents are representable in the first 8k out of [64k, 80k). Note that the
+ * mapping here is not per block but rather per xattr file extent. The contents
+ * tracked by an individual file extent are logically contiguous in memory. In
+ * the example above, xattr A has one file extent spanning [0, 8k). Suppose it
+ * instead had two file extents -- [0, 4k) at pbn 4 and [4k, 8k) at pbn 5 --
+ * the above diagram would instead look like
+ * in memory:
+ * [aaaa|0000|0000|0000][aaaa|0000|0000|0000]
+ * 64k 80k
+ * on disk:
+ * [aaaa|aaaa] unencrypted
+ * pbn 4 5
+ *
+ * This scaled mapping approach guarantees that xattrs are always on different
+ * pages from other xattrs, but it comes at an increased memory cost for
+ * xattrs whose size is not a multiple of the pagesize.
+ * --
+ *
+ * If `fext' is not representable as a virtual offset (e.g. its startBlock is
+ * corrupt), this function returns false.
+ */
+static bool
+uio_set_fext(uio_t uio, const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize)
+{
+ uint64_t off;
+ if (!fext2cluster_check(fext, fs_bsize, &off)) {
+ // `fext' is out of range
+ return false;
+ }
+
+ uio_setoffset(uio, off);
+ return true;
+}
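+
+/*
+ * A sketch of how the pieces above combine on a read path (hypothetical
+ * caller; the surrounding names and the cluster_read() call are assumptions,
+ * and error handling is elided):
+ *
+ * if (!uio_set_fext(uio, fext, fs_bsize)) {
+ * return EILSEQ; // extent not representable; likely corrupt
+ * }
+ * xattr_fext_set(info, xfext, fext);
+ * error = cluster_read(xattr_vp, uio, filesize, 0);
+ * xattr_fext_clear(info, xfext);
+ */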
+#endif // NEW_XATTR