diff --git a/core/hfs_xattr.c b/core/hfs_xattr.c
index 311b901..4dba366 100644
--- a/core/hfs_xattr.c
+++ b/core/hfs_xattr.c
@@ -98,6 +98,91 @@ static int has_overflow_extents(HFSPlusForkData *forkdata);
 
 static int count_extent_blocks(int maxblks, HFSPlusExtentRecord extents);
 
+#if NEW_XATTR
+
+/*
+ * Iterator over set bits within an xattr free fext bitmap.
+ *
+ * See xbm_make_iter() for details on creation.
+ */
+typedef struct {
+	const uint64_t *bitmap;
+	size_t i;
+	size_t len;
+} xbm_iter_t;
+
+// xattr IO subsystem assertion
+// (wrapped as do/while so the macro expands to a single statement and
+// cannot capture a dangling `else' at its expansion site)
+#define xattr_assert(cond) do { \
+	if (!(cond)) \
+		panic("xattr_assert() failed: %s", #cond); \
+} while(0)
+
+#define xattr_infomsg(...) // errmsg(__VA_ARGS__)
+
+/*
+ * Purely for documentation purposes, simulate C++ const_cast<>() in the C
+ * language.
+ * This makes it quite obvious that the `const' part of a cast is being removed
+ * to work around an API that accepts a non-const pointer but otherwise has no
+ * reason to modify its value.
+ * E.g.
+ *
+ *	ancient_sum(a, count)
+ *	int *a; int count;
+ *	{
+ *		int s;
+ *		s = 0;
+ *		while (count--) {
+ *			s = s + *a++;
+ *		}
+ *		return s;
+ *	}
+ *
+ *	int modern_sum(const int *a, int count) {
+ *		return ancient_sum(const_cast(int *, a), count);
+ *	}
+ */
+#define const_cast(type, expr) ((type)(expr))
+
+static uint64_t fext2cluster(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize);
+static bool cluster_off_in_fext(uint64_t off, const HFSPlusExtentDescriptor *fext,
+		uint32_t fs_bsize);
+
+static bool fext2cluster_check(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize,
+		uint64_t *out);
+static uint32_t xattr_cluster_scale(uint32_t fs_bsize);
+
+// xattr fext thread local storage routines
+static const HFSPlusExtentDescriptor **_Nullable xattr_fext_alloc(
+		xattr_io_info_t *info) __attribute__((warn_unused_result));
+static void xattr_fext_free(xattr_io_info_t *info,
+		const HFSPlusExtentDescriptor **xfext);
+static void xattr_fext_set(xattr_io_info_t *info,
+		const HFSPlusExtentDescriptor **xfext, const HFSPlusExtentDescriptor *fext);
+static void xattr_fext_clear(xattr_io_info_t *info,
+		const HFSPlusExtentDescriptor **xfext);
+
+static size_t xattr_fext_index(const xattr_io_info_t *info,
+		const HFSPlusExtentDescriptor **xfext);
+
+// xattr fext free bitmap routines, namespace `xbm_'
+static bool xbm_find_free(const xattr_io_info_t *info, size_t *out);
+static void xbm_set_used(xattr_io_info_t *info, size_t i);
+static void xbm_clear_used(xattr_io_info_t *info, size_t i);
+
+static bool xbm_valid_index(const xattr_io_info_t *info, int64_t i);
+static size_t xbm_size(const xattr_io_info_t *info);
+
+// bitmap iterator functions
+static xbm_iter_t xbm_make_iter(const uint64_t *bitmap, size_t len);
+static bool xbm_iter_next(xbm_iter_t *ier);
+static size_t xbm_iter_peek(const xbm_iter_t *ier);
+
+// bitmap_ wrappers under namespace bm_
+static bool bm_find(const uint64_t *bm,
+		size_t from, size_t len, bool value, size_t *out);
+static bool bm_valid_index(int64_t i, size_t len);
+#endif
+
 #if NAMEDSTREAMS
 /*
  * Obtain the vnode for a stream.
@@ -1080,12 +1165,6 @@ int hfs_setxattr_internal (struct cnode *cp, const void *data_ptr, size_t attrsi
 
 	/* If it won't fit inline then use extent-based attributes.
 	 */
 	if (attrsize > hfsmp->hfs_max_inline_attrsize) {
-#if (TARGET_OS_OSX && TARGET_CPU_ARM64)
-		printf("hfs_setxattr: non-inline attributes are not supported\n");
-		//NOTE: ENOTSUP will fool XNU into thinking we need AppleDouble files...
-		result = EPERM;
-		goto exit;
-#else
 		int blkcnt;
 		int extentblks;
 		u_int32_t *keystartblk;
@@ -1186,7 +1265,6 @@ int hfs_setxattr_internal (struct cnode *cp, const void *data_ptr, size_t attrsi
 			extentblks = count_extent_blocks(blkcnt, recp->overflowExtents.extents);
 			blkcnt -= extentblks;
 		}
-#endif //(TARGET_OS_OSX && TARGET_CPU_ARM64)
 	} else { /* Inline data */
 		if (exists) {
 			result = remove_attribute_records(hfsmp, iterator);
@@ -2379,11 +2457,675 @@ int init_attrdata_vnode(struct hfsmount *hfsmp)
 			&cat_fork, &vp, &newvnode_flags);
 	if (result == 0) {
 		hfsmp->hfs_attrdata_vp = vp;
+#if NEW_XATTR
+		vnode_setnoreadahead(hfsmp->hfs_attrdata_vp);
+#endif
 		hfs_unlock(VTOC(vp));
 	}
 	return (result);
 }
 
+/* The following code (inside NEW_XATTR) was ported from apfs. */
+#if NEW_XATTR
+/*
+ * This is the same as fext2cluster_check(), except that a failure to
+ * translate is asserted against.
+ *
+ * This is useful for places which hold the invariant that `fext' is already
+ * representable when translated but additionally want to assert that that is
+ * the case.
+ */
+static uint64_t
+fext2cluster(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize)
+{
+	uint64_t off;
+	const bool ok = fext2cluster_check(fext, fs_bsize, &off);
+	xattr_assert(ok);
+	return off;
+}
+
+/*
+ * Translate `fext' to a cluster layer virtual offset, set via `out', that is
+ * suitable for IO to the single xattr vnode.
+ *
+ * For any particular file extent, this will map to a unique logical offset
+ * that may be used to key into the ubc. The returned value has the property
+ * that file extents on adjacent physical blocks will always be mapped to
+ * different page sized multiples. Internally, this just multiplies by the
+ * larger of the pagesize and the blocksize (see xattr_cluster_scale() for the
+ * derivation). For further details on the importance of this in the overall
+ * scheme of xattr IO, see uio_set_fext().
+ *
+ * This returns true if `fext' is representable as a cluster layer virtual
+ * offset. It may return false for corrupted file extents:
+ * - extents with a zero start block
+ * - extents that likely extend beyond the size of the underlying drive
+ *
+ * The second point should not happen under normal circumstances even for large
+ * drives. Large drives (by the logic in hfs_newfs()) are automatically
+ * formatted to use large block sizes: drives sized >= 16TB use 16kiB fs
+ * blocksize, at which point the virtual offset computed is equal to the device
+ * offset (even on a 16kiB pagesize system).
+ * So as long as a drive does not exceed 2^63 bytes in capacity (which is the
+ * precision of `off_t'), the internal multiplication should not overflow.
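+ *
+ * As a worked example (numbers illustrative, not from this file): with a
+ * 4kiB fs blocksize on a 16kiB pagesize system, xattr_cluster_scale() is
+ * 16kiB, so a fext with startBlock == 5 and blockCount == 2 translates to
+ *
+ *	off = 5 * 16384 = 81920		// page aligned, unique per block
+ *
+ * and the remaining checks below verify that 81920 + 2 * 4096 neither
+ * overflows nor exceeds INT64_MAX.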
+ */
+static bool
+fext2cluster_check(const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize,
+		uint64_t *out)
+{
+	const uint64_t pbn = fext->startBlock;
+
+	// xattr has an invalid start block
+	if (!pbn) {
+		return false;
+	}
+
+	// scale pbn
+	uint64_t off;
+	if (__builtin_mul_overflow(pbn, xattr_cluster_scale(fs_bsize), &off)) {
+		return false;
+	}
+
+	*out = off;
+
+	// the whole fext should be in range
+	// (widen blockCount before the multiply so it cannot wrap at 32 bits)
+	if (__builtin_add_overflow(off, (uint64_t)fext->blockCount * fs_bsize, &off)) {
+		return false;
+	}
+
+	// don't exceed signed 64bit precision
+	if (off > INT64_MAX) {
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Return the scale factor for translating xattr physical block numbers into
+ * a logical offset into the single xattr vnode's ubc.
+ *
+ * Translated blocks have two key requirements:
+ * - They must not overlap.
+ *   Otherwise two different blocks' contents will collide.
+ * - They must not fall within the same page.
+ *   Otherwise reading a whole page may pull in multiple xattr's blocks, but
+ *   only decrypt with one of the xattr's keys.
+ *
+ * A table of all possible configurations:
+ *   pagesize, fs blocksize, scale
+ *   4k,       4k,           4k
+ *   4k,       8k,           8k
+ *   ...
+ *   4k,       64k,          64k
+ *   16k,      4k,           16k
+ *   16k,      8k,           16k
+ *   16k,      16k,          16k
+ *   16k,      32k,          32k
+ *   16k,      64k,          64k
+ *
+ * This may be expressed as
+ *   scale = max(pagesize, fs blocksize)
+ */
+static uint32_t
+xattr_cluster_scale(uint32_t fs_bsize)
+{
+	return MAX(PAGE_SIZE, fs_bsize);
+}
+
+/*
+ * See off_in_range().
+ */
+static bool
+off_in_range2(uint64_t off, uint64_t start, uint64_t len)
+{
+	return (start <= off) && (off < (start + len));
+}
+
+/*
+ * Return true if `off' returned from fext2cluster() was produced from
+ * `fext'.
+ */
+static bool
+cluster_off_in_fext(uint64_t off, const HFSPlusExtentDescriptor *fext,
+		uint32_t fs_bsize)
+{
+	return off_in_range2(off, fext2cluster(fext, fs_bsize),
+			(uint64_t)fext->blockCount * fs_bsize);
+}
+
+/*
+ * Allocate space to save a file extent reference for xattr IO.
+ *
+ * This provides a mechanism to communicate file extents from top level, stream
+ * based xattr IO functions down into lower level IO functions
+ * (_blockmap() and _strategy()).
+ *
+ * This doesn't really return a file extent; it returns a reference into
+ * `info->xattr_fexts' which may be pointed to a file extent reference.
+ * Alternatively this could just return an integral index, but then we'd
+ * need some way to signal failure.
+ *
+ * Note: the returned reference cannot be assigned directly; it must be set
+ * via xattr_fext_set() to correctly synchronize with a racing call to
+ * xattr_fext_find().
+ *
+ * This call will not block; it will return NULL if no free slots are
+ * available. On success, follow with a call to xattr_fext_free().
+ *
+ * In terms of the implementation, this is a basic zone allocator for thread
+ * local storage in disguise. It supports one file extent slot for each of up
+ * to 64 concurrent threads.
+ * For further details, see the documentation for each field above the
+ * definition of `xattr_io_info_t'.
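+ *
+ * A sketch of the intended call pattern (it mirrors the loops in
+ * read_attr_data() and write_attr_data() below):
+ *
+ *	const HFSPlusExtentDescriptor **xfext;
+ *	if (!(xfext = xattr_fext_alloc(&hfsmp->hfs_xattr_io)))
+ *		return ENOMEM;
+ *	xattr_fext_set(&hfsmp->hfs_xattr_io, xfext, fext);	// stage for IO
+ *	// ... cluster IO against the single xattr vnode ...
+ *	xattr_fext_clear(&hfsmp->hfs_xattr_io, xfext);		// unstage
+ *	xattr_fext_free(&hfsmp->hfs_xattr_io, xfext);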
+ */
+static const HFSPlusExtentDescriptor **
+xattr_fext_alloc(xattr_io_info_t *info)
+{
+	const HFSPlusExtentDescriptor **ret;
+	size_t i;
+
+	// search for the first free bit
+	lck_spin_lock(&info->lock);
+	if (!xbm_find_free(info, &i)) {
+		// no free fexts
+		ret = NULL;
+		goto out;
+	}
+
+	// mark that position as allocated
+	xbm_set_used(info, i);
+	ret = &info->xattr_fexts[i];
+	xattr_assert(!*ret);
+
+out:
+	lck_spin_unlock(&info->lock);
+	return ret;
+}
+
+/*
+ * Release an xattr fext reference returned from xattr_fext_alloc().
+ * This simply amounts to clearing the bit within `info->free_bm' that
+ * corresponds to `xfext'. While not strictly necessary, we also clear out the
+ * xattr fext itself to hold the invariant that a clear bit within the free
+ * bitmap has a corresponding NULL fext reference.
+ */
+static void
+xattr_fext_free(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+	lck_spin_lock(&info->lock);
+	const size_t i = xattr_fext_index(info, xfext);
+	xbm_clear_used(info, i);
+	info->xattr_fexts[i] = NULL;
+	lck_spin_unlock(&info->lock);
+}
+
+/*
+ * Given an allocated xattr fext from xattr_fext_alloc(), assign it to
+ * reference `fext'. A copy of this fext may be returned by a subsequent call
+ * to xattr_fext_find().
+ *
+ * This may be called multiple times for the same value of `xfext'.
+ * `fext' will be borrowed until a subsequent call to xattr_fext_set() for a
+ * different file extent or xattr_fext_free() for `xfext'. It must have a
+ * lifetime that spans at least from when it is first set to when it is
+ * cleared, either by xattr_fext_free() or xattr_fext_clear().
+ *
+ * Note: `fext' may be introspected by other threads via xattr_fext_find()
+ * (and in terms of getxattr(), two threads may use each other's file extents
+ * if they race to read the same xattr).
+ */
+static void
+xattr_fext_set(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext,
+		const HFSPlusExtentDescriptor *fext)
+{
+	xattr_assert(xbm_valid_index(info, xattr_fext_index(info, xfext)));
+	lck_spin_lock(&info->lock);
+	*xfext = fext;
+	lck_spin_unlock(&info->lock);
+}
+
+/*
+ * Given a cluster layer virtual offset, attempt to look up a file extent set
+ * via a previous call to xattr_fext_set().
+ *
+ * If such a fext is found, its value is copied to `out' and true is returned.
+ * Note: `out' should reference wired memory: it will be stored to while a spin
+ * lock is held; accesses must not fault.
+ *
+ * `off_out' will contain the "unvirtualized" offset.
+ */
+bool
+hfs_xattr_fext_find(xattr_io_info_t *info, uint32_t fs_bsize, uint64_t off,
+		HFSPlusExtentDescriptor *out, uint64_t *off_out)
+{
+	bool found = false;
+	lck_spin_lock(&info->lock);
+
+	// search through all in-use fexts
+	xbm_iter_t iter = xbm_make_iter(info->free_bm, xbm_size(info));
+	while(xbm_iter_next(&iter)) {
+		const HFSPlusExtentDescriptor *fext = info->xattr_fexts[xbm_iter_peek(&iter)];
+		if (!fext || !cluster_off_in_fext(off, fext, fs_bsize)) {
+			continue;
+		}
+		// `off' intersects; return `fext'
+		*out = *fext;
+		found = true;
+		break;
+	}
+
+	lck_spin_unlock(&info->lock);
+
+	if (found) {
+		*off_out = ((uint64_t)out->startBlock * fs_bsize) + off - fext2cluster(out, fs_bsize);
+	}
+
+	return found;
+}
+
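+/*
+ * Illustrative (hypothetical) consumer of hfs_xattr_fext_find(); the real
+ * callers are the lower level blockmap/strategy paths, not this file. Given
+ * a virtual ubc offset `virt_off' produced by uio_set_fext(), recover the
+ * fext it targets and the device-relative byte offset for the IO:
+ *
+ *	HFSPlusExtentDescriptor fext;
+ *	uint64_t disk_off;
+ *	if (hfs_xattr_fext_find(&hfsmp->hfs_xattr_io, hfsmp->blockSize,
+ *			virt_off, &fext, &disk_off)) {
+ *		// `disk_off' addresses the same byte as `virt_off', on disk
+ *	}
+ */
+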
+/*
+ * Given an allocated xattr fext, clear its reference to any `fext' passed to a
+ * previous call to xattr_fext_set().
+ *
+ * This will end the lifetime of such a fext and prevent xattr_fext_find() from
+ * taking a reference to it. From here, its backing memory can be deallocated.
+ *
+ * Unlike xattr_fext_free(), `xfext' will remain allocated and may be passed to
+ * xattr_fext_set() again.
+ */
+static void
+xattr_fext_clear(xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+	xattr_assert(xbm_valid_index(info, xattr_fext_index(info, xfext)));
+	lck_spin_lock(&info->lock);
+	*xfext = NULL;
+	lck_spin_unlock(&info->lock);
+}
+
+/*
+ * For an xattr file extent, `xfext', returned from a previous call to
+ * xattr_fext_alloc(), return its index within `info->xattr_fexts'.
+ */
+static size_t
+xattr_fext_index(const xattr_io_info_t *info, const HFSPlusExtentDescriptor **xfext)
+{
+	xattr_assert((info->xattr_fexts <= xfext) &&
+			(xfext < &info->xattr_fexts[xbm_size(info)]));
+	return ((uintptr_t)xfext - (uintptr_t)info->xattr_fexts) / sizeof(*xfext);
+}
+
+// set `count' bits within `bm', starting at bit index `index'
+static void
+bitmap_set_range(uint64_t *bm, int64_t index, int64_t count)
+{
+	int dstshift0, dstshift1;
+	uint64_t dstmask0, dstmask1;
+	int64_t bmi;
+
+	dstshift0 = index % 64;
+	dstshift1 = 64 - (index % 64);
+	dstmask0 = ~0ULL << (index % 64);
+	dstmask1 = (dstshift1 == 64) ? 0ULL : (~0ULL >> (64 - (index % 64)));
+
+	bmi = index / 64;
+	while (count >= 64) {
+		bm[bmi] = (bm[bmi] & ~dstmask0) | ((~0ULL << dstshift0) & dstmask0);
+		if (dstmask1)
+			bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((~0ULL >> dstshift1) & dstmask1);
+		bmi++;
+		count -= 64;
+	}
+	if (count) {
+		// adjust dstmask to cover just the bits remaining
+		dstmask0 = ((1ULL << count) - 1) << (index % 64);
+		dstmask1 = (dstshift1 == 64) ? 0ULL : (((1ULL << count) - 1) >> (64 - (index % 64)));
+		bm[bmi] = (bm[bmi] & ~dstmask0) | ((~0ULL << dstshift0) & dstmask0);
+		if ((count > (64 - dstshift0)) && dstmask1)
+			bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((~0ULL >> dstshift1) & dstmask1);
+	}
+}
+
+// clear `count' bits within `bm', starting at bit index `index'
+static void
+bitmap_clear_range(uint64_t *bm, int64_t index, int64_t count)
+{
+	int dstshift0, dstshift1;
+	uint64_t dstmask0, dstmask1;
+	int64_t bmi;
+
+	dstshift0 = index % 64;
+	dstshift1 = 64 - (index % 64);
+	dstmask0 = ~0ULL << (index % 64);
+	dstmask1 = (dstshift1 == 64) ? 0ULL : (~0ULL >> (64 - (index % 64)));
+
+	bmi = index / 64;
+	while (count >= 64) {
+		bm[bmi] = (bm[bmi] & ~dstmask0) | ((0ULL << dstshift0) & dstmask0);
+		if (dstmask1)
+			bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((0ULL >> dstshift1) & dstmask1);
+		bmi++;
+		count -= 64;
+	}
+	if (count) {
+		// adjust dstmask to cover just the bits remaining
+		dstmask0 = ((1ULL << count) - 1) << (index % 64);
+		dstmask1 = (dstshift1 == 64) ? 0ULL : (((1ULL << count) - 1) >> (64 - (index % 64)));
+		bm[bmi] = (bm[bmi] & ~dstmask0) | ((0ULL << dstshift0) & dstmask0);
+		if ((count > (64 - dstshift0)) && dstmask1)
+			bm[bmi + 1] = (bm[bmi + 1] & ~dstmask1) | ((0ULL >> dstshift1) & dstmask1);
+	}
+}
+
+// count trailing zeros, defined for zero (unlike the raw builtin)
+static int
+ctzll(uint64_t val)
+{
+	return (val == 0) ? 64 : __builtin_ctzll(val);
+}
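+
+/*
+ * Worked examples for the bitmap primitives above (values illustrative):
+ *
+ *	uint64_t bm[1] = { 0 };
+ *	bitmap_set_range(bm, 5, 3);	// bm[0] == 0xe0 (bits 5..7 set)
+ *	bitmap_clear_range(bm, 6, 1);	// bm[0] == 0xa0 (bit 6 cleared)
+ *	ctzll(0xa0);			// 5, the index of the lowest set bit
+ *	ctzll(0);			// 64, rather than undefined behavior
+ *
+ * In this file the range routines are only ever invoked with count == 1,
+ * via xbm_set_used() and xbm_clear_used().
+ */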
+
+// search forwards through range and return index of first "bit" (0 or 1) encountered
+static int
+bitmap_range_find_first(int bit, uint64_t *bm, int64_t index, int64_t count, int64_t *clear_index)
+{
+	int64_t bmi, check_count;
+	uint64_t val;
+	int pos;
+
+	bmi = index / 64;
+	while (count > 0) {
+		check_count = 64 - (index % 64);
+		if (check_count > count)
+			check_count = count;
+		val = bm[bmi] >> (index % 64);
+		if (!bit)
+			val = ~val;
+		pos = ctzll(val);
+		if (pos < check_count) {
+			*clear_index = index + pos;
+			return 1;
+		}
+		index += check_count;
+		count -= check_count;
+		bmi++;
+	}
+	return 0;
+}
+
+/*
+ * Search for the first free (clear) bit within `info's free xattr fext bitmap.
+ * Return false if no bits are clear.
+ * If some bit is found, its index is set to `*out' and true is returned.
+ */
+static bool
+xbm_find_free(const xattr_io_info_t *info, size_t *out)
+{
+	return bm_find(info->free_bm, 0, xbm_size(info), false, out);
+}
+
+/*
+ * Set the bit at index `i' within `info's underlying free xattr fext bitmap.
+ *
+ * info->lock must be held.
+ * It only makes sense to operate on one bit at a time, so this wraps
+ * bitmap_set_range().
+ */
+static void
+xbm_set_used(xattr_io_info_t *info, size_t i)
+{
+	xattr_assert(xbm_valid_index(info, i));
+	bitmap_set_range(info->free_bm, i, 1);
+}
+
+/*
+ * Clear the bit at index `i' within `info's underlying free xattr fext bitmap.
+ * This is the opposite of xbm_set_used(); info->lock must likewise be held.
+ */
+static void
+xbm_clear_used(xattr_io_info_t *info, size_t i)
+{
+	xattr_assert(xbm_valid_index(info, i));
+	bitmap_clear_range(info->free_bm, i, 1);
+}
+
+/*
+ * Return whether the given bitmap index is a valid index into `info's free
+ * bitmap.
+ */
+static bool
+xbm_valid_index(const xattr_io_info_t *info, int64_t i)
+{
+	return bm_valid_index(i, xbm_size(info));
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(A) (sizeof(A) / sizeof(A[0]))
+#endif
+
+/*
+ * Return the total number of *bits* in `info's free xattr fext bitmap.
+ */
+static size_t
+xbm_size(const xattr_io_info_t *info)
+{
+	// one bit per xattr fext
+	return ARRAY_SIZE(info->xattr_fexts);
+}
+
+/*
+ * Factory for an iterator over all the set (true value) bits in `bitmap'.
+ * `len' is the length in *bits* of `bitmap'.
+ *
+ * The iterator is created in an uninitialized state; a call to xbm_iter_next()
+ * is required to find the first set bit (this is different from
+ * make_fext_iterator()). As a consequence of this, an iterator may iterate
+ * zero times if no bits within `bitmap' are set. After each successful call to
+ * xbm_iter_next(), xbm_iter_peek() will return the zero-based index of each
+ * set bit.
+ *
+ * The intended use is along the lines of:
+ *	uint64_t *bm = ...;	// a bitmap 123 bits (two uint64_ts) long
+ *	xbm_iter_t iter = xbm_make_iter(bm, 123);
+ *	while(xbm_iter_next(&iter)) {
+ *		size_t i = xbm_iter_peek(&iter);
+ *		printf("index %zu is set\n", i);
+ *	}
+ *
+ * In terms of the iterator internals, we hold the invariant that a valid
+ * iterator always has `i' in [0, len). A valid iterator is one for which
+ * xbm_iter_peek() will return an index in range of `bitmap'. To bootstrap the
+ * first iteration, `i' is set to SIZE_MAX; further details are in
+ * xbm_iter_next().
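+ *
+ * Concretely (values illustrative): for a single-word bitmap with value
+ * 0xa (binary 1010) and len == 4, xbm_iter_next() succeeds twice --
+ * xbm_iter_peek() returns 1, then 3 -- and a third call returns false,
+ * leaving the iterator invalidated with i == len.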
+ */
+static xbm_iter_t
+xbm_make_iter(const uint64_t *bitmap, size_t len)
+{
+	xattr_assert(len);
+	return (xbm_iter_t) {
+		.bitmap = bitmap,
+		.i = SIZE_MAX, // This will overflow-to-zero on the first call to xbm_iter_next()
+		.len = len
+	};
+}
+
+/*
+ * Advance `iter' to internally reference the next set bit within the bitmap.
+ * If there are no more set bits, this returns false.
+ *
+ * On success, the index of the next set bit can be retrieved by
+ * xbm_iter_peek().
+ *
+ * Internally, this searches for the first bit set at bit index >= iter->i+1.
+ * For a new iterator, the value of `i' is initialized to SIZE_MAX so `i'+1
+ * will unsigned integer overflow (which is well defined) to zero.
+ */
+static bool
+xbm_iter_next(xbm_iter_t *iter)
+{
+	size_t i;
+	// find the next set bit > `i'
+	const bool found = bm_find(iter->bitmap, iter->i + 1, iter->len, true, &i);
+
+	// if no bit is found, invalidate the iterator by setting i=len
+	iter->i = found ? i : iter->len;
+	return found;
+}
+
+/*
+ * Return the index of the set bit found by the last successful call to
+ * xbm_iter_next().
+ */
+static size_t
+xbm_iter_peek(const xbm_iter_t *iter)
+{
+	xattr_assert(iter->i < iter->len);
+	return iter->i;
+}
+
+/*
+ * Search bitmap `bm' for the first bit with `value' at bit index >= `from',
+ * where `len' is the total length of the bitmap in *bits*. Whether such a bit
+ * exists is returned and, if so, its bit index is written via `out'.
+ *
+ * This is just a fancy wrapper around bitmap_range_find_first().
+ */
+static bool
+bm_find(const uint64_t *bm, size_t from, size_t len, bool value, size_t *out)
+{
+	if (from >= len) {
+		// nothing left to search; this happens when an iterator has
+		// already consumed a set bit in the final position
+		return false;
+	}
+
+	// search for `value' in [from, len); pass `len - from' as the bit
+	// count so the search cannot run past the end of the bitmap
+	int64_t i;
+	if (!bitmap_range_find_first(value,
+			const_cast(uint64_t *, bm), from, len - from, &i)) {
+		return false;
+	}
+
+	// some bit found; check the returned index is valid
+	xattr_assert(bm_valid_index(i, len));
+	*out = (size_t)i;
+	return true;
+}
+
+/*
+ * Return true if `i' is a valid index into a bitmap of `len' bits.
+ *
+ * The underlying bitmap_ routines operate on `int64_t' indices. This is
+ * mainly to safely convert to `size_t'.
+ */
+static bool
+bm_valid_index(int64_t i, size_t len)
+{
+	return (i >= 0) && ((uint64_t)i < len);
+}
+
+/*
+ * Virtualize `uio' offset to target xattr `fext' before a call to
+ * cluster_xattr().
+ *
+ * The computation of the IO offset is somewhat subtle. The reason for this
+ * largely has to do with how exactly the single xattr vnode
+ * (hfsmp->hfs_attrdata_vp) caches data for multiple xattrs. First,
+ * some discussion on the motivation for the single xattr vnode design. At the
+ * top level, xattr apis are quite different from normal file data apis. Some
+ * key properties are:
+ * - xattr IO apis do not support editing or random reads
+ * - xattrs may not be mmapped
+ * - any one file may have an arbitrary number of xattrs
+ * To contrast with a normal file, each file has a corresponding vnode which in
+ * turn has its own private ubc. The only way in which xattrs are actually like
+ * files is that they have the same size limits and, in terms of their
+ * implementation, use the same on-disk structures.
+ * The result of this is that one vnode per xattr would be too much overhead,
+ * so instead a disk block-style cache is maintained for xattr data.
+ * This cache is implemented as the ubc of a virtual file known as the single
+ * xattr vnode. Reads and writes are serviced by the cluster layer. The cluster
+ * layer operates in units of the vm pagesize.
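+ *
+ * (Throughout the discussion below, assume a 16k vm pagesize with a 4k fs
+ * blocksize; per xattr_cluster_scale() the scale factor is then
+ * max(16k, 4k) = 16k, so a physical block number pbn maps to virtual offset
+ * pbn * 16k -- e.g. pbn 4 maps to 64k, as in the diagrams that follow.)
+ *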
+ * On a system where pagesize > fs blocksize, the naïve approach of using an
+ * identity mapping between ubc logical offset and device offset poses a
+ * challenge. Consider the following scenario:
+ * - 16k vm pagesize
+ * - 4k fs blocksize
+ *
+ * On disk, we have two xattrs that reside on adjacent blocks:
+ *   xattr A   xattr B
+ *  [aaaa|aaaa|bbbb|bbbb]
+ * pbn 4    5    6    7    8
+ *
+ * Suppose we want to read just xattr A -- pbn 4 and 5. The cluster layer can
+ * issue just an 8k IO, but it will cache it in memory as a whole page, so we
+ * would end up with
+ *
+ *   xattr A   xattr B
+ * in memory:
+ *  [aaaa|aaaa|0000|0000]
+ *
+ * on disk:
+ *  [aaaa|aaaa|bbbb|bbbb]
+ * pbn 4    5    6    7    8
+ *
+ * A subsequent read for pbn 6 or 7, as a part of xattr B, will find the page
+ * already cached in the ubc and erroneously return zeros.
+ * Instead, we could have the cluster layer issue the full 16k IO, but then we
+ * run into encryption issues on per-file (really per-xattr) key volumes:
+ *
+ *   xattr A   xattr B
+ * in memory:
+ *  [aaaa|aaaa|O!#W|JF%R]
+ *
+ * on disk:
+ *  [asdf|ghjk|ZXCV|BNM,] encrypted
+ *  [aaaa|aaaa|bbbb|bbbb] unencrypted
+ * pbn 4    5    6    7    8
+ *
+ * In this case, the issue is that we have the crypto state available to read
+ * xattr A, but we do not have the crypto state for xattr B, so we would
+ * incorrectly decrypt pbn 6, 7 in the same IO.
+ *
+ * The solution to this is to use a scaled mapping between ubc logical offset
+ * and device offset. In this case, we use
+ *   logical offset = physical block number * pagesize
+ * and the result looks like
+ *
+ *   xattr A                    xattr B
+ * in memory:
+ *  [aaaa|aaaa|0000|0000] ... [bbbb|bbbb|0000|0000]
+ *  64k                       96k
+ *
+ * on disk:
+ *  [asdf|ghjk|ZXCV|BNM,] encrypted
+ *  [aaaa|aaaa|bbbb|bbbb] unencrypted
+ * pbn 4    5    6    7    8
+ *
+ * In memory, xattr A occupies the virtual range of [64k, 96k), but its
+ * contents occupy only the first 8k of it, within the first page [64k, 80k).
+ * Note that the mapping here is not per block but rather per xattr file
+ * extent. The contents tracked by an individual file extent are logically
+ * contiguous in memory. In the example above, xattr A has one file extent
+ * spanning [0, 8k). Suppose it instead had two file extents -- [0, 4k) at
+ * pbn 4 and [4k, 8k) at pbn 5 -- the above diagram would instead look like
+ * in memory:
+ *  [aaaa|0000|0000|0000][aaaa|0000|0000|0000]
+ *  64k                  80k
+ * on disk:
+ *  [aaaa|aaaa] unencrypted
+ * pbn 4    5
+ *
+ * This scaled mapping approach guarantees that xattrs are always on different
+ * pages from other xattrs, but it comes at an increased memory cost for
+ * non-page multiple sized xattrs.
+ * --
+ *
+ * If `fext' is not representable as a virtual offset (e.g. its startBlock is
+ * corrupt), this function returns false.
+ */
+static bool
+uio_set_fext(uio_t uio, const HFSPlusExtentDescriptor *fext, uint32_t fs_bsize)
+{
+	uint64_t off;
+	if (!fext2cluster_check(fext, fs_bsize, &off)) {
+		// `fext' is out of range
+		return false;
+	}
+
+	uio_setoffset(uio, off);
+	return true;
+}
+#endif // NEW_XATTR
 
 /*
  * Read an extent based attribute.
  */
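A minimal usage sketch for uio_set_fext() (this simply restates the loop
added to read_attr_data()/write_attr_data() in the hunks below; `extents[i]'
and `blksize' are the variables used there):

	uio_setresid(uio, iosize);
	if (!uio_set_fext(uio, &extents[i], blksize)) {
		result = EILSEQ;	// fext not representable: treat as corrupt
		break;
	}
	// the uio now addresses fext2cluster(&extents[i], blksize) within the
	// single xattr vnode's ubc; stage the fext so the lower IO layers can
	// translate the virtual offset back to a device offset
	xattr_fext_set(&hfsmp->hfs_xattr_io, xattr_fext, &extents[i]);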
@@ -2391,6 +3133,7 @@ static int
 read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents)
 {
 	vnode_t evp = hfsmp->hfs_attrdata_vp;
+	off_t filesize;
 	int bufsize;
 	int64_t iosize;
 	int attrsize;
@@ -2403,7 +3146,16 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent
 	bufsize = (int)uio_resid(uio);
 	attrsize = (int)datasize;
 	blksize = (int)hfsmp->blockSize;
+	filesize = VTOF(evp)->ff_size;
 
+#if NEW_XATTR
+	// allocate an xattr fext for thread-local storage through the cluster layer
+	const HFSPlusExtentDescriptor **xattr_fext;
+	if (!(xattr_fext = xattr_fext_alloc(&hfsmp->hfs_xattr_io))) {
+		result = ENOMEM;
+		goto exit;
+	}
+#endif
 	/*
 	 * Read the attribute data one extent at a time.
 	 * For the typical case there is only one extent.
@@ -2413,9 +3165,28 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent
 		iosize = MIN(iosize, attrsize);
 		iosize = MIN(iosize, bufsize);
 		uio_setresid(uio, iosize);
+#if NEW_XATTR
+		// virtualize the IO offset to target this fext
+		if (!uio_set_fext(uio, &extents[i], blksize)) {
+			// `fext' is corrupted
+			result = EILSEQ;
+			break;
+		}
+
+		// stage the next xattr fext for IO
+		xattr_fext_set(&hfsmp->hfs_xattr_io, xattr_fext, &extents[i]);
+
+		// set filesize to the end of this transfer to prevent cluster read-ahead
+		filesize = uio_offset(uio) + iosize;
+#else
 		uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize);
+#endif
+		result = cluster_read(evp, uio, filesize, IO_SYNC | IO_UNIT);
 
-		result = cluster_read(evp, uio, VTOF(evp)->ff_size, IO_SYNC | IO_UNIT);
+#if NEW_XATTR
+		// post IO, unstage this xattr fext
+		xattr_fext_clear(&hfsmp->hfs_xattr_io, xattr_fext);
+#endif
 
 #if HFS_XATTR_VERBOSE
 		printf("hfs: read_attr_data: cr iosize %lld [%d, %d] (%d)\n",
@@ -2429,6 +3200,11 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent
 	uio_setresid(uio, bufsize);
 	uio_setoffset(uio, datasize);
 
+#if NEW_XATTR
+	xattr_fext_free(&hfsmp->hfs_xattr_io, xattr_fext);
+
+exit:
+#endif
 	hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT);
 	return (result);
 }
@@ -2436,7 +3212,7 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent
 /*
  * Write an extent based attribute.
  */
-__unused static int
+static int
 write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents)
 {
 	vnode_t evp = hfsmp->hfs_attrdata_vp;
@@ -2455,6 +3231,15 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten
 	blksize = (int) hfsmp->blockSize;
 	filesize = VTOF(evp)->ff_size;
 
+#if NEW_XATTR
+	// allocate an xattr fext for thread-local storage through the cluster layer
+	const HFSPlusExtentDescriptor **xattr_fext;
+	if (!(xattr_fext = xattr_fext_alloc(&hfsmp->hfs_xattr_io))) {
+		result = ENOMEM;
+		goto exit;
+	}
+#endif
+
 	/*
 	 * Write the attribute data one extent at a time.
 	 */
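A note on the filesize capping visible in both loops (grounded in the hunks
above and below): under NEW_XATTR, the `filesize' handed to
cluster_read()/cluster_write() is deliberately clamped to the end of the
current transfer,

	filesize = uio_offset(uio) + iosize;

so that the cluster layer never initiates IO past the staged fext into a
page that belongs to a neighboring xattr. This complements the
vnode_setnoreadahead() call added to init_attrdata_vnode() above.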
@@ -2463,10 +3248,29 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten
 		iosize = MIN(iosize, attrsize);
 		iosize = MIN(iosize, bufsize);
 		uio_setresid(uio, iosize);
-		uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize);
+#if NEW_XATTR
+		// virtualize the IO offset to target this fext
+		if (!uio_set_fext(uio, &extents[i], blksize)) {
+			// `fext' is corrupted
+			result = EILSEQ;
+			break;
+		}
+
+		// stage the next xattr fext for IO
+		xattr_fext_set(&hfsmp->hfs_xattr_io, xattr_fext, &extents[i]);
+		filesize = uio_offset(uio) + iosize;
+#else
+		uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize);
+#endif
 		result = cluster_write(evp, uio, filesize, filesize, filesize,
 		                       (off_t) 0, IO_SYNC | IO_UNIT);
+
+#if NEW_XATTR
+		// post IO, unstage this xattr fext
+		xattr_fext_clear(&hfsmp->hfs_xattr_io, xattr_fext);
+#endif
+
 #if HFS_XATTR_VERBOSE
 		printf("hfs: write_attr_data: cw iosize %lld [%d, %d] (%d)\n",
 		       iosize, extents[i].startBlock, extents[i].blockCount, result);
@@ -2479,6 +3283,11 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten
 	uio_setresid(uio, bufsize);
 	uio_setoffset(uio, datasize);
 
+#if NEW_XATTR
+	xattr_fext_free(&hfsmp->hfs_xattr_io, xattr_fext);
+
+exit:
+#endif
 	hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT);
 	return (result);
 }
@@ -2486,7 +3295,7 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten
 /*
  * Allocate blocks for an extent based attribute.
  */
-__unused static int
+static int
 alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, HFSPlusExtentDescriptor *extents, int *blocks)
 {
 	int blkcnt;
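For orientation, a rough sketch (assumptions, not the verbatim call site) of
how hfs_setxattr_internal() is expected to drive the two routines whose
__unused annotations this diff drops; assume `uio' wraps the caller's buffer
and `extents'/`extentbufsize' are set up as in the declarations above:

	int blkcnt = 0;
	result = alloc_attr_blks(hfsmp, attrsize, extentbufsize, extents, &blkcnt);
	if (result == 0)
		result = write_attr_data(hfsmp, uio, attrsize, extents);
	// on failure the freshly allocated blocks are released again, so no
	// on-disk space is leaked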