+/*
+ * sparse_cluster_push
+ *
+ * Pull dirty clusters out of the sparse cluster map (*scmap) and hand
+ * each one to cluster_push_now().
+ *
+ * Without PUSH_ALL in push_flag at most one cluster is pushed.  With
+ * PUSH_ALL, vfs_drt_control(scmap, 1) is invoked first and clusters are
+ * pushed until vfs_drt_get_cluster() no longer returns KERN_SUCCESS.
+ *
+ * Only the IO_PASSIVE and IO_CLOSE bits of io_flags are forwarded.
+ */
+static void
+sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
+{
+	struct cl_extent extent;
+	off_t	byte_offset;
+	u_int	byte_count;
+	int	drain_all;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0);
+
+	drain_all = ((push_flag & PUSH_ALL) != 0);
+
+	if (drain_all)
+		vfs_drt_control(scmap, 1);
+
+	do {
+		if (vfs_drt_get_cluster(scmap, &byte_offset, &byte_count) != KERN_SUCCESS)
+			break;
+
+		/* convert the byte range into a page-granular extent */
+		extent.b_addr = (daddr64_t)(byte_offset / PAGE_SIZE_64);
+		extent.e_addr = (daddr64_t)((byte_offset + byte_count) / PAGE_SIZE_64);
+
+		cluster_push_now(vp, &extent, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
+
+	} while (drain_all);
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
+}
+
+
+/*
+ * sparse_cluster_add
+ *
+ * Called with the write behind lock held.
+ *
+ * Records the page extent described by 'cl' in the sparse cluster map.
+ * Whenever vfs_drt_mark_pages() reports anything other than KERN_SUCCESS
+ * (no room left in the map, only a partial update was done), one cluster
+ * is pushed out via sparse_cluster_push() and the remainder of the range
+ * is retried.
+ */
+static void
+sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
+{
+	off_t	range_start;
+	u_int	range_len;
+	u_int	pages_marked;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
+
+	range_start = (off_t)(cl->b_addr * PAGE_SIZE_64);
+	range_len   = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
+
+	for (;;) {
+		if (vfs_drt_mark_pages(scmap, range_start, range_len, &pages_marked) == KERN_SUCCESS)
+			break;
+
+		/* make room by pushing some pages, then skip what was already marked */
+		sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
+
+		range_start += (pages_marked * PAGE_SIZE_64);
+		range_len   -= (pages_marked * PAGE_SIZE);
+	}
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
+}
+
+
+/*
+ * cluster_align_phys_io
+ *
+ * Copy xsize bytes between the physical address usr_paddr and the single
+ * UBC page of 'vp' that contains uio->uio_offset, using copypv().
+ *
+ * When CL_READ is set in flags the data moves from the UBC page to
+ * usr_paddr; otherwise it moves from usr_paddr into the UBC page and the
+ * page is then written out through cluster_io().
+ *
+ * Returns EINVAL if the UPL cannot be created, the error from cluster_io()
+ * if the read or write fails, and 0 otherwise.  The uio is advanced by
+ * xsize only when no error occurred.
+ *
+ * NOTE(review): 'flags' is tested against IO_PASSIVE/IO_NOCACHE as well as
+ * CL_READ -- confirm that callers pass bits from compatible namespaces.
+ */
+static int
+cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
+{
+ upl_page_info_t *pl;
+ upl_t upl;
+ addr64_t ubc_paddr;
+ kern_return_t kret;
+ int error = 0;
+ int did_read = 0;
+ int abort_flags;
+ int upl_flags;
+ int bflag;
+
+ /*
+ * translate the caller's flags into the CL_* bits
+ * handed to cluster_io
+ */
+ if (flags & IO_PASSIVE)
+ bflag = CL_PASSIVE;
+ else
+ bflag = 0;
+
+ if (flags & IO_NOCACHE)
+ bflag |= CL_NOCACHE;
+
+ upl_flags = UPL_SET_LITE;
+
+ if ( !(flags & CL_READ) ) {
+ /*
+ * "write" operation: let the UPL subsystem know
+ * that we intend to modify the buffer cache pages
+ * we're gathering.
+ */
+ upl_flags |= UPL_WILL_MODIFY;
+ } else {
+ /*
+ * indicate that there is no need to pull the
+ * mapping for this page... we're only going
+ * to read from it, not modify it.
+ */
+ upl_flags |= UPL_FILE_IO;
+ }
+ /*
+ * build a one page UPL at the page aligned file offset
+ */
+ kret = ubc_create_upl(vp,
+ uio->uio_offset & ~PAGE_MASK_64,
+ PAGE_SIZE,
+ &upl,
+ &pl,
+ upl_flags);
+
+ if (kret != KERN_SUCCESS)
+ return(EINVAL);
+
+ if (!upl_valid_page(pl, 0)) {
+ /*
+ * the page isn't valid yet...
+ * issue a synchronous read to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
+ CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
+ if (error) {
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+ return(error);
+ }
+ did_read = 1;
+ }
+ /*
+ * physical address of the target byte: page frame number shifted up,
+ * plus the offset within the page
+ * NOTE(review): the shift of 12 is hard-coded -- presumably assumes
+ * 4KB pages; confirm against PAGE_SHIFT
+ */
+ ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
+
+/*
+ * NOTE: There is no prototype for the following in BSD. It, and the definitions
+ * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
+ * osfmk/ppc/mappings.h. They are not included here because there appears to be no
+ * way to do so without exporting them to kexts as well.
+ * The numeric flag arguments below stand in for those defines (see the
+ * commented-out calls for the intended symbolic names).
+ */
+ if (flags & CL_READ)
+// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
+ copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
+ else
+// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
+ copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
+
+ /*
+ * write the page out if we just modified it (not a CL_READ request),
+ * or if it is reported both valid and dirty
+ */
+ if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
+ /*
+ * issue a synchronous write to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
+ bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
+ }
+ if (error == 0)
+ uio_update(uio, (user_size_t)xsize);
+
+ /*
+ * the UPL is always released through an abort: UPL_ABORT_DUMP_PAGES
+ * is added only when the page was not read in by this routine
+ */
+ if (did_read)
+ abort_flags = UPL_ABORT_FREE_ON_EMPTY;
+ else
+ abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
+
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
+
+ return (error);
+}
+
+
+
+/*
+ * cluster_copy_upl_data
+ *
+ * Move *io_resid bytes between 'uio' and the pages of 'upl', starting
+ * upl_offset bytes into the UPL, one page-bounded chunk at a time via
+ * uiomove64().
+ *
+ * The uio's segment flag is temporarily switched to its UIO_PHYS_*
+ * counterpart for the duration of the copy and restored before return.
+ *
+ * On return *io_resid holds the residual byte count as tracked by the
+ * loop.  Returns 0, or the first non-zero value reported by uiomove64().
+ */
+int
+cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
+{
+	upl_page_info_t	*pl;
+	addr64_t	paddr;
+	int	saved_segflg;
+	int	bytes_left;
+	int	chunk;
+	int	page;
+	int	page_off;
+	int	error = 0;
+
+	bytes_left = *io_resid;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
+		     (int)uio->uio_offset, upl_offset, bytes_left, 0, 0);
+
+	saved_segflg = uio->uio_segflg;
+
+	switch (saved_segflg) {
+
+	case UIO_USERSPACE32:
+	case UIO_USERISPACE32:
+		uio->uio_segflg = UIO_PHYS_USERSPACE32;
+		break;
+
+	case UIO_USERSPACE:
+	case UIO_USERISPACE:
+		uio->uio_segflg = UIO_PHYS_USERSPACE;
+		break;
+
+	case UIO_USERSPACE64:
+	case UIO_USERISPACE64:
+		uio->uio_segflg = UIO_PHYS_USERSPACE64;
+		break;
+
+	case UIO_SYSSPACE:
+		uio->uio_segflg = UIO_PHYS_SYSSPACE;
+		break;
+
+	default:
+		/* any other segment flag is left untouched */
+		break;
+	}
+	pl = ubc_upl_pageinfo(upl);
+
+	page     = upl_offset / PAGE_SIZE;
+	page_off = upl_offset & PAGE_MASK;
+
+	while (bytes_left && error == 0) {
+		/* only the first chunk can start part way into a page */
+		chunk = min(PAGE_SIZE - page_off, bytes_left);
+		paddr = ((addr64_t)upl_phys_page(pl, page) << 12) + page_off;
+
+		error = uiomove64(paddr, chunk, uio);
+
+		page++;
+		page_off = 0;
+		bytes_left -= chunk;
+	}
+	*io_resid = bytes_left;
+
+	uio->uio_segflg = saved_segflg;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+		     (int)uio->uio_offset, bytes_left, error, saved_segflg, 0);
+
+	return (error);
+}
+
+
+/*
+ * cluster_copy_ubc_data
+ *
+ * Non-static entry point: forwards to cluster_copy_ubc_data_internal()
+ * with take_reference fixed at 1 and returns its result unchanged.
+ */
+int
+cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
+{
+	int	error;
+
+	error = cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
+
+	return (error);
+}
+
+
+/*
+ * cluster_copy_ubc_data_internal
+ *
+ * Move up to *io_resid bytes between 'uio' and the memory object backing
+ * 'vp' via memory_object_control_uiomove(), starting at the page that
+ * contains uio->uio_offset.
+ *
+ * If the vnode has no memory object control, returns 0 and leaves
+ * *io_resid untouched.  Otherwise *io_resid is reduced by the number of
+ * bytes by which the uio's residual shrank, and the value returned by
+ * memory_object_control_uiomove() is passed back (0 when *io_resid was 0).
+ *
+ * The uio's segment flag is switched to its UIO_PHYS_* counterpart for
+ * the duration of the copy and restored before return.
+ */
+static int
+cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
+{
+	memory_object_control_t	control;
+	int	saved_segflg;
+	int	io_size;
+	int	moved;
+	int	page_offset;
+	int	retval = 0;
+
+	io_size = *io_resid;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
+		     (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
+
+	control = ubc_getobject(vp, UBC_FLAGS_NONE);
+
+	if (control == MEMORY_OBJECT_CONTROL_NULL) {
+		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+			     (int)uio->uio_offset, io_size, retval, 3, 0);
+
+		return (0);
+	}
+	saved_segflg = uio->uio_segflg;
+
+	switch (saved_segflg) {
+
+	case UIO_USERSPACE32:
+	case UIO_USERISPACE32:
+		uio->uio_segflg = UIO_PHYS_USERSPACE32;
+		break;
+
+	case UIO_USERSPACE:
+	case UIO_USERISPACE:
+		uio->uio_segflg = UIO_PHYS_USERSPACE;
+		break;
+
+	case UIO_USERSPACE64:
+	case UIO_USERISPACE64:
+		uio->uio_segflg = UIO_PHYS_USERSPACE64;
+		break;
+
+	case UIO_SYSSPACE:
+		uio->uio_segflg = UIO_PHYS_SYSSPACE;
+		break;
+
+	default:
+		/* any other segment flag is left untouched */
+		break;
+	}
+
+	io_size = *io_resid;
+
+	if (io_size != 0) {
+		page_offset = (int)(uio->uio_offset & PAGE_MASK_64);
+
+		/* remember the residual so the amount actually moved can be derived */
+		moved = uio_resid(uio);
+
+		retval = memory_object_control_uiomove(control, uio->uio_offset - page_offset, uio,
+						       page_offset, io_size, mark_dirty, take_reference);
+		moved -= uio_resid(uio);
+		io_size -= moved;
+	}
+	uio->uio_segflg = saved_segflg;
+	*io_resid = io_size;
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+		     (int)uio->uio_offset, io_size, retval, 0x80000000 | saved_segflg, 0);
+
+	return (retval);
+}
+
+
+/*
+ * is_file_clean
+ *
+ * Walk the file a page at a time, from offset 0 up to (but not including)
+ * 'filesize', asking ubc_page_op() for the state of each page.
+ *
+ * Returns EINVAL as soon as a page is reported with UPL_POP_DIRTY set,
+ * and 0 if no such page is found.  Pages for which ubc_page_op() does not
+ * return KERN_SUCCESS are skipped.
+ *
+ * The scan stops at the first dirty page: the caller only learns whether
+ * any dirty page exists, so there is no need to visit (or count) the rest.
+ */
+int
+is_file_clean(vnode_t vp, off_t filesize)
+{
+	off_t	f_offset;
+	int	flags;
+
+	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
+		/* ops == 0: query the page state only */
+		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) != KERN_SUCCESS)
+			continue;
+
+		if (flags & UPL_POP_DIRTY)
+			return (EINVAL);
+	}
+	return (0);
+}
+
+
+
+/*
+ * Dirty region tracking/clustering mechanism.
+ *
+ * This code (vfs_drt_*) provides a mechanism for tracking and clustering
+ * dirty regions within a larger space (file). It is primarily intended to
+ * support clustering in large files with many dirty areas.
+ *
+ * The implementation assumes that the dirty regions are pages.
+ *
+ * To represent dirty pages within the file, we store bit vectors in a
+ * variable-size circular hash.
+ */
+
+/*
+ * Bitvector size. This determines the number of pages we group in a
+ * single hashtable entry. Each hashtable entry is aligned to this
+ * size within the file.
+ */
+#define DRT_BITVECTOR_PAGES 256
+
+/*
+ * File offset handling.
+ *
+ * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
+ * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
+ *
+ * The literal below hard-codes 1 << 20 (1 MiB), which equals
+ * DRT_BITVECTOR_PAGES * PAGE_SIZE only for 256 pages of 4 KiB each;
+ * it must be revisited if either value changes.
+ *
+ * DRT_ALIGN_ADDRESS rounds a file offset down to the start of the
+ * region covered by a single hashtable entry.
+ */
+#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
+#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
+
+/*
+ * Hashtable address field handling.
+ *
+ * Each entry's dhe_control word packs two fields: the bits selected by
+ * DRT_ADDRESS_MASK hold the aligned file offset the entry covers, and
+ * the low-order bits selected by DRT_HASH_COUNT_MASK hold a count.
+ * The low-order bits of the hashtable address are used this way to
+ * conserve space.
+ *
+ * DRT_HASH_COUNT_MASK must be large enough to store the range
+ * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
+ * to indicate that the bucket is actually unoccupied.  That value is
+ * the all-ones count (DRT_HASH_COUNT_MASK itself); see DRT_HASH_VACATE
+ * and DRT_HASH_VACANT.
+ *
+ * Every statement-like macro expands to a do { } while (0) WITHOUT a
+ * trailing semicolon, so that it behaves as exactly one statement once
+ * the caller supplies the ';' (including inside an unbraced if/else).
+ */
+#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
+#define DRT_HASH_SET_ADDRESS(scm, i, a)								\
+	do {											\
+		(scm)->scm_hashtable[(i)].dhe_control =						\
+		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
+	} while (0)
+#define DRT_HASH_COUNT_MASK		0x1ff
+#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
+#define DRT_HASH_SET_COUNT(scm, i, c)								\
+	do {											\
+		(scm)->scm_hashtable[(i)].dhe_control =						\
+		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
+	} while (0)
+/* reset both the address and the count fields of an entry to zero */
+#define DRT_HASH_CLEAR(scm, i)									\
+	do {											\
+		(scm)->scm_hashtable[(i)].dhe_control = 0;					\
+	} while (0)
+/* mark an entry unoccupied / test whether it is unoccupied */
+#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
+#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
+/* copy entry 'oi' of map 'oscm' (control word and bitvector) into entry 'i' of map 'scm' */
+#define DRT_HASH_COPY(oscm, oi, scm, i)								\
+	do {											\
+		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
+		DRT_BITVECTOR_COPY(oscm, oi, scm, i);						\
+	} while (0)
+
+
+/*
+ * Hash table moduli.
+ *
+ * Since the hashtable entry's size is dependent on the size of
+ * the bitvector, and since the hashtable size is constrained to
+ * both being prime and fitting within the desired allocation
+ * size, these values need to be manually determined.
+ *
+ * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes
+ * (an 8 byte control word plus 256 / 32 bitvector words of 4 bytes).
+ *
+ * The small hashtable allocation is 1024 bytes, so the modulus is 23.
+ * The large hashtable allocation is 16384 bytes, so the modulus is 401.
+ */
+#define DRT_HASH_SMALL_MODULUS 23
+#define DRT_HASH_LARGE_MODULUS 401
+
+/*
+ * Physical memory required before the large hash modulus is permitted.
+ *
+ * On small memory systems, the large hash modulus can lead to physical
+ * memory starvation, so we avoid using it there.
+ */
+#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
+
+/*
+ * Allocation sizes for the two map variants.  The "spare" figures are
+ * the allocation size minus (modulus * 40 byte entry size).
+ */
+#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
+#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
+
+/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
+
+/*
+ * Hashtable bitvector handling.
+ *
+ * Bitvector fields are 32 bits long: bit 'bit' of an entry lives in
+ * word (bit / 32) at position (bit % 32).
+ *
+ * The single-bit masks are built from an unsigned constant (1U).  With a
+ * plain int constant, selecting position 31 would shift a 1 into the sign
+ * bit, which is undefined behaviour in C.
+ */
+
+#define DRT_HASH_SET_BIT(scm, i, bit)				\
+	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32))
+
+#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
+	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32))
+
+#define DRT_HASH_TEST_BIT(scm, i, bit)				\
+	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))
+
+/* zero every bit of entry i's bitvector */
+#define DRT_BITVECTOR_CLEAR(scm, i)				\
+	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
+
+/* copy the whole bitvector of entry 'oi' in map 'oscm' to entry 'i' in map 'scm' */
+#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
+	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
+	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
+	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
+
+
+
+/*
+ * Hashtable entry.
+ *
+ * dhe_control is the packed word read and written by the
+ * DRT_HASH_GET_/SET_ADDRESS and DRT_HASH_GET_/SET_COUNT macros.
+ * dhe_bitvector holds one bit per page, DRT_BITVECTOR_PAGES bits in
+ * total, stored as 32-bit words.
+ */
+struct vfs_drt_hashentry {
+ u_int64_t dhe_control;
+ u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
+};
+
+/*
+ * Dirty Region Tracking structure.
+ *
+ * The hashtable is allocated entirely inside the DRT structure: the
+ * header fields below are followed directly by scm_modulus hashtable
+ * entries, declared as a C99 flexible array member.
+ *
+ * The hash is a simple circular prime modulus arrangement, the structure
+ * is resized from small to large if it overflows.
+ */
+
+struct vfs_drt_clustermap {
+	u_int32_t	scm_magic;	/* sanity/detection */
+#define DRT_SCM_MAGIC	0x12020003
+	u_int32_t	scm_modulus;	/* current ring size */
+	u_int32_t	scm_buckets;	/* number of occupied buckets */
+	u_int32_t	scm_lastclean;	/* last entry we cleaned */
+	u_int32_t	scm_iskips;	/* number of slot skips */
+
+	struct vfs_drt_hashentry scm_hashtable[];	/* scm_modulus entries follow the header */
+};
+
+
+/*
+ * DRT_HASH reduces a value modulo the map's current ring size;
+ * DRT_HASH_NEXT yields the following slot, wrapping around at the
+ * end of the ring.
+ */
+#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
+#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
+
+/*
+ * Debugging codes and arguments.
+ *
+ * Each code is an FSDBG_CODE in the DBG_FSRW class; the trailing comment
+ * on each line lists the arguments recorded with that code.
+ */
+#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
+#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
+#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
+#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
+#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
+ * dirty */
+ /* 0, setcount */
+ /* 1 (clean, no map) */
+ /* 2 (map alloc fail) */
+ /* 3, resid (partial) */
+#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
+#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
+ * lastclean, iskips */
+
+
+/*
+ * Forward declarations for the file-local dirty region tracking helpers.
+ */
+static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
+static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
+static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
+ u_int64_t offset, int *indexp);
+static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
+ u_int64_t offset,
+ int *indexp,
+ int recursed);
+static kern_return_t vfs_drt_do_mark_pages(
+ void **cmapp,
+ u_int64_t offset,
+ u_int length,
+ u_int *setcountp,
+ int dirty);
+static void vfs_drt_trace(
+ struct vfs_drt_clustermap *cmap,
+ int code,
+ int arg1,
+ int arg2,
+ int arg3,
+ int arg4);
+
+
+/*
+ * Allocate and initialise a sparse cluster map.
+ *
+ * Will allocate a new map, resize or compact an existing map.
+ *
+ * XXX we should probably have at least one intermediate map size,
+ * as the 1:16 ratio seems a bit drastic.
+ */
+static kern_return_t
+vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
+{
+ struct vfs_drt_clustermap *cmap, *ocmap;
+ kern_return_t kret;
+ u_int64_t offset;
+ u_int32_t i;
+ int nsize, active_buckets, index, copycount;
+
+ ocmap = NULL;
+ if (cmapp != NULL)
+ ocmap = *cmapp;