+static int
+sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
+{
+ unsigned int first;
+ unsigned int last;
+ off_t offset;
+ u_int length;
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);
+
+ if (push_all)
+ vfs_drt_control(&(vp->v_scmap), 1);
+
+ for (;;) {
+ if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
+ vp->v_flag &= ~VHASDIRTY;
+ vp->v_clen = 0;
+ break;
+ }
+ first = (unsigned int)(offset / PAGE_SIZE_64);
+ last = (unsigned int)((offset + length) / PAGE_SIZE_64);
+
+ cluster_push_x(vp, EOF, first, last, 0);
+
+ vp->v_scdirty -= (last - first);
+
+ if (push_all == 0)
+ break;
+ }
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
+}
+
+
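+/*
+ * Record a run of dirty pages in the vnode's sparse cluster map.
+ * 'first' and 'last' are page indices; (last - first) pages starting
+ * at 'first' are marked.  If the map fills up, vfs_drt_mark_pages
+ * reports a partial update, some clusters are pushed out via
+ * sparse_cluster_push, and the remainder is retried.
+ */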
+static int
+sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
+{
+ u_int new_dirty;
+ u_int length;
+ off_t offset;
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);
+
+ offset = (off_t)first * PAGE_SIZE_64;
+ length = (last - first) * PAGE_SIZE;
+
+ while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
+ /*
+ * no room left in the map
+ * only a partial update was done
+ * push out some pages and try again
+ */
+ vp->v_scdirty += new_dirty;
+
+ sparse_cluster_push(vp, EOF, 0);
+
+ offset += (new_dirty * PAGE_SIZE_64);
+ length -= (new_dirty * PAGE_SIZE);
+ }
+ vp->v_scdirty += new_dirty;
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
+}
+
+
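+/*
+ * Handle a page-misaligned fragment of a physical (direct) transfer by
+ * staging it through a single-page UPL backed by the UBC.  If the cached
+ * page is not valid it is read in synchronously; the fragment is then
+ * copied physically between the user buffer and the UBC page with copypv,
+ * and for writes (or already-dirty pages) the page is pushed back out
+ * before the uio is advanced by xsize bytes.
+ */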
+static int
+cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
+{
+ struct iovec *iov;
+ upl_page_info_t *pl;
+ upl_t upl;
+ addr64_t ubc_paddr;
+ kern_return_t kret;
+ int error = 0;
+
+ iov = uio->uio_iov;
+
+ kret = ubc_create_upl(vp,
+ uio->uio_offset & ~PAGE_MASK_64,
+ PAGE_SIZE,
+ &upl,
+ &pl,
+ UPL_SET_LITE);
+
+ if (kret != KERN_SUCCESS)
+ return(EINVAL);
+
+ if (!upl_valid_page(pl, 0)) {
+ /*
+ * issue a synchronous read to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+ CL_READ, (struct buf *)0, (struct clios *)0);
+ if (error) {
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+ return(error);
+ }
+ }
+ ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
+
+/*
+ * NOTE: There is no prototype for the following in BSD. It, along with the
+ * definitions of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, can be found in
+ * osfmk/ppc/mappings.h. They are not included here because there appears to be no
+ * way to do so without exporting them to kexts as well.
+ */
+ if (flags & CL_READ)
+// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
+ copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
+ else
+// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
+ copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
+
+ if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
+ /*
+ * issue a synchronous write to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+ 0, (struct buf *)0, (struct clios *)0);
+ }
+ if (error == 0) {
+ uio->uio_offset += xsize;
+ iov->iov_base += xsize;
+ iov->iov_len -= xsize;
+ uio->uio_resid -= xsize;
+ }
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+ return (error);
+}
+
+
+
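+/*
+ * Copy 'xsize' bytes between the pages of 'upl' (starting at 'upl_offset')
+ * and the caller's uio.  The uio segment flag is temporarily switched to
+ * its physical-address equivalent so that uiomove64 can be driven with the
+ * physical address of each UPL page; for copies of 16K or more the kernel
+ * funnel is dropped for the duration of the move.
+ */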
+int
+cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
+{
+ int pg_offset;
+ int pg_index;
+ int csize;
+ int segflg;
+ int retval = 0;
+ upl_page_info_t *pl;
+ boolean_t funnel_state = FALSE;
+
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
+ (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);
+
+ if (xsize >= (16 * 1024))
+ funnel_state = thread_funnel_set(kernel_flock, FALSE);
+
+ segflg = uio->uio_segflg;
+
+ switch(segflg) {
+
+ case UIO_USERSPACE:
+ case UIO_USERISPACE:
+ uio->uio_segflg = UIO_PHYS_USERSPACE;
+ break;
+
+ case UIO_SYSSPACE:
+ uio->uio_segflg = UIO_PHYS_SYSSPACE;
+ break;
+ }
+ pl = ubc_upl_pageinfo(upl);
+
+ pg_index = upl_offset / PAGE_SIZE;
+ pg_offset = upl_offset & PAGE_MASK;
+ csize = min(PAGE_SIZE - pg_offset, xsize);
+
+ while (xsize && retval == 0) {
+ addr64_t paddr;
+
+ paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
+
+ retval = uiomove64(paddr, csize, uio);
+
+ pg_index += 1;
+ pg_offset = 0;
+ xsize -= csize;
+ csize = min(PAGE_SIZE, xsize);
+ }
+ uio->uio_segflg = segflg;
+
+ if (funnel_state == TRUE)
+ thread_funnel_set(kernel_flock, TRUE);
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+ (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
+
+ return (retval);
+}
+
+
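+/*
+ * Copy data between the uio and pages already resident in the UBC,
+ * one page at a time, stopping at the first page that is not resident
+ * (or the first error).  Each page is busied via ubc_page_op_with_control
+ * for the duration of the copy and optionally marked dirty.  On return,
+ * *io_resid holds the number of bytes that could not be satisfied from
+ * the cache.
+ */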
+int
+cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
+{
+ int segflg;
+ int io_size;
+ int xsize;
+ int start_offset;
+ off_t f_offset;
+ int retval = 0;
+ memory_object_control_t control;
+ int op_flags = UPL_POP_SET | UPL_POP_BUSY;
+ boolean_t funnel_state = FALSE;
+
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
+ (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);
+
+ control = ubc_getobject(vp, UBC_FLAGS_NONE);
+ if (control == MEMORY_OBJECT_CONTROL_NULL) {
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+ (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
+
+ return(0);
+ }
+ if (mark_dirty)
+ op_flags |= UPL_POP_DIRTY;
+
+ segflg = uio->uio_segflg;
+
+ switch(segflg) {
+
+ case UIO_USERSPACE:
+ case UIO_USERISPACE:
+ uio->uio_segflg = UIO_PHYS_USERSPACE;
+ break;
+
+ case UIO_SYSSPACE:
+ uio->uio_segflg = UIO_PHYS_SYSSPACE;
+ break;
+ }
+ io_size = *io_resid;
+ start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
+ f_offset = uio->uio_offset - start_offset;
+ xsize = min(PAGE_SIZE - start_offset, io_size);
+
+ while (io_size && retval == 0) {
+ ppnum_t pgframe;
+
+ if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
+ break;
+
+ if (funnel_state == FALSE && io_size >= (16 * 1024))
+ funnel_state = thread_funnel_set(kernel_flock, FALSE);
+
+ retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);
+
+ ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
+
+ io_size -= xsize;
+ start_offset = 0;
+ f_offset = uio->uio_offset;
+ xsize = min(PAGE_SIZE, io_size);
+ }
+ uio->uio_segflg = segflg;
+ *io_resid = io_size;
+
+ if (funnel_state == TRUE)
+ thread_funnel_set(kernel_flock, TRUE);
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
+ (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
+
+ return(retval);
+}
+
+
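+/*
+ * Walk the file a page at a time and count resident pages that are
+ * marked dirty.  Returns 0 if the file is clean, EINVAL otherwise.
+ */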
+int
+is_file_clean(struct vnode *vp, off_t filesize)
+{
+ off_t f_offset;
+ int flags;
+ int total_dirty = 0;
+
+ for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
+ if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
+ if (flags & UPL_POP_DIRTY) {
+ total_dirty++;
+ }
+ }
+ }
+ if (total_dirty)
+ return(EINVAL);
+
+ return (0);
+}
+
+
+
+/*
+ * Dirty region tracking/clustering mechanism.
+ *
+ * This code (vfs_drt_*) provides a mechanism for tracking and clustering
+ * dirty regions within a larger space (file). It is primarily intended to
+ * support clustering in large files with many dirty areas.
+ *
+ * The implementation assumes that the dirty regions are pages.
+ *
+ * To represent dirty pages within the file, we store bit vectors in a
+ * variable-size circular hash.
+ */
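+
+/*
+ * The call pattern, as used by sparse_cluster_add() and sparse_cluster_push()
+ * above (a sketch of usage only, not additional interface):
+ *
+ *	vfs_drt_mark_pages(&map, offset, length, &setcount);    record dirty pages
+ *	...
+ *	while (vfs_drt_get_cluster(&map, &offset, &length) == KERN_SUCCESS)
+ *		write out the byte range [offset, offset + length);
+ *
+ * A full flush additionally calls vfs_drt_control(&map, 1) before draining
+ * the map.
+ */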
+
+/*
+ * Bitvector size. This determines the number of pages we group in a
+ * single hashtable entry. Each hashtable entry is aligned to this
+ * size within the file.
+ */
+#define DRT_BITVECTOR_PAGES 256
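+/*
+ * With 256 pages per entry and the 4K page size implied by the (1 << 20)
+ * address mask below, each hashtable entry covers a 1MB-aligned span of
+ * the file.
+ */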
+
+/*
+ * File offset handling.
+ *
+ * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
+ * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
+ */
+#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
+#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
+
+/*
+ * Hashtable address field handling.
+ *
+ * The low-order bits of the hashtable address are used to conserve
+ * space.
+ *
+ * DRT_HASH_COUNT_MASK must be large enough to store the range
+ * 0-DRT_BITVECTOR_PAGES inclusive, plus one extra value to indicate
+ * that the bucket is unoccupied.
+ */
+#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
+#define DRT_HASH_SET_ADDRESS(scm, i, a) \
+ do { \
+ (scm)->scm_hashtable[(i)].dhe_control = \
+ ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
+ } while (0)
+#define DRT_HASH_COUNT_MASK 0x1ff
+#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
+#define DRT_HASH_SET_COUNT(scm, i, c) \
+ do { \
+ (scm)->scm_hashtable[(i)].dhe_control = \
+ ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
+ } while (0)
+#define DRT_HASH_CLEAR(scm, i) \
+ do { \
+ (scm)->scm_hashtable[(i)].dhe_control = 0; \
+ } while (0)
+#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
+#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
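+/*
+ * A bucket holds a count in the range 0..DRT_BITVECTOR_PAGES (257 values),
+ * so 9 bits (DRT_HASH_COUNT_MASK == 0x1ff) are sufficient, and the all-ones
+ * count value doubles as the "vacant" sentinel used by DRT_HASH_VACATE and
+ * DRT_HASH_VACANT.
+ */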
+#define DRT_HASH_COPY(oscm, oi, scm, i) \
+ do { \
+ (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
+ DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
+ } while(0)
+
+
+/*
+ * Hash table moduli.
+ *
+ * Since the hashtable entry's size is dependent on the size of
+ * the bitvector, and since the hashtable size is constrained to
+ * both being prime and fitting within the desired allocation
+ * size, these values need to be manually determined.
+ *
+ * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
+ *
+ * The small hashtable allocation is 1024 bytes, so the modulus is 23.
+ * The large hashtable allocation is 16384 bytes, so the modulus is 401.
+ */
+#define DRT_HASH_SMALL_MODULUS 23
+#define DRT_HASH_LARGE_MODULUS 401
+
+#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
+#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
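+
+/*
+ * Arithmetic behind the moduli above: each entry is 8 bytes of dhe_control
+ * plus (256 / 32) * 4 = 32 bytes of bitvector, i.e. 40 bytes.
+ *	23 * 40  =   920 bytes -> 1024  -   920 = 104 bytes spare
+ *	401 * 40 = 16040 bytes -> 16384 - 16040 = 344 bytes spare
+ */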
+
+/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
+
+/*
+ * Hashtable bitvector handling.
+ *
+ * Bitvector fields are 32 bits long.
+ */
+
+#define DRT_HASH_SET_BIT(scm, i, bit) \
+ (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
+
+#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
+ (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
+
+#define DRT_HASH_TEST_BIT(scm, i, bit) \
+ ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
+
+#define DRT_BITVECTOR_CLEAR(scm, i) \
+ bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
+
+#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
+ bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
+ &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
+ (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
+
+
+
+/*
+ * Hashtable entry.
+ */
+struct vfs_drt_hashentry {
+ u_int64_t dhe_control;
+ u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
+};
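+
+/*
+ * dhe_control packs the DRT_ALIGN_ADDRESS()-aligned file offset of the
+ * entry into its high bits and the dirty-page count (or the vacant
+ * sentinel) into the low DRT_HASH_COUNT_MASK bits; dhe_bitvector carries
+ * one bit per page in the entry's span.
+ */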
+
+/*
+ * Dirty Region Tracking structure.
+ *
+ * The hashtable is allocated entirely inside the DRT structure.
+ *
+ * The hash is a simple circular prime-modulus arrangement; the structure
+ * is resized from small to large if it overflows.
+ */
+
+struct vfs_drt_clustermap {
+ u_int32_t scm_magic; /* sanity/detection */
+#define DRT_SCM_MAGIC 0x12020003
+ u_int32_t scm_modulus; /* current ring size */
+ u_int32_t scm_buckets; /* number of occupied buckets */
+ u_int32_t scm_lastclean; /* last entry we cleaned */
+ u_int32_t scm_iskips; /* number of slot skips */
+
+ struct vfs_drt_hashentry scm_hashtable[0];
+};
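+
+/*
+ * scm_hashtable is a zero-length array: the header and the entries are
+ * carved out of a single DRT_SMALL_ALLOCATION- or DRT_LARGE_ALLOCATION-
+ * sized block, which is why the moduli above must fit those sizes.
+ */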
+
+
+#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
+#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
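+
+/*
+ * The hash key is the DRT_ALIGN_ADDRESS()-aligned file offset; collisions
+ * appear to be handled by walking the ring with DRT_HASH_NEXT (slot skips
+ * are tallied in scm_iskips).
+ */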
+
+/*
+ * Debugging codes and arguments.
+ */
+#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
+#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
+#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
+#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
+#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
+ * dirty */
+ /* 0, setcount */
+ /* 1 (clean, no map) */
+ /* 2 (map alloc fail) */
+ /* 3, resid (partial) */
+#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
+#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
+ * lastclean, iskips */
+
+
+static void vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
+static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
+static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
+static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
+ u_int64_t offset, int *indexp);
+static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
+ u_int64_t offset,
+ int *indexp,
+ int recursed);
+static kern_return_t vfs_drt_do_mark_pages(
+ void **cmapp,
+ u_int64_t offset,
+ u_int length,
+ int *setcountp,
+ int dirty);
+static void vfs_drt_trace(
+ struct vfs_drt_clustermap *cmap,
+ int code,
+ int arg1,
+ int arg2,
+ int arg3,
+ int arg4);
+
+
+/*
+ * Allocate and initialise a sparse cluster map.
+ *
+ * Will allocate a new map, or resize or compact an existing map.
+ *
+ * XXX we should probably have at least one intermediate map size,
+ * as the 1:16 ratio seems a bit drastic.
+ */
+static kern_return_t
+vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
+{
+ struct vfs_drt_clustermap *cmap, *ocmap;
+ kern_return_t kret;
+ u_int64_t offset;
+ int nsize, i, active_buckets, index, copycount;
+
+ ocmap = NULL;
+ if (cmapp != NULL)
+ ocmap = *cmapp;