#include <kern/thread.h>
#include <sys/fslog.h> /* fslog_io_error() */
+#include <sys/disk.h> /* dk_error_description_t */
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
static lck_grp_attr_t *buf_mtx_grp_attr;
static lck_mtx_t *iobuffer_mtxp;
static lck_mtx_t *buf_mtxp;
+static lck_mtx_t *buf_gc_callout;
static int buf_busycount;
+#define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
+typedef struct {
+ void (* callout)(int, void *);
+ void *context;
+} fs_buffer_cache_gc_callout_t;
+
+fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
+
static __inline__ int
buf_timestamp(void)
{
/*
* Attach the file offset to this buffer. The
* bufattr attributes will be passed down the stack
- * until they reach IOFlashStorage. IOFlashStorage
+ * until they reach the storage driver (whether
+ * IOFlashStorage, ASP, or IONVMe). The driver
* will retain the offset in a local variable when it
* issues its I/Os to the NAND controller.
*
* case, LwVM will update this field when it dispatches
* each I/O to IOFlashStorage. But from our perspective
* we have only issued a single I/O.
+ *
+ * In the case of APFS we do not bounce through another
+ * intermediate layer (such as CoreStorage). APFS will
+ * issue the I/Os directly to the block device / IOMedia
+ * via buf_strategy on the specfs node.
*/
buf_setcpoff(bp, f_offset);
CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
*/
buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+ buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
if (iobuffer_mtxp == NULL)
panic("couldn't create iobuffer mutex");
if (buf_mtxp == NULL)
panic("couldn't create buf mutex");
+ if (buf_gc_callout == NULL)
+ panic("couldn't create buf_gc_callout mutex");
+
/*
* allocate and initialize cluster specific global locks...
*/
*/
#define MINMETA 512
-#define MAXMETA 8192
+#define MAXMETA 16384
struct meta_zone_entry {
zone_t mz_zone;
{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+ {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
{NULL, 0, 0, "" } /* End */
};
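
/*
 * For illustration only (not part of this change): a minimal sketch of how
 * the size-to-zone table above can be consulted to pick the zone backing a
 * metadata buffer of a given size.  It assumes the table is named meta_zones
 * and that the second field of each entry (the per-buffer size) is mz_size,
 * matching the initializers above; it mirrors the getbufzone()-style lookup
 * used elsewhere in vfs_bio.c.
 */
static __inline__ zone_t
getbufzone_sketch(size_t size)
{
	int i;

	/* metadata buffers are 512-byte multiples in [MINMETA, MAXMETA] */
	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
		panic("getbufzone: incorrect size = %lu", size);

	/* pick the first zone whose buffer size can hold the request */
	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		if (meta_zones[i].mz_size >= size)
			break;
	}
	return (meta_zones[i].mz_zone);
}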
break;
}
} else {
+ int clear_bdone;
+
/*
* buffer in core and not busy
*/
if ( (bp->b_upl) )
panic("buffer has UPL, but not marked BUSY: %p", bp);
- if ( !ret_only_valid && bp->b_bufsize != size)
- allocbuf(bp, size);
+ clear_bdone = FALSE;
+ if (!ret_only_valid) {
+ /*
+ * If the number of bytes that are valid is going
+ * to increase (even if we end up not doing a
+ * reallocation through allocbuf), we have to read
+ * in the new size first.
+ *
+ * This is required in cases where we are doing a
+ * read-modify-write of already-valid data on disk, but
+ * in cases where the data on disk beyond (blkno + b_bcount)
+ * is invalid, we may end up doing extra I/O.
+ */
+ if (operation == BLK_META && bp->b_bcount < size) {
+ /*
+ * Since we are going to read in the whole size first
+ * we first have to ensure that any pending delayed write
+ * is flushed to disk first.
+ */
+ if (ISSET(bp->b_flags, B_DELWRI)) {
+ CLR(bp->b_flags, B_CACHE);
+ buf_bwrite(bp);
+ goto start;
+ }
+ /*
+ * clear B_DONE before returning from
+ * this function so that the caller can
+ * issue a read for the new size.
+ */
+ clear_bdone = TRUE;
+ }
+
+ if (bp->b_bufsize != size)
+ allocbuf(bp, size);
+ }
upl_flags = 0;
switch (operation) {
/*NOTREACHED*/
break;
}
+
+ if (clear_bdone)
+ CLR(bp->b_flags, B_DONE);
}
} else { /* not incore() */
int queue = BQ_EMPTY; /* Start with no preference */
mp = NULL;
}
+ if (ISSET(bp->b_flags, B_ERROR)) {
+ if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
+ dk_error_description_t desc;
+ bzero(&desc, sizeof(desc));
+ desc.description = panic_disk_error_description;
+ desc.description_size = panic_disk_error_description_size;
+ VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
+ }
+ }
+
if (mp && (bp->b_flags & B_READ) == 0) {
update_last_io_time(mp);
INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
if (bap->ba_flags & BA_NOCACHE)
code |= DKIO_NOCACHE;
+ if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
+ code |= DKIO_TIER_UPGRADE;
+ }
+
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
}
* indicators
*/
CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
- CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
+ CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
SET_BUFATTR_IO_TIER(bap, 0);
return(0);
}
+int
+fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == NULL) {
+ fs_callouts[i].callout = callout;
+ fs_callouts[i].context = context;
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+ }
+ }
+
+ lck_mtx_unlock(buf_gc_callout);
+ return ENOMEM;
+}
+
+int
+fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == callout &&
+ fs_callouts[i].context == context) {
+ fs_callouts[i].callout = NULL;
+ fs_callouts[i].context = NULL;
+ }
+ }
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+}
+
+static void
+fs_buffer_cache_gc_dispatch_callouts(int all)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout != NULL) {
+ fs_callouts[i].callout(all, fs_callouts[i].context);
+ }
+ }
+ lck_mtx_unlock(buf_gc_callout);
+}
+
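/*
 * For illustration only (not part of this change): a minimal sketch of how a
 * filesystem might use the registration API above.  The my_fs_* names and the
 * my_fs_mount structure are hypothetical; the callout receives the same 'all'
 * argument that buffer_cache_gc() was invoked with, plus the context pointer
 * supplied at registration time.
 */
struct my_fs_mount;                                         /* hypothetical per-mount state */
extern void my_fs_trim_caches(struct my_fs_mount *, int);   /* hypothetical helper */

static void
my_fs_bc_gc_callout(int all, void *context)
{
	struct my_fs_mount *fsmp = (struct my_fs_mount *)context;

	/* shrink per-mount caches; be more aggressive when 'all' is set */
	my_fs_trim_caches(fsmp, all);
}

static int
my_fs_mount_setup(struct my_fs_mount *fsmp)
{
	/* fails with ENOMEM once all FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE slots are taken */
	return fs_buffer_cache_gc_register(my_fs_bc_gc_callout, fsmp);
}

static void
my_fs_mount_teardown(struct my_fs_mount *fsmp)
{
	(void)fs_buffer_cache_gc_unregister(my_fs_bc_gc_callout, fsmp);
}
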
boolean_t
buffer_cache_gc(int all)
{
lck_mtx_unlock(buf_mtxp);
+ fs_buffer_cache_gc_dispatch_callouts(all);
+
return did_large_zfree;
}