/*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <kern/thread.h>
#include <sys/fslog.h> /* fslog_io_error() */
+#include <sys/disk.h> /* dk_error_description_t */
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
-__private_extern__ int bdwrite_internal(buf_t, int);
+int bdwrite_internal(buf_t, int);
/* zone allocated buffer headers */
static void bufzoneinit(void);
static lck_grp_attr_t *buf_mtx_grp_attr;
static lck_mtx_t *iobuffer_mtxp;
static lck_mtx_t *buf_mtxp;
+static lck_mtx_t *buf_gc_callout;
static int buf_busycount;
+#define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16 /* number of slots in fs_callouts[] */
+typedef struct { /* one registered buffer-cache GC callout */
+ void (* callout)(int, void *); /* invoked as callout(all, context); NULL marks a free slot */
+ void *context; /* caller-supplied pointer handed back as the callout's second argument */
+} fs_buffer_cache_gc_callout_t;
+
+fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} }; /* registration table; the register/unregister/dispatch routines access it with buf_gc_callout held */
+
static __inline__ int
buf_timestamp(void)
{
cpx_t cpx = bufattr_cpx(buf_attr(bp));
if (cpx) {
/* No need to go here for older EAs */
- if(cpx_use_offset_for_iv(cpx)) {
+ if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
off_t f_offset;
if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
return error;
/*
* Attach the file offset to this buffer. The
* bufattr attributes will be passed down the stack
- * until they reach IOFlashStorage. IOFlashStorage
+ * until they reach the storage driver (whether
+ * IOFlashStorage, ASP, or IONVMe). The driver
* will retain the offset in a local variable when it
* issues its I/Os to the NAND controller.
*
* case, LwVM will update this field when it dispatches
* each I/O to IOFlashStorage. But from our perspective
* we have only issued a single I/O.
+ *
+ * In the case of APFS we do not bounce through another
+ * intermediate layer (such as CoreStorage). APFS will
+ * issue the I/Os directly to the block device / IOMedia
+ * via buf_strategy on the specfs node.
*/
buf_setcpoff(bp, f_offset);
CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
buf_t
buf_alloc(vnode_t vp)
{
- return(alloc_io_buf(vp, 0));
+ return(alloc_io_buf(vp, is_vm_privileged())); /* priv is no longer a fixed 0; a non-zero value bypasses the !priv limits in alloc_io_buf() */
}
void
*/
buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+ buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
if (iobuffer_mtxp == NULL)
panic("couldn't create iobuffer mutex");
if (buf_mtxp == NULL)
panic("couldn't create buf mutex");
+ if (buf_gc_callout == NULL)
+ panic("couldn't create buf_gc_callout mutex");
+
/*
* allocate and initialize cluster specific global locks...
*/
*/
#define MINMETA 512
-#define MAXMETA 8192
+#define MAXMETA 16384
struct meta_zone_entry {
zone_t mz_zone;
{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+ {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
{NULL, 0, 0, "" } /* End */
};
}
/* Release the buffer. */
- // XXXdbg - only if the unused bit is set
- if (!ISSET(bp->b_flags, B_NORELSE)) {
- buf_brelse(bp);
- } else {
- CLR(bp->b_flags, B_NORELSE);
- }
+ buf_brelse(bp);
return (rv);
} else {
* buffers faster than the disks can service. Doing a buf_bawrite() in
* cases where we have "too many" outstanding buf_bdwrite()s avoids that.
*/
-__private_extern__ int
+int
bdwrite_internal(buf_t bp, int return_error)
{
proc_t p = current_proc();
return (NULL);
goto start;
/*NOTREACHED*/
- break;
default:
/*
break;
}
} else {
+ int clear_bdone;
+
/*
* buffer in core and not busy
*/
if ( (bp->b_upl) )
panic("buffer has UPL, but not marked BUSY: %p", bp);
- if ( !ret_only_valid && bp->b_bufsize != size)
- allocbuf(bp, size);
+ clear_bdone = FALSE;
+ if (!ret_only_valid) {
+ /*
+ * If the number of bytes that are valid is going
+ * to increase (even if we end up not doing a
+ * reallocation through allocbuf) we have to read
+ * the new size first.
+ *
+ * This is required in cases where we are doing a
+ * read-modify-write of already valid data on disk, but
+ * in cases where the data on disk beyond (blkno + b_bcount)
+ * is invalid, we may end up doing extra I/O.
+ */
+ if (operation == BLK_META && bp->b_bcount < size) {
+ /*
+ * Since we are going to read in the whole size,
+ * we first have to ensure that any pending delayed write
+ * is flushed to disk.
+ */
+ if (ISSET(bp->b_flags, B_DELWRI)) {
+ CLR(bp->b_flags, B_CACHE);
+ buf_bwrite(bp);
+ goto start;
+ }
+ /*
+ * clear B_DONE before returning from
+ * this function so that the caller
+ * can issue a read for the new size.
+ */
+ clear_bdone = TRUE;
+ }
+
+ if (bp->b_bufsize != size)
+ allocbuf(bp, size);
+ }
upl_flags = 0;
switch (operation) {
/*NOTREACHED*/
break;
}
+
+ if (clear_bdone)
+ CLR(bp->b_flags, B_DONE);
}
} else { /* not incore() */
int queue = BQ_EMPTY; /* Start with no preference */
mp = NULL;
}
+ if (ISSET(bp->b_flags, B_ERROR)) {
+ if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
+ dk_error_description_t desc;
+ bzero(&desc, sizeof(desc));
+ desc.description = panic_disk_error_description;
+ desc.description_size = panic_disk_error_description_size;
+ VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
+ }
+ }
+
if (mp && (bp->b_flags & B_READ) == 0) {
update_last_io_time(mp);
INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
}
+ throttle_info_end_io(bp);
+
if (kdebug_enable) {
int code = DKIO_DONE;
int io_tier = GET_BUFATTR_IO_TIER(bap);
if (bap->ba_flags & BA_NOCACHE)
code |= DKIO_NOCACHE;
+ if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
+ code |= DKIO_TIER_UPGRADE;
+ }
+
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
}
* indicators
*/
CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
- CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
+ CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
SET_BUFATTR_IO_TIER(bap, 0);
#define NRESERVEDIOBUFS 128
+#define MNT_VIRTUALDEV_MAX_IOBUFS 16
+#define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
buf_t
alloc_io_buf(vnode_t vp, int priv)
{
buf_t bp;
+ mount_t mp = NULL;
+ int alloc_for_virtualdev = FALSE;
lck_mtx_lock_spin(iobuffer_mtxp);
+ /*
+ * We subject iobuf requests for diskimages to additional restrictions.
+ *
+ * a) A single diskimage mount cannot use up more than
+ * MNT_VIRTUALDEV_MAX_IOBUFS. However, VM-privileged (pageout) requests
+ * are not subject to this restriction.
+ * b) The iobuf headers used for diskimages across all mount
+ * points cannot exceed VIRTUALDEV_MAX_IOBUFS.
+ */
+ if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
+ mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
+ alloc_for_virtualdev = TRUE;
+ while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
+ bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
+ bufstats.bufs_iobufsleeps++;
+
+ need_iobuffer = 1;
+ (void)msleep(&need_iobuffer, iobuffer_mtxp,
+ PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)",
+ NULL);
+ }
+ }
+
while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
(bp = iobufqueue.tqh_first) == NULL) {
bufstats.bufs_iobufsleeps++;
need_iobuffer = 1;
- (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
+ (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1),
+ (const char *)"alloc_io_buf (2)", NULL);
}
TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
+ if (alloc_for_virtualdev) {
+ mp->mnt_iobufinuse++;
+ bufstats.bufs_iobufinuse_vdev++;
+ }
+
lck_mtx_unlock(iobuffer_mtxp);
/*
bp->b_datap = 0;
bp->b_flags = 0;
bp->b_lflags = BL_BUSY | BL_IOBUF;
+ if (alloc_for_virtualdev)
+ bp->b_lflags |= BL_IOBUF_VDEV;
bp->b_redundancy_flags = 0;
bp->b_blkno = bp->b_lblkno = 0;
#ifdef JOE_DEBUG
void
free_io_buf(buf_t bp)
{
- int need_wakeup = 0;
+ int need_wakeup = 0;
+ int free_for_virtualdev = FALSE;
+ mount_t mp = NULL;
+
+ /* Was this iobuf for a diskimage ? */
+ if (bp->b_lflags & BL_IOBUF_VDEV) {
+ free_for_virtualdev = TRUE;
+ if (bp->b_vp)
+ mp = bp->b_vp->v_mount;
+ }
/*
* put buffer back on the head of the iobufqueue
bufstats.bufs_iobufinuse--;
+ if (free_for_virtualdev) {
+ bufstats.bufs_iobufinuse_vdev--;
+ if (mp && mp != dead_mountp)
+ mp->mnt_iobufinuse--;
+ }
+
lck_mtx_unlock(iobuffer_mtxp);
if (need_wakeup)
typedef int (*bcleanbufcontinuation)(int);
+__attribute__((noreturn))
static void
bcleanbuf_thread(void)
{
return(0);
}
+int
+fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
+{ /*
+ * Store (callout, context) in the first fs_callouts[] slot whose
+ * callout is NULL.
+ *
+ * Returns 0 once the pair has been stored, or ENOMEM when none of the
+ * FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE slots is free.
+ *
+ * NOTE(review): there is no check for a NULL callout or for a pair
+ * that is already registered -- confirm callers never rely on one.
+ */
+ lck_mtx_lock(buf_gc_callout); /* the table is scanned and updated with this mutex held */
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == NULL) { /* free slot: claim it */
+ fs_callouts[i].callout = callout;
+ fs_callouts[i].context = context;
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+ }
+ }
+
+ lck_mtx_unlock(buf_gc_callout); /* table full: drop the mutex before reporting failure */
+ return ENOMEM;
+}
+
+int
+fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
+{ /*
+ * Clear every fs_callouts[] slot whose callout AND context both
+ * match the arguments.
+ *
+ * Always returns 0, including when no slot matched.
+ */
+ lck_mtx_lock(buf_gc_callout); /* same mutex the register and dispatch routines take */
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == callout &&
+ fs_callouts[i].context == context) {
+ fs_callouts[i].callout = NULL; /* NULL callout makes the slot reusable by register */
+ fs_callouts[i].context = NULL;
+ } /* no break: the scan continues, so duplicate registrations are cleared too */
+ }
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+}
+
+static void
+fs_buffer_cache_gc_dispatch_callouts(int all)
+{ /*
+ * Call every registered callout as callout(all, context), in slot
+ * order, skipping slots whose callout is NULL.
+ *
+ * NOTE(review): the callouts run with buf_gc_callout held; a callout
+ * that calls back into fs_buffer_cache_gc_register/unregister would
+ * try to take the same mutex again -- confirm that cannot happen.
+ */
+ lck_mtx_lock(buf_gc_callout);
+ for(int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout != NULL) { /* slot in use */
+ fs_callouts[i].callout(all, fs_callouts[i].context);
+ }
+ }
+ lck_mtx_unlock(buf_gc_callout);
+}
+
boolean_t
buffer_cache_gc(int all)
{
lck_mtx_unlock(buf_mtxp);
+ fs_buffer_cache_gc_dispatch_callouts(all);
+
return did_large_zfree;
}