X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/3e170ce000f1506b7b5d2c5c7faec85ceabb573d..527f99514973766e9c0382a4d8550dfb00f54939:/bsd/vfs/vfs_bio.c

diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c
index 9c4b20a0f..c1019a327 100644
--- a/bsd/vfs/vfs_bio.c
+++ b/bsd/vfs/vfs_bio.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -95,6 +95,7 @@
 #include <kern/thread.h>
 
 #include <sys/fslog.h>		/* fslog_io_error() */
+#include <sys/disk.h>		/* dk_error_description_t */
 
 #include <mach/mach_types.h>
 #include <mach/memory_object_types.h>
@@ -129,7 +130,9 @@
 static buf_t	buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
 				uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
 
-__private_extern__ int  bdwrite_internal(buf_t, int);
+int  bdwrite_internal(buf_t, int);
+
+extern void disk_conditioner_delay(buf_t, int, int, uint64_t);
 
 /* zone allocated buffer headers */
 static void	bufzoneinit(void);
@@ -170,9 +173,18 @@ static lck_attr_t	*buf_mtx_attr;
 static lck_grp_attr_t   *buf_mtx_grp_attr;
 static lck_mtx_t	*iobuffer_mtxp;
 static lck_mtx_t	*buf_mtxp;
+static lck_mtx_t	*buf_gc_callout;
 
 static int buf_busycount;
 
+#define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
+typedef struct {
+	void (* callout)(int, void *);
+	void *context;
+} fs_buffer_cache_gc_callout_t;
+
+fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
+
 static __inline__ int
 buf_timestamp(void)
 {
@@ -481,10 +493,16 @@ bufattr_markmeta(bufattr_t bap) {
 }
 
 int
+#if !CONFIG_EMBEDDED
 bufattr_delayidlesleep(bufattr_t bap)
+#else /* !CONFIG_EMBEDDED */
+bufattr_delayidlesleep(__unused bufattr_t bap)
+#endif /* !CONFIG_EMBEDDED */
 {
+#if !CONFIG_EMBEDDED
 	if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
 		return 1;
+#endif /* !CONFIG_EMBEDDED */
 	return 0;
 }
 
@@ -1320,7 +1338,7 @@ buf_strategy(vnode_t devvp, void *ap)
 		cpx_t cpx = bufattr_cpx(buf_attr(bp));
 		if (cpx) {
 			/* No need to go here for older EAs */
-			if(cpx_use_offset_for_iv(cpx)) {
+			if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
 				off_t f_offset;
 				if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
 					return error;
@@ -1328,7 +1346,8 @@ buf_strategy(vnode_t devvp, void *ap)
 				/*
 				 * Attach the file offset to this buffer. The
 				 * bufattr attributes will be passed down the stack
-				 * until they reach IOFlashStorage.  IOFlashStorage
+				 * until they reach the storage driver (whether
+				 * IOFlashStorage, ASP, or IONVMe). The driver
 				 * will retain the offset in a local variable when it
 				 * issues its I/Os to the NAND controller.
 				 *
@@ -1337,6 +1356,11 @@ buf_strategy(vnode_t devvp, void *ap)
 				 * case, LwVM will update this field when it dispatches
 				 * each I/O to IOFlashStorage.  But from our perspective
 				 * we have only issued a single I/O.
+				 *
+				 * In the case of APFS we do not bounce through another
+				 * intermediate layer (such as CoreStorage). APFS will
+				 * issue the I/Os directly to the block device / IOMedia
+				 * via buf_strategy on the specfs node.
 				 */
 				buf_setcpoff(bp, f_offset);
 				CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
@@ -1362,7 +1386,7 @@
 buf_t
 buf_alloc(vnode_t vp)
 {
-	return(alloc_io_buf(vp, 0));
+	return(alloc_io_buf(vp, is_vm_privileged()));
 }
 
 void
@@ -1990,6 +2014,7 @@ bufinit(void)
 	 */
 	buf_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
 	iobuffer_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+	buf_gc_callout	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
 
 	if (iobuffer_mtxp == NULL)
 		panic("couldn't create iobuffer mutex");
@@ -1997,6 +2022,9 @@ bufinit(void)
 	if (buf_mtxp == NULL)
 		panic("couldn't create buf mutex");
 
+	if (buf_gc_callout == NULL)
+		panic("couldn't create buf_gc_callout mutex");
+
 	/*
 	 * allocate and initialize cluster specific global locks...
 	 */
@@ -2023,7 +2051,7 @@ bufinit(void)
  */
 
 #define MINMETA 512
-#define MAXMETA 8192
+#define MAXMETA 16384
 
 struct meta_zone_entry {
 	zone_t mz_zone;
@@ -2038,6 +2066,7 @@ struct meta_zone_entry meta_zones[] = {
 	{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
 	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
 	{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+	{NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
 	{NULL, 0, 0, "" } /* End */
 };
 
@@ -2276,12 +2305,7 @@ buf_bwrite(buf_t bp)
 		}
 
 		/* Release the buffer. */
-		// XXXdbg - only if the unused bit is set
-		if (!ISSET(bp->b_flags, B_NORELSE)) {
-		    buf_brelse(bp);
-		} else {
-		    CLR(bp->b_flags, B_NORELSE);
-		}
+		buf_brelse(bp);
 
 		return (rv);
 	} else {
@@ -2314,7 +2338,7 @@ vn_bwrite(struct vnop_bwrite_args *ap)
  * buffers faster than the disks can service.   Doing a buf_bawrite() in
 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
 */
-__private_extern__ int
+int
 bdwrite_internal(buf_t bp, int return_error)
 {
 	proc_t	p = current_proc();
@@ -2613,12 +2637,13 @@ buf_brelse(buf_t bp)
 
 		if (upl == NULL) {
 			if ( !ISSET(bp->b_flags, B_INVAL)) {
-				kret = ubc_create_upl(bp->b_vp,
+				kret = ubc_create_upl_kernel(bp->b_vp,
 						      ubc_blktooff(bp->b_vp, bp->b_lblkno),
 						      bp->b_bufsize,
 						      &upl,
 						      NULL,
-						      UPL_PRECIOUS);
+						      UPL_PRECIOUS,
+						      VM_KERN_MEMORY_FILE);
 
 				if (kret != KERN_SUCCESS)
 					panic("brelse: Failed to create UPL");
@@ -2940,7 +2965,6 @@ start:
 			return (NULL);
 		goto start;
 		/*NOTREACHED*/
-		break;
 
 	default:
 		/*
@@ -2951,6 +2975,8 @@ start:
 			break;
 		}
 	} else {
+		int clear_bdone;
+
 		/*
 		 * buffer in core and not busy
 		 */
@@ -2969,8 +2995,41 @@ start:
 		if ( (bp->b_upl) )
 			panic("buffer has UPL, but not marked BUSY: %p", bp);
 
-		if ( !ret_only_valid && bp->b_bufsize != size)
-			allocbuf(bp, size);
+		clear_bdone = FALSE;
+		if (!ret_only_valid) {
+			/*
+			 * If the number of bytes that are valid is going
+			 * to increase (even if we end up not doing a
+			 * reallocation through allocbuf) we have to read
+			 * the new size first.
+			 *
+			 * This is required in cases where we are doing a read
+			 * modify write of already valid data on disk, but
+			 * in cases where the data on disk beyond (blkno + b_bcount)
+			 * is invalid, we may end up doing extra I/O.
+			 */
+			if (operation == BLK_META && bp->b_bcount < size) {
+				/*
+				 * Since we are going to read in the whole size first
+				 * we first have to ensure that any pending delayed write
+				 * is flushed to disk first.
+				 */
+				if (ISSET(bp->b_flags, B_DELWRI)) {
+					CLR(bp->b_flags, B_CACHE);
+					buf_bwrite(bp);
+					goto start;
+				}
+				/*
+				 * clear B_DONE before returning from
+				 * this function so that the caller can
+				 * issue a read for the new size.
+				 */
+				clear_bdone = TRUE;
+			}
+
+			if (bp->b_bufsize != size)
+				allocbuf(bp, size);
+		}
 
 		upl_flags = 0;
 		switch (operation) {
@@ -2984,12 +3043,13 @@ start:
 		case BLK_READ:
 			upl_flags |= UPL_PRECIOUS;
 			if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
-				kret = ubc_create_upl(vp,
+				kret = ubc_create_upl_kernel(vp,
 						      ubc_blktooff(vp, bp->b_lblkno),
 						      bp->b_bufsize,
 						      &upl,
 						      &pl,
-						      upl_flags);
+						      upl_flags,
+						      VM_KERN_MEMORY_FILE);
 				if (kret != KERN_SUCCESS)
 					panic("Failed to create UPL");
 
@@ -3022,6 +3082,9 @@ start:
 			/*NOTREACHED*/
 			break;
 		}
+
+		if (clear_bdone)
+			CLR(bp->b_flags, B_DONE);
 	}
 } else {	/* not incore() */
 	int queue = BQ_EMPTY;	/* Start with no preference */
@@ -3130,12 +3193,13 @@ start:
 			f_offset = ubc_blktooff(vp, blkno);
 
 			upl_flags |= UPL_PRECIOUS;
-			kret = ubc_create_upl(vp,
+			kret = ubc_create_upl_kernel(vp,
 					      f_offset,
 					      bp->b_bufsize,
 					      &upl,
 					      &pl,
-					      upl_flags);
+					      upl_flags,
+					      VM_KERN_MEMORY_FILE);
 
 			if (kret != KERN_SUCCESS)
 				panic("Failed to create UPL");
@@ -3915,6 +3979,8 @@ buf_biodone(buf_t bp)
 {
 	mount_t mp;
 	struct bufattr *bap;
+	struct timeval real_elapsed;
+	uint64_t real_elapsed_usec = 0;
 
 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
 		     bp, bp->b_datap, bp->b_flags, 0, 0);
@@ -3930,6 +3996,16 @@ buf_biodone(buf_t bp)
 		mp = NULL;
 	}
 
+	if (ISSET(bp->b_flags, B_ERROR)) {
+		if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
+			dk_error_description_t desc;
+			bzero(&desc, sizeof(desc));
+			desc.description      = panic_disk_error_description;
+			desc.description_size = panic_disk_error_description_size;
+			VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
+		}
+	}
+
 	if (mp && (bp->b_flags & B_READ) == 0) {
 		update_last_io_time(mp);
 		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
@@ -3937,6 +4013,8 @@ buf_biodone(buf_t bp)
 		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
 	}
 
+	throttle_info_end_io(bp);
+
 	if (kdebug_enable) {
 		int code    = DKIO_DONE;
 		int io_tier = GET_BUFATTR_IO_TIER(bap);
@@ -3962,10 +4040,19 @@ buf_biodone(buf_t bp)
 		if (bap->ba_flags & BA_NOCACHE)
 			code |= DKIO_NOCACHE;
 
+		if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
+			code |= DKIO_TIER_UPGRADE;
+		}
+
 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
 			buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid,
 			bp->b_error, 0);
 	}
 
+	microuptime(&real_elapsed);
+	timevalsub(&real_elapsed, &bp->b_timestamp_tv);
+	real_elapsed_usec = real_elapsed.tv_sec * USEC_PER_SEC + real_elapsed.tv_usec;
+	disk_conditioner_delay(bp, 1, bp->b_bcount, real_elapsed_usec);
+
 	/*
 	 * I/O was done, so don't believe
 	 * the DIRTY state from VM anymore...
@@ -3973,7 +4060,7 @@ buf_biodone(buf_t bp)
 	 * indicators
 	 */
 	CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
-	CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
+	CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
 
 	SET_BUFATTR_IO_TIER(bap, 0);
 
@@ -4133,20 +4220,48 @@ vfs_bufstats()
 
 #define NRESERVEDIOBUFS	128
 
+#define MNT_VIRTUALDEV_MAX_IOBUFS 16
+#define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
 
 buf_t
 alloc_io_buf(vnode_t vp, int priv)
 {
 	buf_t	bp;
+	mount_t	mp = NULL;
+	int	alloc_for_virtualdev = FALSE;
 
 	lck_mtx_lock_spin(iobuffer_mtxp);
 
+	/*
+	 * We subject iobuf requests for diskimages to additional
+	 * restrictions.
+	 *
+	 * a) A single diskimage mount cannot use up more than
+	 * MNT_VIRTUALDEV_MAX_IOBUFS. However, vm privileged (pageout) requests
+	 * are not subject to this restriction.
+	 * b) iobuf headers used by all diskimage mounts combined
+	 * cannot exceed VIRTUALDEV_MAX_IOBUFS.
+	 */
+	if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
+	    mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
+		alloc_for_virtualdev = TRUE;
+		while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
+		    bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
+			bufstats.bufs_iobufsleeps++;
+
+			need_iobuffer = 1;
+			(void)msleep(&need_iobuffer, iobuffer_mtxp,
+			    PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)",
+			    NULL);
+		}
+	}
+
 	while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
 	       (bp = iobufqueue.tqh_first) == NULL) {
 		bufstats.bufs_iobufsleeps++;
 
 		need_iobuffer = 1;
-		(void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
+		(void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1),
+		    (const char *)"alloc_io_buf (2)", NULL);
 	}
 	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
@@ -4154,6 +4269,11 @@ alloc_io_buf(vnode_t vp, int priv)
 	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
 		bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
 
+	if (alloc_for_virtualdev) {
+		mp->mnt_iobufinuse++;
+		bufstats.bufs_iobufinuse_vdev++;
+	}
+
 	lck_mtx_unlock(iobuffer_mtxp);
 
 	/*
@@ -4168,6 +4288,8 @@ alloc_io_buf(vnode_t vp, int priv)
 	bp->b_datap = 0;
 	bp->b_flags = 0;
 	bp->b_lflags = BL_BUSY | BL_IOBUF;
+	if (alloc_for_virtualdev)
+		bp->b_lflags |= BL_IOBUF_VDEV;
 	bp->b_redundancy_flags = 0;
 	bp->b_blkno = bp->b_lblkno = 0;
 #ifdef JOE_DEBUG
@@ -4196,7 +4318,16 @@ alloc_io_buf(vnode_t vp, int priv)
 void
 free_io_buf(buf_t bp)
 {
-	int	need_wakeup = 0;
+	int	need_wakeup = 0;
+	int	free_for_virtualdev = FALSE;
+	mount_t	mp = NULL;
+
+	/* Was this iobuf for a diskimage ? */
+	if (bp->b_lflags & BL_IOBUF_VDEV) {
+		free_for_virtualdev = TRUE;
+		if (bp->b_vp)
+			mp = bp->b_vp->v_mount;
+	}
 
 	/*
 	 * put buffer back on the head of the iobufqueue
@@ -4229,6 +4360,12 @@ free_io_buf(buf_t bp)
 
 	bufstats.bufs_iobufinuse--;
 
+	if (free_for_virtualdev) {
+		bufstats.bufs_iobufinuse_vdev--;
+		if (mp && mp != dead_mountp)
+			mp->mnt_iobufinuse--;
+	}
+
 	lck_mtx_unlock(iobuffer_mtxp);
 
 	if (need_wakeup)
@@ -4267,6 +4404,7 @@ bcleanbuf_thread_init(void)
 
 typedef int (*bcleanbufcontinuation)(int);
 
+__attribute__((noreturn))
 static void
 bcleanbuf_thread(void)
 {
@@ -4364,12 +4502,13 @@ brecover_data(buf_t bp)
 		upl_flags |= UPL_WILL_MODIFY;
 	}
 
-	kret = ubc_create_upl(vp,
+	kret = ubc_create_upl_kernel(vp,
 			      ubc_blktooff(vp, bp->b_lblkno),
 			      bp->b_bufsize,
 			      &upl,
 			      &pl,
-			      upl_flags);
+			      upl_flags,
+			      VM_KERN_MEMORY_FILE);
 
 	if (kret != KERN_SUCCESS)
 		panic("Failed to create UPL");
@@ -4396,6 +4535,50 @@ dump_buffer:
 	return(0);
 }
 
+int
+fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
+{
+	lck_mtx_lock(buf_gc_callout);
+	for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+		if (fs_callouts[i].callout == NULL) {
+			fs_callouts[i].callout = callout;
+			fs_callouts[i].context = context;
+			lck_mtx_unlock(buf_gc_callout);
+			return 0;
+		}
+	}
+
+	lck_mtx_unlock(buf_gc_callout);
+	return ENOMEM;
+}
+
+int
+fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
+{
+	lck_mtx_lock(buf_gc_callout);
+	for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+		if (fs_callouts[i].callout == callout &&
+		    fs_callouts[i].context == context) {
+			fs_callouts[i].callout = NULL;
+			fs_callouts[i].context = NULL;
+		}
+	}
+	lck_mtx_unlock(buf_gc_callout);
+	return 0;
+}
+
+static void
+fs_buffer_cache_gc_dispatch_callouts(int all)
+{
+	lck_mtx_lock(buf_gc_callout);
+	for(int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+		if (fs_callouts[i].callout != NULL) {
+			fs_callouts[i].callout(all, fs_callouts[i].context);
+		}
+	}
+	lck_mtx_unlock(buf_gc_callout);
+}
+
 boolean_t
 buffer_cache_gc(int all)
 {
@@ -4525,6 +4708,8 @@ buffer_cache_gc(int all)
 
 	lck_mtx_unlock(buf_mtxp);
 
+	fs_buffer_cache_gc_dispatch_callouts(all);
+
 	return did_large_zfree;
 }
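
---

Notes on the interfaces touched above (illustrative sketches, not part of the patch):

1) buf_getblk() and B_DONE. The buf_getblk() hunks are visible to callers that grow a cached metadata buffer: when an incore BLK_META buffer is returned with b_bcount smaller than the requested size, any pending delayed write is flushed first and B_DONE is cleared, so the caller knows the contents must be (re)read at the new size. A minimal caller-side sketch, assuming a hypothetical myfs_read_meta() helper (buf_getblk, buf_flags, buf_setflags, VNOP_STRATEGY, buf_biowait and buf_brelse are the standard buf KPI; everything named myfs_* is invented for illustration):

/*
 * Hypothetical caller that grows a cached metadata block. With the
 * change above, buf_getblk() clears B_DONE when the incore buffer's
 * valid length was smaller than "size", so the B_DONE test below
 * forces a full-size read instead of exposing stale bytes.
 */
static int
myfs_read_meta(vnode_t vp, daddr64_t blkno, int size, buf_t *bpp)
{
	buf_t	bp;
	int	error;

	bp = buf_getblk(vp, blkno, size, 0, 0, BLK_META);

	if (buf_flags(bp) & B_DONE) {
		*bpp = bp;	/* already fully valid in the cache */
		return (0);
	}

	/* Cache miss, or the buffer grew: read the whole new size. */
	buf_setflags(bp, B_READ);
	VNOP_STRATEGY(bp);
	error = buf_biowait(bp);
	if (error) {
		buf_brelse(bp);
		return (error);
	}
	*bpp = bp;
	return (0);
}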
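
2) buf_biodone() latency accounting. Completion now measures the wall-clock duration of each I/O and hands it to disk_conditioner_delay(). The arithmetic is plain timeval subtraction; a standalone sketch of the same computation (microuptime(), timevalsub() and USEC_PER_SEC are the kernel facilities the hunk itself uses; io_elapsed_usec() is a name invented here):

/*
 * Sketch of the elapsed-time computation in buf_biodone(): subtract
 * the buffer's issue timestamp (b_timestamp_tv, recorded when the I/O
 * was queued) from the completion time, then convert the difference
 * to microseconds for disk_conditioner_delay().
 */
static uint64_t
io_elapsed_usec(struct timeval issued)
{
	struct timeval now;

	microuptime(&now);		/* monotonic completion time */
	timevalsub(&now, &issued);	/* now -= issued */
	return (now.tv_sec * USEC_PER_SEC + now.tv_usec);
}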
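
3) iobuf limits for disk images. alloc_io_buf() now throttles virtual-device (disk-image) mounts at two levels: a single mount may hold at most MNT_VIRTUALDEV_MAX_IOBUFS (16) iobufs unless the request is VM-privileged, and all such mounts together may hold at most VIRTUALDEV_MAX_IOBUFS, i.e. 40% of the iobuf pool. For example, on a hypothetical configuration with niobuf_headers = 2048, the global cap works out to (40 * 2048) / 100 = 819 iobufs. Requests over either limit msleep() on need_iobuffer until free_io_buf() drops the per-mount and global counts and wakes the sleepers.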
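
4) Buffer-cache GC callouts. buffer_cache_gc() now invokes every registered callout after trimming the cache, and fs_buffer_cache_gc_register() returns ENOMEM once all FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE (16) slots are taken. A minimal registration sketch, assuming a hypothetical myfs client (only the register/unregister functions and the (int all, void *context) callout signature come from the diff):

/* Invoked from buffer_cache_gc(); "all" is non-zero for a full purge. */
static void
myfs_bc_gc_callout(int all, void *context)
{
	struct myfs_mount *mntp = context;	/* hypothetical per-mount state */

	myfs_trim_private_caches(mntp, all);	/* hypothetical trimming routine */
}

/* At mount time: */
if (fs_buffer_cache_gc_register(myfs_bc_gc_callout, mntp) != 0) {
	/* ENOMEM: all 16 callout slots are in use. */
}

/* At unmount time, before mntp is freed: */
(void)fs_buffer_cache_gc_unregister(myfs_bc_gc_callout, mntp);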