/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <kern/thread.h>
#include <sys/fslog.h> /* fslog_io_error() */
+#include <sys/disk.h> /* dk_error_description_t */
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <sys/ubc_internal.h>
#include <sys/sdt.h>
-#include <sys/cprotect.h>
int bcleanbuf(buf_t bp, boolean_t discard);
static int brecover_data(buf_t bp);
uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
-__private_extern__ int bdwrite_internal(buf_t, int);
+int bdwrite_internal(buf_t, int);
/* zone allocated buffer headers */
static void bufzoneinit(void);
static lck_grp_attr_t *buf_mtx_grp_attr;
static lck_mtx_t *iobuffer_mtxp;
static lck_mtx_t *buf_mtxp;
+static lck_mtx_t *buf_gc_callout;
static int buf_busycount;
+#define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
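+
+/*
+ * File systems may register a callout to be notified when the buffer
+ * cache is garbage collected; up to FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE
+ * registrations are kept in the table below, protected by the
+ * buf_gc_callout mutex (see fs_buffer_cache_gc_register() and
+ * buffer_cache_gc() below).
+ */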
+typedef struct {
+ void (* callout)(int, void *);
+ void *context;
+} fs_buffer_cache_gc_callout_t;
+
+fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
+
static __inline__ int
buf_timestamp(void)
{
}
#if CONFIG_PROTECT
-void
-buf_setcpaddr(buf_t bp, struct cprotect *entry) {
- bp->b_attr.ba_cpentry = entry;
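+/*
+ * Content-protection state now travels in the bufattr as an opaque cpx_t
+ * (replacing the old struct cprotect * carried via bufattr_cpaddr /
+ * bufattr_setcpaddr).
+ */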
+cpx_t bufattr_cpx(bufattr_t bap)
+{
+ return bap->ba_cpx;
+}
+
+void bufattr_setcpx(bufattr_t bap, cpx_t cpx)
+{
+ bap->ba_cpx = cpx;
}
void
bp->b_attr.ba_cp_file_off = foffset;
}
-void *
-bufattr_cpaddr(bufattr_t bap) {
- return (bap->ba_cpentry);
-}
-
uint64_t
bufattr_cpoff(bufattr_t bap) {
- return (bap->ba_cp_file_off);
-}
-
-void
-bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
- bap->ba_cpentry = cp_entry_addr;
+ return bap->ba_cp_file_off;
}
void
bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
- bap->ba_cp_file_off = foffset;
+ bap->ba_cp_file_off = foffset;
}
-#else
-void *
-bufattr_cpaddr(bufattr_t bap __unused) {
- return NULL;
-}
+#else // !CONFIG_PROTECT
uint64_t
bufattr_cpoff(bufattr_t bap __unused) {
return 0;
}
-void
-bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
-}
-
void
bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
return;
}
-#endif /* CONFIG_PROTECT */
+
+struct cpx *bufattr_cpx(__unused bufattr_t bap)
+{
+ return NULL;
+}
+
+void bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
+{
+}
+
+#endif /* !CONFIG_PROTECT */
bufattr_t
bufattr_alloc() {
errno_t
buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
{
+ assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));
+
if (callback)
bp->b_flags |= (B_CALL | B_ASYNC);
else
buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
void (**old_iodone)(buf_t, void *), void **old_transaction)
{
+ assert(ISSET(bp->b_lflags, BL_BUSY));
+
if (old_iodone)
*old_iodone = bp->b_iodone;
if (old_transaction)
#if CONFIG_PROTECT
/* Capture f_offset in the bufattr */
- if (bp->b_attr.ba_cpentry != 0) {
+ cpx_t cpx = bufattr_cpx(buf_attr(bp));
+ if (cpx) {
/* No need to go here for older EAs */
- if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
+ if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
off_t f_offset;
if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
return error;
/*
* Attach the file offset to this buffer. The
* bufattr attributes will be passed down the stack
- * until they reach IOFlashStorage. IOFlashStorage
+ * until they reach the storage driver (whether
+ * IOFlashStorage, ASP, or IONVMe). The driver
* will retain the offset in a local variable when it
* issues its I/Os to the NAND controller.
*
* case, LwVM will update this field when it dispatches
* each I/O to IOFlashStorage. But from our perspective
* we have only issued a single I/O.
+ *
+ * In the case of APFS we do not bounce through another
+ * intermediate layer (such as CoreStorage). APFS will
+ * issue the I/Os directly to the block device / IOMedia
+ * via buf_strategy on the specfs node.
*/
- bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
+ buf_setcpoff(bp, f_offset);
CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
}
}
buf_t
buf_alloc(vnode_t vp)
{
- return(alloc_io_buf(vp, 0));
+ return(alloc_io_buf(vp, is_vm_privileged()));
}
void
*/
buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+ buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
if (iobuffer_mtxp == NULL)
panic("couldn't create iobuffer mutex");
if (buf_mtxp == NULL)
panic("couldn't create buf mutex");
+ if (buf_gc_callout == NULL)
+ panic("couldn't create buf_gc_callout mutex");
+
/*
* allocate and initialize cluster specific global locks...
*/
*/
#define MINMETA 512
-#define MAXMETA 8192
+#define MAXMETA 16384
struct meta_zone_entry {
zone_t mz_zone;
{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+ {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
{NULL, 0, 0, "" } /* End */
};
}
/* Release the buffer. */
- // XXXdbg - only if the unused bit is set
- if (!ISSET(bp->b_flags, B_NORELSE)) {
- buf_brelse(bp);
- } else {
- CLR(bp->b_flags, B_NORELSE);
- }
+ buf_brelse(bp);
return (rv);
} else {
* buffers faster than the disks can service. Doing a buf_bawrite() in
* cases where we have "too many" outstanding buf_bdwrite()s avoids that.
*/
-__private_extern__ int
+int
bdwrite_internal(buf_t bp, int return_error)
{
proc_t p = current_proc();
lck_mtx_lock_spin(buf_mtxp);
- bp_head = (buf_t)bp->b_orig;
+ __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
if (bp_head->b_whichq != -1)
panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
return (NULL);
goto start;
/*NOTREACHED*/
- break;
default:
/*
break;
}
} else {
+ int clear_bdone;
+
/*
* buffer in core and not busy
*/
if ( (bp->b_upl) )
panic("buffer has UPL, but not marked BUSY: %p", bp);
- if ( !ret_only_valid && bp->b_bufsize != size)
- allocbuf(bp, size);
+ clear_bdone = FALSE;
+ if (!ret_only_valid) {
+ /*
+ * If the number of bytes that are valid is going
+ * to increase (even if we end up not doing a
+ * reallocation through allocbuf) we have to read
+ * the new size first.
+ *
+ * This is required in cases where we are doing a
+ * read-modify-write of already valid data on disk, but
+ * in cases where the data on disk beyond (blkno + b_bcount)
+ * is invalid, we may end up doing extra I/O.
+ */
+ if (operation == BLK_META && bp->b_bcount < size) {
+ /*
+ * Since we are going to read in the whole size,
+ * we first have to ensure that any pending delayed
+ * write is flushed to disk.
+ */
+ if (ISSET(bp->b_flags, B_DELWRI)) {
+ CLR(bp->b_flags, B_CACHE);
+ buf_bwrite(bp);
+ goto start;
+ }
+ /*
+ * clear B_DONE before returning from
+ * this function so that the caller
+ * can issue a read for the new size.
+ */
+ clear_bdone = TRUE;
+ }
+
+ if (bp->b_bufsize != size)
+ allocbuf(bp, size);
+ }
upl_flags = 0;
switch (operation) {
/*NOTREACHED*/
break;
}
+
+ if (clear_bdone)
+ CLR(bp->b_flags, B_DONE);
}
} else { /* not incore() */
int queue = BQ_EMPTY; /* Start with no preference */
size_t contig_bytes;
int bmap_flags;
+#if DEVELOPMENT || DEBUG
+ /*
+ * Apple-implemented file systems use UBC exclusively; they should
+ * not call in here.
+ */
+ const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
+ "exfat", "msdos", "webdav", NULL};
+
+ for (int i = 0; excldfs[i] != NULL; i++) {
+ if (vp->v_mount &&
+ !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
+ excldfs[i])) {
+ panic("%s %s calls buf_getblk",
+ excldfs[i],
+ operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
+ }
+ }
+#endif
+
if ( (bp->b_upl) )
panic("bp already has UPL: %p",bp);
*(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
} else {
bp->b_datap = (uintptr_t)NULL;
- kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
+ kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
CLR(bp->b_flags, B_ZALLOC);
}
bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
if ((vm_size_t)bp->b_bufsize < desired_size) {
/* reallocate to a bigger size */
bp->b_datap = (uintptr_t)NULL;
- kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
+ kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
kmem_free(kernel_map, elem, bp->b_bufsize);
} else {
*(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
SET(bp->b_flags, B_ZALLOC);
} else
- kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
+ kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
}
if (bp->b_datap == 0)
buf_release_credentials(bp);
- bp->b_redundancy_flags = 0;
-
/* If discarding, just move to the empty queue */
if (discard) {
lck_mtx_lock_spin(buf_mtxp);
bp->b_bufsize = 0;
bp->b_datap = (uintptr_t)NULL;
bp->b_upl = (void *)NULL;
+ bp->b_fsprivate = (void *)NULL;
/*
* preserve the state of whether this buffer
* was allocated on the fly or not...
#endif
bp->b_lflags = BL_BUSY;
bp->b_flags = (bp->b_flags & B_HDRALLOC);
+ bp->b_redundancy_flags = 0;
bp->b_dev = NODEV;
bp->b_blkno = bp->b_lblkno = 0;
bp->b_iodone = NULL;
mp = NULL;
}
+ if (ISSET(bp->b_flags, B_ERROR)) {
+ if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
+ dk_error_description_t desc;
+ bzero(&desc, sizeof(desc));
+ desc.description = panic_disk_error_description;
+ desc.description_size = panic_disk_error_description_size;
+ VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
+ }
+ }
+
if (mp && (bp->b_flags & B_READ) == 0) {
update_last_io_time(mp);
INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
}
+ throttle_info_end_io(bp);
+
if (kdebug_enable) {
int code = DKIO_DONE;
int io_tier = GET_BUFATTR_IO_TIER(bap);
if (bap->ba_flags & BA_NOCACHE)
code |= DKIO_NOCACHE;
+ if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
+ code |= DKIO_TIER_UPGRADE;
+ }
+
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
}
* indicators
*/
CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
- CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
+ CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
SET_BUFATTR_IO_TIER(bap, 0);
#define NRESERVEDIOBUFS 128
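+
+/*
+ * Caps on iobuf headers used on behalf of virtual (disk image) devices:
+ * per diskimage mount, and for all diskimage mounts combined (40% of
+ * niobuf_headers). Enforced in alloc_io_buf() below.
+ */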
+#define MNT_VIRTUALDEV_MAX_IOBUFS 16
+#define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
buf_t
alloc_io_buf(vnode_t vp, int priv)
{
buf_t bp;
+ mount_t mp = NULL;
+ int alloc_for_virtualdev = FALSE;
lck_mtx_lock_spin(iobuffer_mtxp);
+ /*
+ * We subject iobuf requests for diskimages to additional restrictions.
+ *
+ * a) A single diskimage mount cannot use up more than
+ * MNT_VIRTUALDEV_MAX_IOBUFS. However, vm privileged (pageout) requests
+ * are not subject to this restriction.
+ * b) iobuf headers used by all diskimage mounts combined cannot
+ * exceed VIRTUALDEV_MAX_IOBUFS.
+ */
+ if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
+ mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
+ alloc_for_virtualdev = TRUE;
+ while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
+ bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
+ bufstats.bufs_iobufsleeps++;
+
+ need_iobuffer = 1;
+ (void)msleep(&need_iobuffer, iobuffer_mtxp,
+ PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)",
+ NULL);
+ }
+ }
+
while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
(bp = iobufqueue.tqh_first) == NULL) {
bufstats.bufs_iobufsleeps++;
need_iobuffer = 1;
- (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
+ (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1),
+ (const char *)"alloc_io_buf (2)", NULL);
}
TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
+ if (alloc_for_virtualdev) {
+ mp->mnt_iobufinuse++;
+ bufstats.bufs_iobufinuse_vdev++;
+ }
+
lck_mtx_unlock(iobuffer_mtxp);
/*
bp->b_datap = 0;
bp->b_flags = 0;
bp->b_lflags = BL_BUSY | BL_IOBUF;
+ if (alloc_for_virtualdev)
+ bp->b_lflags |= BL_IOBUF_VDEV;
bp->b_redundancy_flags = 0;
bp->b_blkno = bp->b_lblkno = 0;
#ifdef JOE_DEBUG
bp->b_bcount = 0;
bp->b_bufsize = 0;
bp->b_upl = NULL;
+ bp->b_fsprivate = (void *)NULL;
bp->b_vp = vp;
bzero(&bp->b_attr, sizeof(struct bufattr));
void
free_io_buf(buf_t bp)
{
- int need_wakeup = 0;
+ int need_wakeup = 0;
+ int free_for_virtualdev = FALSE;
+ mount_t mp = NULL;
+
+ /* Was this iobuf for a diskimage ? */
+ if (bp->b_lflags & BL_IOBUF_VDEV) {
+ free_for_virtualdev = TRUE;
+ if (bp->b_vp)
+ mp = bp->b_vp->v_mount;
+ }
/*
* put buffer back on the head of the iobufqueue
bufstats.bufs_iobufinuse--;
+ if (free_for_virtualdev) {
+ bufstats.bufs_iobufinuse_vdev--;
+ if (mp && mp != dead_mountp)
+ mp->mnt_iobufinuse--;
+ }
+
lck_mtx_unlock(iobuffer_mtxp);
if (need_wakeup)
typedef int (*bcleanbufcontinuation)(int);
+__attribute__((noreturn))
static void
bcleanbuf_thread(void)
{
return(0);
}
+int
+fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == NULL) {
+ fs_callouts[i].callout = callout;
+ fs_callouts[i].context = context;
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+ }
+ }
+
+ lck_mtx_unlock(buf_gc_callout);
+ return ENOMEM;
+}
+
+int
+fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout == callout &&
+ fs_callouts[i].context == context) {
+ fs_callouts[i].callout = NULL;
+ fs_callouts[i].context = NULL;
+ }
+ }
+ lck_mtx_unlock(buf_gc_callout);
+ return 0;
+}
+
+static void
+fs_buffer_cache_gc_dispatch_callouts(int all)
+{
+ lck_mtx_lock(buf_gc_callout);
+ for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
+ if (fs_callouts[i].callout != NULL) {
+ fs_callouts[i].callout(all, fs_callouts[i].context);
+ }
+ }
+ lck_mtx_unlock(buf_gc_callout);
+}
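+
+/*
+ * Usage sketch for the GC callout API (illustrative only; "myfs" and the
+ * helpers below are hypothetical and not part of this change):
+ *
+ *	static void
+ *	myfs_bc_gc_callout(int all, void *context)
+ *	{
+ *		struct myfs_mount *mfsp = context;
+ *
+ *		// drop file-system private caches; 'all' requests a full purge
+ *		myfs_trim_private_caches(mfsp, all);
+ *	}
+ *
+ *	// at mount:   fs_buffer_cache_gc_register(myfs_bc_gc_callout, mfsp);
+ *	// at unmount: fs_buffer_cache_gc_unregister(myfs_bc_gc_callout, mfsp);
+ */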
+
boolean_t
buffer_cache_gc(int all)
{
lck_mtx_unlock(buf_mtxp);
+ fs_buffer_cache_gc_dispatch_callouts(all);
+
return did_large_zfree;
}