xnu-1228.15.4.tar.gz

[apple/xnu.git] / bsd / miscfs / specfs / spec_vnops.c
diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c

index 1c88c6a22ba68ab3853718bd1acccb04e65f3031..6c26b1799734bcec0b423c8747d8e095861e1962 100644 (file)
--- a/bsd/miscfs/specfs/spec_vnops.c
+++ b/bsd/miscfs/specfs/spec_vnops.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
   *
   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   * 
@@ -79,11 +79,17 @@
  #include <sys/malloc.h>
  #include <sys/disk.h>
  #include <sys/uio_internal.h>
+#include <sys/resource.h>
  #include <miscfs/specfs/specdev.h>
  #include <vfs/vfs_support.h>
  
  #include <sys/kdebug.h>
  
+/* XXX following three prototypes should be in a header file somewhere */
+extern int     isdisk(dev_t dev, int type);
+extern dev_t   chrtoblk(dev_t dev);
+extern int     iskmemdev(dev_t dev);
+
  struct vnode *speclisth[SPECHSZ];
  
  /* symbolic sleep message strings for devices */
@@ -148,13 +154,7 @@ static void set_blocksize(vnode_t, dev_t);
   * Trivial lookup routine that always fails.
   */
  int
-spec_lookup(ap)
-       struct vnop_lookup_args /* {
-               struct vnode *a_dvp;
-               struct vnode **a_vpp;
-               struct componentname *a_cnp;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_lookup(struct vnop_lookup_args *ap)
  {
  
         *ap->a_vpp = NULL;
@@ -201,12 +201,7 @@ set_fsblocksize(struct vnode *vp)
   * Open a special file.
   */
  int
-spec_open(ap)
-       struct vnop_open_args /* {
-               struct vnode *a_vp;
-               int  a_mode;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_open(struct vnop_open_args *ap)
  {
         struct proc *p = vfs_context_proc(ap->a_context);
         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
@@ -319,16 +314,10 @@ spec_open(ap)
   * Vnode op for read
   */
  int
-spec_read(ap)
-       struct vnop_read_args /* {
-               struct vnode *a_vp;
-               struct uio *a_uio;
-               int  a_ioflag;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_read(struct vnop_read_args *ap)
  {
-       register struct vnode *vp = ap->a_vp;
-       register struct uio *uio = ap->a_uio;
+       struct vnode *vp = ap->a_vp;
+       struct uio *uio = ap->a_uio;
         struct buf *bp;
         daddr64_t bn, nextbn;
         long bsize, bscale;
@@ -393,7 +382,7 @@ spec_read(ap)
                         // LP64todo - fix this!
                         n = min((unsigned)(n  - on), uio_resid(uio));
  
-                       error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
+                       error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
                         if (n + on == bsize)
                                 buf_markaged(bp);
                         buf_brelse(bp);
@@ -412,23 +401,17 @@ spec_read(ap)
   * Vnode op for write
   */
  int
-spec_write(ap)
-       struct vnop_write_args /* {
-               struct vnode *a_vp;
-               struct uio *a_uio;
-               int  a_ioflag;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_write(struct vnop_write_args *ap)
  {
-       register struct vnode *vp = ap->a_vp;
-       register struct uio *uio = ap->a_uio;
+       struct vnode *vp = ap->a_vp;
+       struct uio *uio = ap->a_uio;
         struct buf *bp;
         daddr64_t bn;
         int bsize, blkmask, bscale;
-       register int io_sync;
-       register int io_size;
+       int io_sync;
+       int io_size;
         int devBlockSize=0;
-       register int n, on;
+       int n, on;
         int error = 0;
         dev_t dev;
  
@@ -505,7 +488,7 @@ spec_write(ap)
                         }
                         n = min(n, bsize - buf_resid(bp));
  
-                       error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
+                       error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
                         if (error) {
                                 buf_brelse(bp);
                                 return (error);
@@ -535,14 +518,7 @@ spec_write(ap)
   * Device ioctl operation.
   */
  int
-spec_ioctl(ap)
-       struct vnop_ioctl_args /* {
-               struct vnode *a_vp;
-               int  a_command;
-               caddr_t  a_data;
-               int  a_fflag;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_ioctl(struct vnop_ioctl_args *ap)
  {
         proc_t p = vfs_context_proc(ap->a_context);
         dev_t dev = ap->a_vp->v_rdev;
@@ -554,7 +530,7 @@ spec_ioctl(ap)
                     ap->a_fflag, p));
  
         case VBLK:
-               if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) {
+               if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) {
                         if (bdevsw[major(dev)].d_type == D_TAPE)
                                 return (0);
                         else
@@ -571,17 +547,10 @@ spec_ioctl(ap)
  }
  
  int
-spec_select(ap)
-       struct vnop_select_args /* {
-               struct vnode *a_vp;
-               int  a_which;
-               int  a_fflags;
-               void * a_wql;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_select(struct vnop_select_args *ap)
  {
         proc_t p = vfs_context_proc(ap->a_context);
-       register dev_t dev;
+       dev_t dev;
  
         switch (ap->a_vp->v_type) {
  
@@ -605,18 +574,13 @@ spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
         /*
          * Flush all dirty buffers associated with a block device.
          */
-       buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, (char *)"spec_fsync");
+       buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync");
  
         return (0);
  }
  
  int
-spec_fsync(ap)
-       struct vnop_fsync_args /* {
-               struct vnode *a_vp;
-               int  a_waitfor;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_fsync(struct vnop_fsync_args *ap)
  {
         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
  }
@@ -625,29 +589,134 @@ spec_fsync(ap)
   * Just call the device strategy routine
   */
  extern int hard_throttle_on_root;
+void IOSleep(int);
+
+// the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond
+#define LOWPRI_INITIAL_WINDOW_MSECS 100
+#define LOWPRI_WINDOW_MSECS_INC        50
+#define LOWPRI_MAX_WINDOW_MSECS 200
+#define LOWPRI_MAX_WAITING_MSECS 200
+#define LOWPRI_SLEEP_INTERVAL 5
+
+struct _throttle_io_info_t {
+       struct timeval  last_normal_IO_timestamp;
+       struct timeval  last_IO_timestamp;
+       SInt32 numthreads_throttling;
+};
  
+struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
+int    lowpri_IO_initial_window_msecs  = LOWPRI_INITIAL_WINDOW_MSECS;
+int    lowpri_IO_window_msecs_inc  = LOWPRI_WINDOW_MSECS_INC;
+int    lowpri_max_window_msecs  = LOWPRI_MAX_WINDOW_MSECS;
+int     lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
  
-#define LOWPRI_DELAY_MSECS     200
-#define LOWPRI_WINDOW_MSECS    200
+SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  
-int    lowpri_IO_window_msecs = LOWPRI_WINDOW_MSECS;
-int    lowpri_IO_delay_msecs  = LOWPRI_DELAY_MSECS;
+void
+throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
+{
+       size_t devbsdunit;
+               
+       devbsdunit = mp->mnt_devbsdunit;
  
-struct timeval last_normal_IO_timestamp;
-struct timeval last_lowpri_IO_timestamp;
-struct timeval lowpri_IO_window = { 0, LOWPRI_WINDOW_MSECS * 1000 };
+       if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+               *tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
+       } else {
+               memset(tv, 0, sizeof(*tv));
+       }
+}
+
+void
+update_last_io_time(mount_t mp)
+{
+       size_t devbsdunit;
+               
+       devbsdunit = mp->mnt_devbsdunit;
+
+       if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+               microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+       }
+}
+
+int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
+{
+       struct timeval elapsed;
+       int elapsed_msecs;
+
+       microuptime(&elapsed);
+       timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+       elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
+
+       if (lowpri_window_msecs == -1) // use the max waiting time
+               lowpri_window_msecs = lowpri_max_waiting_msecs;
+
+       return elapsed_msecs < lowpri_window_msecs;
+}
+
+void throttle_lowpri_io(boolean_t ok_to_sleep)
+{
+       int i;
+       int max_try_num;
+       struct uthread *ut;
+
+       ut = get_bsdthread_info(current_thread());
+
+       if (ut->uu_lowpri_window == 0)
+               return;
+
+       max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
+                    ut->uu_lowpri_window, 0, 0, 0, 0);
+
+       if (ok_to_sleep == TRUE) {
+               for (i=0; i<max_try_num; i++) {
+                       if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) {
+                               IOSleep(LOWPRI_SLEEP_INTERVAL);
+                       } else {
+                               break;
+                       }
+               }
+       }
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
+                    ut->uu_lowpri_window, i*5, 0, 0, 0);
+       SInt32 oldValue;
+       oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+       ut->uu_lowpri_window = 0;
+
+       if (oldValue <= 0) {
+               panic("%s: numthreads negative", __func__);
+       }
+}
+
+int throttle_get_io_policy(struct uthread **ut)
+{
+       int policy = IOPOL_DEFAULT;
+       proc_t p = current_proc();
+
+       *ut = get_bsdthread_info(current_thread());
+               
+       if (p != NULL)
+               policy = p->p_iopol_disk;
+
+       if (*ut != NULL) {
+               // the I/O policy of the thread overrides that of the process
+               // unless the I/O policy of the thread is default
+               if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
+                       policy = (*ut)->uu_iopol_disk;
+       }
+       return policy;
+}
  
  int
-spec_strategy(ap)
-       struct vnop_strategy_args /* {
-               struct buf *a_bp;
-       } */ *ap;
+spec_strategy(struct vnop_strategy_args *ap)
  {
          buf_t  bp;
         int     bflags;
         dev_t   bdev;
-       proc_t  p;
-        struct timeval elapsed;
  
          bp = ap->a_bp;
         bdev = buf_device(bp);
@@ -673,39 +742,87 @@ spec_strategy(ap)
             (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV))
                 hard_throttle_on_root = 1;
  
-       if ( lowpri_IO_delay_msecs && lowpri_IO_window_msecs ) {
-               p = current_proc();
+       if (lowpri_IO_initial_window_msecs) {
+               struct uthread  *ut;
+               int policy;
+               int is_throttleable_io = 0;
+               int is_passive_io = 0;
+               size_t devbsdunit;
+               SInt32 oldValue;
+
+               policy = throttle_get_io_policy(&ut);
+
+               switch (policy) {
+               case IOPOL_DEFAULT:
+               case IOPOL_NORMAL:
+                       break;
+               case IOPOL_THROTTLE:
+                       is_throttleable_io = 1;
+                       break;
+               case IOPOL_PASSIVE:
+                       is_passive_io = 1;
+                       break;
+               default:
+                       printf("unknown I/O policy %d", policy);
+                       break;
+               }
+
+               if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
+                   is_passive_io |= 1;
  
-               if ( (p == NULL) || !(p->p_lflag & P_LLOW_PRI_IO)) {
-                       if (!(p->p_lflag & P_LBACKGROUND_IO))
-                               microuptime(&last_normal_IO_timestamp);
+               if (buf_vnode(bp)->v_mount != NULL)
+                       devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+               else
+                       devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+               if (!is_throttleable_io) {
+                       if (!is_passive_io){
+                               microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+                       }
                 } else {
-                       microuptime(&last_lowpri_IO_timestamp);
-
-                       elapsed = last_lowpri_IO_timestamp;
-                       timevalsub(&elapsed, &last_normal_IO_timestamp);
-
-                       lowpri_IO_window.tv_sec  = lowpri_IO_window_msecs / 1000;
-                       lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000;
-
-                       if (timevalcmp(&elapsed, &lowpri_IO_window, <)) {
-                               struct uthread  *ut;
-
-                               /*
-                                * I'd really like to do the IOSleep here, but
-                                * we may be holding all kinds of filesystem related locks
-                                * and the pages for this I/O marked 'busy'...
-                                * we don't want to cause a normal task to block on
-                                * one of these locks while we're throttling a task marked
-                                * for low priority I/O... we'll mark the uthread and
-                                * do the delay just before we return from the system
-                                * call that triggered this I/O or from vnode_pagein
-                                */
-                               ut = get_bsdthread_info(current_thread());
-                               ut->uu_lowpri_delay = lowpri_IO_delay_msecs;
+                       /*
+                        * I'd really like to do the IOSleep here, but
+                        * we may be holding all kinds of filesystem related locks
+                        * and the pages for this I/O marked 'busy'...
+                        * we don't want to cause a normal task to block on
+                        * one of these locks while we're throttling a task marked
+                        * for low priority I/O... we'll mark the uthread and
+                        * do the delay just before we return from the system
+                        * call that triggered this I/O or from vnode_pagein
+                        */
+                       if (ut->uu_lowpri_window == 0) {
+                               ut->uu_devbsdunit = devbsdunit;
+                               oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+                               if (oldValue < 0) {
+                                       panic("%s: numthreads negative", __func__);
+                               }
+                               ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
+                               ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
+                       } else {
+                               if (ut->uu_devbsdunit != devbsdunit) { // the thread sends I/Os to different devices within the same system call
+                                       // keep track of the numthreads in the right device
+                                       OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+                                       OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+                                       ut->uu_devbsdunit = devbsdunit;
+                               }
+                               int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
+                               ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
+                               if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
+                                       ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
                         }
                 }
         }
+
+       if ((bflags & B_READ) == 0) {
+               size_t devbsdunit;
+
+               if (buf_vnode(bp)->v_mount != NULL)
+                       devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+               else
+                       devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+               
+               microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+       }
+
          (*bdevsw[major(bdev)].d_strategy)(bp);
  
          return (0);
@@ -726,18 +843,15 @@ spec_blockmap(__unused struct vnop_blockmap_args *ap)
   * Device close routine
   */
  int
-spec_close(ap)
-       struct vnop_close_args /* {
-               struct vnode *a_vp;
-               int  a_fflag;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_close(struct vnop_close_args *ap)
  {
-       register struct vnode *vp = ap->a_vp;
+       struct vnode *vp = ap->a_vp;
         dev_t dev = vp->v_rdev;
         int (*devclose)(dev_t, int, int, struct proc *);
         int mode, error;
+       int flags = ap->a_fflag;
         struct proc *p = vfs_context_proc(ap->a_context);
+       struct session *sessp;
  
         switch (vp->v_type) {
  
@@ -751,18 +865,30 @@ spec_close(ap)
                  * if the reference count is 2 (this last descriptor
                  * plus the session), release the reference from the session.
                  */
-               if (vcount(vp) == 2 && p &&
-                   vp == p->p_session->s_ttyvp) {
-                       p->p_session->s_ttyvp = NULL;
-                       vnode_rele(vp);
+               sessp = proc_session(p);
+               if (sessp != SESSION_NULL) {
+                       if ((vcount(vp) == 2) && 
+                               (vp == sessp->s_ttyvp)) {
+                               session_lock(sessp);
+                               sessp->s_ttyvp = NULL;
+                               sessp->s_ttyvid = 0;
+                               sessp->s_ttyp = NULL;
+                               sessp->s_ttypgrpid = NO_PID;
+                               session_unlock(sessp);
+                               vnode_rele(vp);
+                       }
+                       session_rele(sessp);
                 }
+
+               devclose = cdevsw[major(dev)].d_close;
+               mode = S_IFCHR;
                 /*
-                * close on last reference.
+                * close on last reference or on vnode revoke call
                  */
+               if ((flags & IO_REVOKE) != 0)
+                       break;
                 if (vcount(vp) > 1)
                         return (0);
-               devclose = cdevsw[major(dev)].d_close;
-               mode = S_IFCHR;
                 break;
  
         case VBLK:
@@ -785,7 +911,7 @@ spec_close(ap)
                  * sum of the reference counts on all the aliased
                  * vnodes descends to one, we are on last close.
                  */
-               if (vcount(vp) > 1)
+               if (vcount(vp) > 0)
                         return (0);
  #else /* DEVFS_IMPLEMENTS_LOCKING */
                 /*
@@ -795,7 +921,7 @@ spec_close(ap)
                  * sum of the reference counts on all the aliased
                  * vnodes descends to one, we are on last close.
                  */
-               if (vcount(vp) > 1)
+               if (vcount(vp) > 0)
                         return (0);
  
                 /*
@@ -816,22 +942,17 @@ spec_close(ap)
  
         default:
                 panic("spec_close: not special");
+               return(EBADF);
         }
  
-       return ((*devclose)(dev, ap->a_fflag, mode, p));
+       return ((*devclose)(dev, flags, mode, p));
  }
  
  /*
   * Return POSIX pathconf information applicable to special devices.
   */
  int
-spec_pathconf(ap)
-       struct vnop_pathconf_args /* {
-               struct vnode *a_vp;
-               int a_name;
-               int *a_retval;
-               vfs_context_t a_context;
-       } */ *ap;
+spec_pathconf(struct vnop_pathconf_args *ap)
  {
  
         switch (ap->a_name) {
@@ -848,7 +969,7 @@ spec_pathconf(ap)
                 *ap->a_retval = PIPE_BUF;
                 return (0);
         case _PC_CHOWN_RESTRICTED:
-               *ap->a_retval = 1;
+               *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
                 return (0);
         case _PC_VDISABLE:
                 *ap->a_retval = _POSIX_VDISABLE;
@@ -869,27 +990,11 @@ spec_ebadf(__unused void *dummy)
         return (EBADF);
  }
  
-/*
- * Special device bad operation
- */
-int
-spec_badop()
-{
-
-       panic("spec_badop called");
-       /* NOTREACHED */
-}
-
  /* Blktooff derives file offset from logical block number */
  int
-spec_blktooff(ap)
-       struct vnop_blktooff_args /* {
-               struct vnode *a_vp;
-               daddr64_t a_lblkno;
-               off_t *a_offset;    
-       } */ *ap;
+spec_blktooff(struct vnop_blktooff_args *ap)
  {
-       register struct vnode *vp = ap->a_vp;
+       struct vnode *vp = ap->a_vp;
  
         switch (vp->v_type) {
         case VCHR:
@@ -911,14 +1016,9 @@ spec_blktooff(ap)
  
  /* Offtoblk derives logical block number from file offset */
  int
-spec_offtoblk(ap)
-       struct vnop_offtoblk_args /* {
-               struct vnode *a_vp;
-               off_t a_offset;    
-               daddr64_t *a_lblkno;
-       } */ *ap;
+spec_offtoblk(struct vnop_offtoblk_args *ap)
  {
-       register struct vnode *vp = ap->a_vp;
+       struct vnode *vp = ap->a_vp;
  
         switch (vp->v_type) {
         case VCHR: