*/
extern int hard_throttle_on_root;
void IOSleep(int);
-extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount);
// a low priority thread may wait at most LOWPRI_MAX_WAITING_MSECS milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS 100
#define LOWPRI_MAX_WAITING_MSECS 200
#define LOWPRI_SLEEP_INTERVAL 5
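+/*
+ * Per-device throttle state, indexed by the device's BSD unit number
+ * (mnt_devbsdunit).  last_normal_IO_timestamp records the most recent
+ * normal priority I/O, last_IO_timestamp the most recent I/O of any
+ * kind, and numthreads_throttling the number of threads that currently
+ * have an open throttle window against the device.
+ */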
+struct _throttle_io_info_t {
+ struct timeval last_normal_IO_timestamp;
+ struct timeval last_IO_timestamp;
+ SInt32 numthreads_throttling;
+};
+
+struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
-void throttle_lowpri_io(int *lowpri_window,mount_t v_mount)
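+/*
+ * Report the time of the last I/O issued against the mount's device;
+ * a mount whose mnt_devbsdunit falls outside the table reports a
+ * zeroed timeval.
+ */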
+void
+throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
+{
+ size_t devbsdunit;
+
+ devbsdunit = mp->mnt_devbsdunit;
+
+ if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+ *tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
+ } else {
+ memset(tv, 0, sizeof(*tv));
+ }
+}
+
+void
+update_last_io_time(mount_t mp)
+{
+ size_t devbsdunit;
+
+ devbsdunit = mp->mnt_devbsdunit;
+
+ if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+ microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+ }
+}
+
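+/*
+ * Return non-zero if a low priority I/O against 'devbsdunit' should
+ * (still) be delayed, i.e. fewer than 'lowpri_window_msecs' have
+ * elapsed since the last normal priority I/O on that device.  A
+ * window of -1 selects lowpri_max_waiting_msecs.
+ */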
+int
+throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
{
- int i;
- struct timeval last_lowpri_IO_timestamp,last_normal_IO_timestamp;
struct timeval elapsed;
- int lowpri_IO_window_msecs;
- struct timeval lowpri_IO_window;
- int max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL;
+ int elapsed_msecs;
- KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
- *lowpri_window, 0, 0, 0, 0);
+ microuptime(&elapsed);
+ timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+ elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
- last_normal_IO_timestamp = v_mount->last_normal_IO_timestamp;
-
- for (i=0; i<max_try_num; i++) {
- microuptime(&last_lowpri_IO_timestamp);
+ if (lowpri_window_msecs == -1) // use the max waiting time
+ lowpri_window_msecs = lowpri_max_waiting_msecs;
- elapsed = last_lowpri_IO_timestamp;
- timevalsub(&elapsed, &last_normal_IO_timestamp);
+ return elapsed_msecs < lowpri_window_msecs;
+}
- lowpri_IO_window_msecs = *lowpri_window;
- lowpri_IO_window.tv_sec = lowpri_IO_window_msecs / 1000;
- lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000;
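+/*
+ * Called on the way out of the system call (or vnode_pagein) that
+ * triggered a throttleable I/O: sleep in LOWPRI_SLEEP_INTERVAL steps
+ * while the thread's throttle window is still open, bounded by
+ * lowpri_max_waiting_msecs scaled by the number of throttling threads,
+ * then drop this thread from the device's throttling count.
+ */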
+void
+throttle_lowpri_io(boolean_t ok_to_sleep)
+{
+ int i = 0;
+ int max_try_num;
+ SInt32 oldValue;
+ struct uthread *ut;
- if (timevalcmp(&elapsed, &lowpri_IO_window, <)) {
- IOSleep(LOWPRI_SLEEP_INTERVAL);
- } else {
- break;
+ ut = get_bsdthread_info(current_thread());
+
+ if (ut->uu_lowpri_window == 0)
+ return;
+
+ max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
+ ut->uu_lowpri_window, 0, 0, 0, 0);
+
+ if (ok_to_sleep == TRUE) {
+ for (i=0; i<max_try_num; i++) {
+ if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) {
+ IOSleep(LOWPRI_SLEEP_INTERVAL);
+ } else {
+ break;
+ }
}
}
-
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
- *lowpri_window, i*5, 0, 0, 0);
- *lowpri_window = 0;
+ ut->uu_lowpri_window, i*LOWPRI_SLEEP_INTERVAL, 0, 0, 0);
+
+ oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+ ut->uu_lowpri_window = 0;
+
+ if (oldValue <= 0) {
+ panic("%s: numthreads negative", __func__);
+ }
+}
+
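+/*
+ * Return the effective disk I/O policy for the current thread and
+ * hand back its uthread via 'ut'; a thread policy other than
+ * IOPOL_DEFAULT overrides the process policy.
+ */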
+int
+throttle_get_io_policy(struct uthread **ut)
+{
+ int policy = IOPOL_DEFAULT;
+ proc_t p = current_proc();
+
+ *ut = get_bsdthread_info(current_thread());
+
+ if (p != NULL)
+ policy = p->p_iopol_disk;
+
+ if (*ut != NULL) {
+ // the I/O policy of the thread overrides that of the process
+ // unless the I/O policy of the thread is default
+ if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
+ policy = (*ut)->uu_iopol_disk;
+ }
+ return policy;
}
int
spec_strategy(struct vnop_strategy_args *ap)
{
hard_throttle_on_root = 1;
if (lowpri_IO_initial_window_msecs) {
- proc_t p;
struct uthread *ut;
- int policy = IOPOL_DEFAULT;
+ int policy;
int is_throttleable_io = 0;
int is_passive_io = 0;
- p = current_proc();
- ut = get_bsdthread_info(current_thread());
-
- if (p != NULL)
- policy = p->p_iopol_disk;
-
- if (ut != NULL) {
- // the I/O policy of the thread overrides that of the process
- // unless the I/O policy of the thread is default
- if (ut->uu_iopol_disk != IOPOL_DEFAULT)
- policy = ut->uu_iopol_disk;
- }
+ size_t devbsdunit;
+ SInt32 oldValue;
+
+ policy = throttle_get_io_policy(&ut);
switch (policy) {
case IOPOL_DEFAULT:
case IOPOL_NORMAL:
break;
case IOPOL_THROTTLE:
is_throttleable_io = 1;
break;
case IOPOL_PASSIVE:
is_passive_io = 1;
break;
}
if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
is_passive_io |= 1;
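+ /*
+ * Attribute the I/O to the mount's device; an I/O against a vnode
+ * with no mount is accounted to the table's last slot.
+ */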
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
if (!is_throttleable_io) {
- if (!is_passive_io && buf_vnode(bp)->v_mount != NULL){
- microuptime(&(buf_vnode(bp)->v_mount->last_normal_IO_timestamp));
+ if (!is_passive_io) {
+ microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
}
} else {
/*
* do the delay just before we return from the system
* call that triggered this I/O or from vnode_pagein
*/
- if(buf_vnode(bp)->v_mount != NULL)
- ut->v_mount = buf_vnode(bp)->v_mount;
if (ut->uu_lowpri_window == 0) {
+ ut->uu_devbsdunit = devbsdunit;
+ oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ if (oldValue < 0) {
+ panic("%s: numthreads negative", __func__);
+ }
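+ /*
+ * First throttled I/O in this window: scale the initial window by
+ * the number of threads already throttling against this device
+ * (oldValue is the pre-increment count), so each additional low
+ * priority thread backs off longer.
+ */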
ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
} else {
- ut->uu_lowpri_window += lowpri_IO_window_msecs_inc;
- if (ut->uu_lowpri_window > lowpri_max_window_msecs)
- ut->uu_lowpri_window = lowpri_max_window_msecs;
+ if (ut->uu_devbsdunit != devbsdunit) { // the thread is issuing I/Os to a different device within the same system call
+ // keep track of the numthreads in the right device
+ OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+ OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_devbsdunit = devbsdunit;
+ }
+ int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
+ if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
+ ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
}
}
}
+
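+ /*
+ * For writes, stamp the device's last_IO_timestamp so that
+ * throttle_info_get_last_io_time() reflects recent write activity.
+ */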
+ if ((bflags & B_READ) == 0) {
+ size_t devbsdunit;
+
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+
+ microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+ }
+
(*bdevsw[major(bdev)].d_strategy)(bp);
return (0);
* sum of the reference counts on all the aliased
* vnodes descends to zero, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
#else /* DEVFS_IMPLEMENTS_LOCKING */
/*
* sum of the reference counts on all the aliased
* vnodes descends to zero, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
/*