*/
extern int hard_throttle_on_root;
void IOSleep(int);
-extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount);
// a low priority thread may wait at most LOWPRI_MAX_WAITING_MSECS milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS 100
#define LOWPRI_MAX_WAITING_MSECS 200
#define LOWPRI_SLEEP_INTERVAL 5
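+/*
+ * Per-device throttle state, indexed by the device's BSD unit number
+ * (mnt_devbsdunit).  last_normal_IO_timestamp records the most recent
+ * normal priority I/O, last_IO_timestamp the most recent I/O of any
+ * kind, and numthreads_throttling the number of threads that currently
+ * have an open throttle window against the device.
+ */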
+struct _throttle_io_info_t {
+ struct timeval last_normal_IO_timestamp;
+ struct timeval last_IO_timestamp;
+ SInt32 numthreads_throttling;
+};
+
+struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
-void throttle_lowpri_io(int *lowpri_window,mount_t v_mount)
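+/*
+ * Report the time of the last I/O issued against the mount's device;
+ * a mount whose mnt_devbsdunit falls outside the table reports a
+ * zeroed timeval.
+ */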
+void
+throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
+{
+ size_t devbsdunit;
+
+ devbsdunit = mp->mnt_devbsdunit;
+
+ if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+ *tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
+ } else {
+ memset(tv, 0, sizeof(*tv));
+ }
+}
+
+void
+update_last_io_time(mount_t mp)
+{
+ size_t devbsdunit;
+
+ devbsdunit = mp->mnt_devbsdunit;
+
+ if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+ microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+ }
+}
+
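+/*
+ * Return non-zero if a low priority I/O against 'devbsdunit' should
+ * (still) be delayed, i.e. fewer than 'lowpri_window_msecs' have
+ * elapsed since the last normal priority I/O on that device.  A
+ * window of -1 selects lowpri_max_waiting_msecs.
+ */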
+int
+throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
{
- int i;
- struct timeval last_lowpri_IO_timestamp,last_normal_IO_timestamp;
struct timeval elapsed;
- int lowpri_IO_window_msecs;
- struct timeval lowpri_IO_window;
- int max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL;
+ int elapsed_msecs;
- KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
- *lowpri_window, 0, 0, 0, 0);
+ microuptime(&elapsed);
+ timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+ elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
- last_normal_IO_timestamp = v_mount->last_normal_IO_timestamp;
-
- for (i=0; i<max_try_num; i++) {
- microuptime(&last_lowpri_IO_timestamp);
+ if (lowpri_window_msecs == -1) // use the max waiting time
+ lowpri_window_msecs = lowpri_max_waiting_msecs;
- elapsed = last_lowpri_IO_timestamp;
- timevalsub(&elapsed, &last_normal_IO_timestamp);
+ return elapsed_msecs < lowpri_window_msecs;
+}
- lowpri_IO_window_msecs = *lowpri_window;
- lowpri_IO_window.tv_sec = lowpri_IO_window_msecs / 1000;
- lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000;
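+/*
+ * Called on the way out of the system call (or vnode_pagein) that
+ * triggered a throttleable I/O: sleep in LOWPRI_SLEEP_INTERVAL steps
+ * while the thread's throttle window is still open, bounded by
+ * lowpri_max_waiting_msecs scaled by the number of throttling threads,
+ * then drop this thread from the device's throttling count.
+ */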
+void
+throttle_lowpri_io(boolean_t ok_to_sleep)
+{
+ int i = 0;
+ int max_try_num;
+ SInt32 oldValue;
+ struct uthread *ut;
- if (timevalcmp(&elapsed, &lowpri_IO_window, <)) {
- IOSleep(LOWPRI_SLEEP_INTERVAL);
- } else {
- break;
+ ut = get_bsdthread_info(current_thread());
+
+ if (ut->uu_lowpri_window == 0)
+ return;
+
+ max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
+ ut->uu_lowpri_window, 0, 0, 0, 0);
+
+ if (ok_to_sleep == TRUE) {
+ for (i=0; i<max_try_num; i++) {
+ if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) {
+ IOSleep(LOWPRI_SLEEP_INTERVAL);
+ } else {
+ break;
+ }
}
}
-
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
- *lowpri_window, i*5, 0, 0, 0);
- *lowpri_window = 0;
+ ut->uu_lowpri_window, i*LOWPRI_SLEEP_INTERVAL, 0, 0, 0);
+
+ oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+ ut->uu_lowpri_window = 0;
+
+ if (oldValue <= 0) {
+ panic("%s: numthreads negative", __func__);
+ }
+}
+
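+/*
+ * Return the effective disk I/O policy for the current thread and
+ * hand back its uthread via 'ut'; a thread policy other than
+ * IOPOL_DEFAULT overrides the process policy.
+ */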
+int
+throttle_get_io_policy(struct uthread **ut)
+{
+ int policy = IOPOL_DEFAULT;
+ proc_t p = current_proc();
+
+ *ut = get_bsdthread_info(current_thread());
+
+ if (p != NULL)
+ policy = p->p_iopol_disk;
+
+ if (*ut != NULL) {
+ // the I/O policy of the thread overrides that of the process
+ // unless the I/O policy of the thread is default
+ if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
+ policy = (*ut)->uu_iopol_disk;
+ }
+ return policy;
}
int
spec_strategy(struct vnop_strategy_args *ap)
{
hard_throttle_on_root = 1;
if (lowpri_IO_initial_window_msecs) {
- proc_t p;
struct uthread *ut;
- int policy = IOPOL_DEFAULT;
+ int policy;
int is_throttleable_io = 0;
int is_passive_io = 0;
- p = current_proc();
- ut = get_bsdthread_info(current_thread());
-
- if (p != NULL)
- policy = p->p_iopol_disk;
-
- if (ut != NULL) {
- // the I/O policy of the thread overrides that of the process
- // unless the I/O policy of the thread is default
- if (ut->uu_iopol_disk != IOPOL_DEFAULT)
- policy = ut->uu_iopol_disk;
- }
+ size_t devbsdunit;
+ SInt32 oldValue;
+
+ policy = throttle_get_io_policy(&ut);
switch (policy) {
case IOPOL_DEFAULT:
case IOPOL_NORMAL:
break;
case IOPOL_THROTTLE:
is_throttleable_io = 1;
break;
case IOPOL_PASSIVE:
is_passive_io = 1;
break;
}
if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
is_passive_io |= 1;
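+ /*
+ * Attribute the I/O to the mount's device; an I/O against a vnode
+ * with no mount is accounted to the table's last slot.
+ */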
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
if (!is_throttleable_io) {
- if (!is_passive_io && buf_vnode(bp)->v_mount != NULL){
- microuptime(&(buf_vnode(bp)->v_mount->last_normal_IO_timestamp));
+ if (!is_passive_io) {
+ microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
}
} else {
/*
* do the delay just before we return from the system
* call that triggered this I/O or from vnode_pagein
*/
- if(buf_vnode(bp)->v_mount != NULL)
- ut->v_mount = buf_vnode(bp)->v_mount;
if (ut->uu_lowpri_window == 0) {
+ ut->uu_devbsdunit = devbsdunit;
+ oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ if (oldValue < 0) {
+ panic("%s: numthreads negative", __func__);
+ }
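+ /*
+ * First throttled I/O in this window: scale the initial window by
+ * the number of threads already throttling against this device
+ * (oldValue is the pre-increment count), so each additional low
+ * priority thread backs off longer.
+ */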
ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
} else {
- ut->uu_lowpri_window += lowpri_IO_window_msecs_inc;
- if (ut->uu_lowpri_window > lowpri_max_window_msecs)
- ut->uu_lowpri_window = lowpri_max_window_msecs;
+ if (ut->uu_devbsdunit != devbsdunit) { // the thread is issuing I/Os to a different device within the same system call
+ // keep track of the numthreads in the right device
+ OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+ OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_devbsdunit = devbsdunit;
+ }
+ int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
+ if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
+ ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
}
}
}
+
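+ /*
+ * For writes, stamp the device's last_IO_timestamp so that
+ * throttle_info_get_last_io_time() reflects recent write activity.
+ */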
+ if ((bflags & B_READ) == 0) {
+ size_t devbsdunit;
+
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+
+ microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+ }
+
(*bdevsw[major(bdev)].d_strategy)(bp);
return (0);
* sum of the reference counts on all the aliased
* vnodes descends to zero, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
#else /* DEVFS_IMPLEMENTS_LOCKING */
/*
* sum of the reference counts on all the aliased
* vnodes descends to zero, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
/*