/*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
+#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <sys/kdebug.h>
+/* XXX following three prototypes should be in a header file somewhere */
+extern int isdisk(dev_t dev, int type);
+extern dev_t chrtoblk(dev_t dev);
+extern int iskmemdev(dev_t dev);
+
struct vnode *speclisth[SPECHSZ];
/* symbolic sleep message strings for devices */
* Trivial lookup routine that always fails.
*/
int
-spec_lookup(ap)
- struct vnop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- vfs_context_t a_context;
- } */ *ap;
+spec_lookup(struct vnop_lookup_args *ap)
{
*ap->a_vpp = NULL;
* Open a special file.
*/
int
-spec_open(ap)
- struct vnop_open_args /* {
- struct vnode *a_vp;
- int a_mode;
- vfs_context_t a_context;
- } */ *ap;
+spec_open(struct vnop_open_args *ap)
{
struct proc *p = vfs_context_proc(ap->a_context);
kauth_cred_t cred = vfs_context_ucred(ap->a_context);
* Vnode op for read
*/
int
-spec_read(ap)
- struct vnop_read_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- vfs_context_t a_context;
- } */ *ap;
+spec_read(struct vnop_read_args *ap)
{
- register struct vnode *vp = ap->a_vp;
- register struct uio *uio = ap->a_uio;
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
struct buf *bp;
daddr64_t bn, nextbn;
long bsize, bscale;
// LP64todo - fix this!
n = min((unsigned)(n - on), uio_resid(uio));
- error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
+ error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
if (n + on == bsize)
buf_markaged(bp);
buf_brelse(bp);
* Vnode op for write
*/
int
-spec_write(ap)
- struct vnop_write_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- vfs_context_t a_context;
- } */ *ap;
+spec_write(struct vnop_write_args *ap)
{
- register struct vnode *vp = ap->a_vp;
- register struct uio *uio = ap->a_uio;
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
struct buf *bp;
daddr64_t bn;
int bsize, blkmask, bscale;
- register int io_sync;
- register int io_size;
+ int io_sync;
+ int io_size;
int devBlockSize=0;
- register int n, on;
+ int n, on;
int error = 0;
dev_t dev;
}
n = min(n, bsize - buf_resid(bp));
- error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
+ error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
if (error) {
buf_brelse(bp);
return (error);
* Device ioctl operation.
*/
int
-spec_ioctl(ap)
- struct vnop_ioctl_args /* {
- struct vnode *a_vp;
- int a_command;
- caddr_t a_data;
- int a_fflag;
- vfs_context_t a_context;
- } */ *ap;
+spec_ioctl(struct vnop_ioctl_args *ap)
{
proc_t p = vfs_context_proc(ap->a_context);
dev_t dev = ap->a_vp->v_rdev;
ap->a_fflag, p));
case VBLK:
- if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) {
+ if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) {
if (bdevsw[major(dev)].d_type == D_TAPE)
return (0);
else
}
int
-spec_select(ap)
- struct vnop_select_args /* {
- struct vnode *a_vp;
- int a_which;
- int a_fflags;
- void * a_wql;
- vfs_context_t a_context;
- } */ *ap;
+spec_select(struct vnop_select_args *ap)
{
proc_t p = vfs_context_proc(ap->a_context);
- register dev_t dev;
+ dev_t dev;
switch (ap->a_vp->v_type) {
/*
* Flush all dirty buffers associated with a block device.
*/
- buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, (char *)"spec_fsync");
+ buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync");
return (0);
}
+/*
+ * Vnode op for fsync: passes the vnode, wait mode and context from the
+ * argument block straight to spec_fsync_internal() and returns its result.
+ */
int
-spec_fsync(ap)
- struct vnop_fsync_args /* {
- struct vnode *a_vp;
- int a_waitfor;
- vfs_context_t a_context;
- } */ *ap;
+spec_fsync(struct vnop_fsync_args *ap)
{
return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
* Just call the device strategy routine
*/
extern int hard_throttle_on_root;
+void IOSleep(int);
+
+// the low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds (scaled by the number of throttling threads on the device)
+#define LOWPRI_INITIAL_WINDOW_MSECS 100
+#define LOWPRI_WINDOW_MSECS_INC 50
+#define LOWPRI_MAX_WINDOW_MSECS 200
+#define LOWPRI_MAX_WAITING_MSECS 200
+#define LOWPRI_SLEEP_INTERVAL 5
+
+/*
+ * Per-device bookkeeping for low-priority I/O throttling; one entry per
+ * slot of _throttle_io_info[], indexed by a BSD unit number (for example a
+ * mount's mnt_devbsdunit).
+ */
+struct _throttle_io_info_t {
+ /* stamped by spec_strategy() for I/O that is neither throttleable nor passive */
+ struct timeval last_normal_IO_timestamp;
+ /* stamped by update_last_io_time() and by spec_strategy() for non-read I/O */
+ struct timeval last_IO_timestamp;
+ /* atomically adjusted count of threads currently marked for throttling */
+ SInt32 numthreads_throttling;
+};
+struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
+int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
+int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
+int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
+int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
-#define LOWPRI_DELAY_MSECS 200
-#define LOWPRI_WINDOW_MSECS 200
+/*
+ * debug.* tunables for low-priority I/O throttling. Each OID is backed by the
+ * variable passed by address; the value argument names that variable's own
+ * compile-time default so the registration reads consistently.
+ */
+SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_WINDOW_MSECS_INC, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_MAX_WINDOW_MSECS, "");
+SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_MAX_WAITING_MSECS, "");
-int lowpri_IO_window_msecs = LOWPRI_WINDOW_MSECS;
-int lowpri_IO_delay_msecs = LOWPRI_DELAY_MSECS;
+/*
+ * Copy out the last_IO_timestamp recorded for the device backing mount `mp`.
+ * If the mount's mnt_devbsdunit falls outside _throttle_io_info[], *tv is
+ * zero-filled instead.
+ */
+void
+throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
+{
+ size_t devbsdunit;
+
+ devbsdunit = mp->mnt_devbsdunit;
-struct timeval last_normal_IO_timestamp;
-struct timeval last_lowpri_IO_timestamp;
-struct timeval lowpri_IO_window = { 0, LOWPRI_WINDOW_MSECS * 1000 };
+ if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
+ *tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
+ } else {
+ memset(tv, 0, sizeof(*tv));
+ }
+}
+
+/*
+ * Stamp the current uptime as the last I/O time of the device backing `mp`.
+ * Mounts whose BSD unit number falls outside _throttle_io_info[] are ignored.
+ */
+void
+update_last_io_time(mount_t mp)
+{
+	const size_t unit = mp->mnt_devbsdunit;
+
+	if (unit >= LOWPRI_MAX_NUM_DEV)
+		return;
+
+	microuptime(&_throttle_io_info[unit].last_IO_timestamp);
+}
+
+/*
+ * Decide whether low-priority I/O to device slot `devbsdunit` should still be
+ * held back.
+ *
+ * lowpri_window_msecs: throttle window in milliseconds; -1 selects
+ *                      lowpri_max_waiting_msecs.
+ * devbsdunit:          index into _throttle_io_info[].
+ *
+ * Returns non-zero when less than the window has elapsed since the slot's
+ * last_normal_IO_timestamp, and 0 otherwise (including for an out-of-range
+ * devbsdunit, which has no throttle state).
+ */
+int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
+{
+	struct timeval elapsed;
+	int64_t elapsed_msecs;
+
+	/* same bounds rule as the other _throttle_io_info[] accessors */
+	if (devbsdunit >= LOWPRI_MAX_NUM_DEV)
+		return 0;
+
+	microuptime(&elapsed);
+	timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+	/* widen before scaling: tv_sec * 1000 can overflow a 32-bit integer */
+	elapsed_msecs = (int64_t)elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
+
+	if (lowpri_window_msecs == -1) // use the max waiting time
+		lowpri_window_msecs = lowpri_max_waiting_msecs;
+
+	return elapsed_msecs < lowpri_window_msecs;
+}
+
+/*
+ * Delay the calling thread if it was marked for low-priority I/O throttling.
+ *
+ * Does nothing when the current uthread's uu_lowpri_window is 0. Otherwise,
+ * when ok_to_sleep is TRUE, sleeps in LOWPRI_SLEEP_INTERVAL steps for as long
+ * as throttle_io_will_be_throttled() reports the window is still open, up to
+ * max_try_num iterations. In either case the thread's uu_lowpri_window is
+ * then reset to 0 and the device's numthreads_throttling is decremented.
+ */
+void throttle_lowpri_io(boolean_t ok_to_sleep)
+{
+	int i = 0;	/* stays 0 when we are not allowed to sleep */
+	int max_try_num;
+	struct uthread *ut;
+	SInt32 oldValue;
+
+	ut = get_bsdthread_info(current_thread());
+
+	if (ut->uu_lowpri_window == 0)
+		return;
+
+	/* the retry budget scales with the number of threads throttling on this device */
+	max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
+		     ut->uu_lowpri_window, 0, 0, 0, 0);
+
+	if (ok_to_sleep == TRUE) {
+		for (i = 0; i < max_try_num; i++) {
+			if (!throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit))
+				break;
+			IOSleep(LOWPRI_SLEEP_INTERVAL);
+		}
+	}
+	/* second trace argument: number of sleeps times the sleep interval */
+	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
+		     ut->uu_lowpri_window, i * LOWPRI_SLEEP_INTERVAL, 0, 0, 0);
+
+	oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+	ut->uu_lowpri_window = 0;
+
+	/* oldValue is the count before the decrement; <= 0 means it was already drained */
+	if (oldValue <= 0) {
+		panic("%s: numthreads negative", __func__);
+	}
+}
+
+/*
+ * Work out the disk I/O policy that applies to the calling thread.
+ *
+ * The current uthread is handed back through *ut. The result starts as
+ * IOPOL_DEFAULT, is replaced by the process's p_iopol_disk when there is a
+ * current process, and is finally replaced by the thread's uu_iopol_disk
+ * when that is anything other than IOPOL_DEFAULT.
+ */
+int throttle_get_io_policy(struct uthread **ut)
+{
+	int result = IOPOL_DEFAULT;
+	proc_t proc = current_proc();
+	struct uthread *uth = get_bsdthread_info(current_thread());
+
+	*ut = uth;
+
+	if (proc != NULL)
+		result = proc->p_iopol_disk;
+
+	/* a non-default per-thread policy takes precedence over the process's */
+	if (uth != NULL && uth->uu_iopol_disk != IOPOL_DEFAULT)
+		result = uth->uu_iopol_disk;
+
+	return result;
+}
int
-spec_strategy(ap)
- struct vnop_strategy_args /* {
- struct buf *a_bp;
- } */ *ap;
+spec_strategy(struct vnop_strategy_args *ap)
{
buf_t bp;
int bflags;
dev_t bdev;
- proc_t p;
- struct timeval elapsed;
bp = ap->a_bp;
bdev = buf_device(bp);
(buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV))
hard_throttle_on_root = 1;
- if ( lowpri_IO_delay_msecs && lowpri_IO_window_msecs ) {
- p = current_proc();
+ if (lowpri_IO_initial_window_msecs) {
+ struct uthread *ut;
+ int policy;
+ int is_throttleable_io = 0;
+ int is_passive_io = 0;
+ size_t devbsdunit;
+ SInt32 oldValue;
+
+ policy = throttle_get_io_policy(&ut);
+
+ switch (policy) {
+ case IOPOL_DEFAULT:
+ case IOPOL_NORMAL:
+ break;
+ case IOPOL_THROTTLE:
+ is_throttleable_io = 1;
+ break;
+ case IOPOL_PASSIVE:
+ is_passive_io = 1;
+ break;
+ default:
+ printf("unknown I/O policy %d", policy);
+ break;
+ }
+
+ if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
+ is_passive_io |= 1;
- if ( (p == NULL) || !(p->p_lflag & P_LLOW_PRI_IO)) {
- if (!(p->p_lflag & P_LBACKGROUND_IO))
- microuptime(&last_normal_IO_timestamp);
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+ if (!is_throttleable_io) {
+ if (!is_passive_io){
+ microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
+ }
} else {
- microuptime(&last_lowpri_IO_timestamp);
-
- elapsed = last_lowpri_IO_timestamp;
- timevalsub(&elapsed, &last_normal_IO_timestamp);
-
- lowpri_IO_window.tv_sec = lowpri_IO_window_msecs / 1000;
- lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000;
-
- if (timevalcmp(&elapsed, &lowpri_IO_window, <)) {
- struct uthread *ut;
-
- /*
- * I'd really like to do the IOSleep here, but
- * we may be holding all kinds of filesystem related locks
- * and the pages for this I/O marked 'busy'...
- * we don't want to cause a normal task to block on
- * one of these locks while we're throttling a task marked
- * for low priority I/O... we'll mark the uthread and
- * do the delay just before we return from the system
- * call that triggered this I/O or from vnode_pagein
- */
- ut = get_bsdthread_info(current_thread());
- ut->uu_lowpri_delay = lowpri_IO_delay_msecs;
+ /*
+ * I'd really like to do the IOSleep here, but
+ * we may be holding all kinds of filesystem related locks
+ * and the pages for this I/O marked 'busy'...
+ * we don't want to cause a normal task to block on
+ * one of these locks while we're throttling a task marked
+ * for low priority I/O... we'll mark the uthread and
+ * do the delay just before we return from the system
+ * call that triggered this I/O or from vnode_pagein
+ */
+ if (ut->uu_lowpri_window == 0) {
+ ut->uu_devbsdunit = devbsdunit;
+ oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ if (oldValue < 0) {
+ panic("%s: numthreads negative", __func__);
+ }
+ ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
+ } else {
+ if (ut->uu_devbsdunit != devbsdunit) { // the thread sends I/Os to different devices within the same system call
+ // keep track of the numthreads in the right device
+ OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
+ OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_devbsdunit = devbsdunit;
+ }
+ int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
+ ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
+ if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
+ ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
}
}
}
+
+ if ((bflags & B_READ) == 0) {
+ size_t devbsdunit;
+
+ if (buf_vnode(bp)->v_mount != NULL)
+ devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
+ else
+ devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
+
+ microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
+ }
+
(*bdevsw[major(bdev)].d_strategy)(bp);
return (0);
* Device close routine
*/
int
-spec_close(ap)
- struct vnop_close_args /* {
- struct vnode *a_vp;
- int a_fflag;
- vfs_context_t a_context;
- } */ *ap;
+spec_close(struct vnop_close_args *ap)
{
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
dev_t dev = vp->v_rdev;
int (*devclose)(dev_t, int, int, struct proc *);
int mode, error;
+ int flags = ap->a_fflag;
struct proc *p = vfs_context_proc(ap->a_context);
+ struct session *sessp;
switch (vp->v_type) {
* if the reference count is 2 (this last descriptor
* plus the session), release the reference from the session.
*/
- if (vcount(vp) == 2 && p &&
- vp == p->p_session->s_ttyvp) {
- p->p_session->s_ttyvp = NULL;
- vnode_rele(vp);
+ sessp = proc_session(p);
+ if (sessp != SESSION_NULL) {
+ if ((vcount(vp) == 2) &&
+ (vp == sessp->s_ttyvp)) {
+ session_lock(sessp);
+ sessp->s_ttyvp = NULL;
+ sessp->s_ttyvid = 0;
+ sessp->s_ttyp = NULL;
+ sessp->s_ttypgrpid = NO_PID;
+ session_unlock(sessp);
+ vnode_rele(vp);
+ }
+ session_rele(sessp);
}
+
+ devclose = cdevsw[major(dev)].d_close;
+ mode = S_IFCHR;
/*
- * close on last reference.
+ * close on last reference or on vnode revoke call
*/
+ if ((flags & IO_REVOKE) != 0)
+ break;
if (vcount(vp) > 1)
return (0);
- devclose = cdevsw[major(dev)].d_close;
- mode = S_IFCHR;
break;
case VBLK:
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
#else /* DEVFS_IMPLEMENTS_LOCKING */
/*
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
- if (vcount(vp) > 1)
+ if (vcount(vp) > 0)
return (0);
/*
default:
panic("spec_close: not special");
+ return(EBADF);
}
- return ((*devclose)(dev, ap->a_fflag, mode, p));
+ return ((*devclose)(dev, flags, mode, p));
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
-spec_pathconf(ap)
- struct vnop_pathconf_args /* {
- struct vnode *a_vp;
- int a_name;
- int *a_retval;
- vfs_context_t a_context;
- } */ *ap;
+spec_pathconf(struct vnop_pathconf_args *ap)
{
switch (ap->a_name) {
*ap->a_retval = PIPE_BUF;
return (0);
case _PC_CHOWN_RESTRICTED:
- *ap->a_retval = 1;
+ *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
return (0);
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return (EBADF);
}
-/*
- * Special device bad operation
- */
-int
-spec_badop()
-{
-
- panic("spec_badop called");
- /* NOTREACHED */
-}
-
/* Blktooff derives file offset from logical block number */
int
-spec_blktooff(ap)
- struct vnop_blktooff_args /* {
- struct vnode *a_vp;
- daddr64_t a_lblkno;
- off_t *a_offset;
- } */ *ap;
+spec_blktooff(struct vnop_blktooff_args *ap)
{
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
switch (vp->v_type) {
case VCHR:
/* Offtoblk derives logical block number from file offset */
int
-spec_offtoblk(ap)
- struct vnop_offtoblk_args /* {
- struct vnode *a_vp;
- off_t a_offset;
- daddr64_t *a_lblkno;
- } */ *ap;
+spec_offtoblk(struct vnop_offtoblk_args *ap)
{
- register struct vnode *vp = ap->a_vp;
+ struct vnode *vp = ap->a_vp;
switch (vp->v_type) {
case VCHR: