X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0b4e3aa066abc0728aacb4bbeb86f53f9737156e..e2fac8b15b12a7979f72090454d850e612fc5b13:/bsd/miscfs/specfs/spec_vnops.c diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 86dba1588..6c26b1799 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -56,23 +62,33 @@ */ #include -#include +#include +#include #include #include #include -#include -#include +#include +#include #include -#include +#include #include #include #include #include +#include #include -#include +#include +#include +#include #include #include +#include + +/* XXX following three prototypes should be in a header file somewhere */ +extern int isdisk(dev_t dev, int type); +extern dev_t chrtoblk(dev_t dev); +extern int iskmemdev(dev_t dev); struct vnode *speclisth[SPECHSZ]; @@ -89,81 +105,66 @@ char devcls[] = "devcls"; int (**spec_vnodeop_p)(void *); struct vnodeopv_entry_desc spec_vnodeop_entries[] = { - { &vop_default_desc, (VOPFUNC)vn_default_error }, - { &vop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ - { &vop_create_desc, (VOPFUNC)err_create }, /* create */ - { &vop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ - { &vop_open_desc, (VOPFUNC)spec_open }, /* open */ - { &vop_close_desc, (VOPFUNC)spec_close }, /* close */ - { &vop_access_desc, (VOPFUNC)spec_access }, /* access */ - { &vop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ - { &vop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ - { &vop_read_desc, (VOPFUNC)spec_read }, /* read */ - { 
&vop_write_desc, (VOPFUNC)spec_write }, /* write */ - { &vop_lease_desc, (VOPFUNC)nop_lease }, /* lease */ - { &vop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ - { &vop_select_desc, (VOPFUNC)spec_select }, /* select */ - { &vop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ - { &vop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ - { &vop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ - { &vop_seek_desc, (VOPFUNC)err_seek }, /* seek */ - { &vop_remove_desc, (VOPFUNC)err_remove }, /* remove */ - { &vop_link_desc, (VOPFUNC)err_link }, /* link */ - { &vop_rename_desc, (VOPFUNC)err_rename }, /* rename */ - { &vop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ - { &vop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ - { &vop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ - { &vop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ - { &vop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ - { &vop_abortop_desc, (VOPFUNC)err_abortop }, /* abortop */ - { &vop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ - { &vop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ - { &vop_lock_desc, (VOPFUNC)nop_lock }, /* lock */ - { &vop_unlock_desc, (VOPFUNC)nop_unlock }, /* unlock */ - { &vop_bmap_desc, (VOPFUNC)spec_bmap }, /* bmap */ - { &vop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ - { &vop_print_desc, (VOPFUNC)spec_print }, /* print */ - { &vop_islocked_desc, (VOPFUNC)nop_islocked }, /* islocked */ - { &vop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ - { &vop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ - { &vop_blkatoff_desc, (VOPFUNC)err_blkatoff }, /* blkatoff */ - { &vop_valloc_desc, (VOPFUNC)err_valloc }, /* valloc */ - { &vop_vfree_desc, (VOPFUNC)err_vfree }, /* vfree */ - { &vop_truncate_desc, (VOPFUNC)nop_truncate }, /* truncate */ - { &vop_update_desc, (VOPFUNC)nop_update }, /* update */ - { &vop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ - { &vop_devblocksize_desc, (VOPFUNC)spec_devblocksize }, /* devblocksize */ 
- { &vop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ - { &vop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ - { &vop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ - { &vop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ - { &vop_cmap_desc, (VOPFUNC)spec_cmap }, /* cmap */ + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)err_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */ + { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */ + { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)err_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, 
(VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */ { (struct vnodeop_desc*)NULL, (int(*)())NULL } }; struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; + +static void set_blocksize(vnode_t, dev_t); + + /* * Trivial lookup routine that always fails. */ int -spec_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; +spec_lookup(struct vnop_lookup_args *ap) { *ap->a_vpp = NULL; return (ENOTDIR); } -void +static void set_blocksize(struct vnode *vp, dev_t dev) { - int (*size)(); + int (*size)(dev_t); int rsize; if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) { @@ -185,10 +186,12 @@ set_fsblocksize(struct vnode *vp) dev_t dev = (dev_t)vp->v_rdev; int maj = major(dev); - if ((u_int)maj >= nblkdev) + if ((u_int)maj >= (u_int)nblkdev) return; + vnode_lock(vp); set_blocksize(vp, dev); + vnode_unlock(vp); } } @@ -197,17 +200,12 @@ set_fsblocksize(struct vnode *vp) /* * Open a special file. 
*/ -/* ARGSUSED */ -spec_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +int +spec_open(struct vnop_open_args *ap) { - struct proc *p = ap->a_p; - struct vnode *bvp, *vp = ap->a_vp; + struct proc *p = vfs_context_proc(ap->a_context); + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct vnode *vp = ap->a_vp; dev_t bdev, dev = (dev_t)vp->v_rdev; int maj = major(dev); int error; @@ -221,9 +219,9 @@ spec_open(ap) switch (vp->v_type) { case VCHR: - if ((u_int)maj >= nchrdev) + if ((u_int)maj >= (u_int)nchrdev) return (ENXIO); - if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { + if (cred != FSCRED && (ap->a_mode & FWRITE)) { /* * When running in very secure mode, do not allow * opens for writing of any disk character devices. @@ -237,43 +235,77 @@ spec_open(ap) * currently mounted. */ if (securelevel >= 1) { - if ((bdev = chrtoblk(dev)) != NODEV && - vfinddev(bdev, VBLK, &bvp) && - bvp->v_usecount > 0 && - (error = vfs_mountedon(bvp))) + if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) return (error); if (iskmemdev(dev)) return (EPERM); } } - if (cdevsw[maj].d_type == D_TTY) + if (cdevsw[maj].d_type == D_TTY) { + vnode_lock(vp); vp->v_flag |= VISTTY; - VOP_UNLOCK(vp, 0, p); + vnode_unlock(vp); + } error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: - if ((u_int)maj >= nblkdev) + if ((u_int)maj >= (u_int)nblkdev) return (ENXIO); /* * When running in very secure mode, do not allow * opens for writing of any disk block devices. */ - if (securelevel >= 2 && ap->a_cred != FSCRED && + if (securelevel >= 2 && cred != FSCRED && (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) return (EPERM); /* * Do not allow opens of block devices that are * currently mounted. 
*/ - if (error = vfs_mountedon(vp)) + if ( (error = vfs_mountedon(vp)) ) return (error); error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); if (!error) { + u_int64_t blkcnt; + u_int32_t blksize; + int setsize = 0; + u_int32_t size512 = 512; + + + if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) { + /* Switch to 512 byte sectors (temporarily) */ + + if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) { + /* Get the number of 512 byte physical blocks. */ + if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) { + setsize = 1; + } + } + /* If it doesn't set back, we can't recover */ + if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) + error = ENXIO; + } + + + vnode_lock(vp); set_blocksize(vp, dev); + + /* + * Cache the size in bytes of the block device for later + * use by spec_write(). + */ + if (setsize) + vp->v_specdevsize = blkcnt * (u_int64_t)size512; + else + vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */ + + vnode_unlock(vp); + } return(error); + default: + panic("spec_open type"); } return (0); } @@ -281,42 +313,33 @@ spec_open(ap) /* * Vnode op for read */ -/* ARGSUSED */ -spec_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +int +spec_read(struct vnop_read_args *ap) { - register struct vnode *vp = ap->a_vp; - register struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; struct buf *bp; - daddr_t bn, nextbn; + daddr64_t bn, nextbn; long bsize, bscale; int devBlockSize=0; - int n, on, majordev, (*ioctl)(); + int n, on; int error = 0; dev_t dev; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("spec_read mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) panic("spec_read proc"); #endif - if 
(uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp, 0, p); error = (*cdevsw[major(vp->v_rdev)].d_read) (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: @@ -336,87 +359,85 @@ spec_read(ap) do { on = uio->uio_offset % bsize; - bn = (uio->uio_offset / devBlockSize) &~ (bscale - 1); + bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1)); - if (vp->v_lastr + bscale == bn) { + if (vp->v_speclastr + bscale == bn) { nextbn = bn + bscale; - error = breadn(vp, bn, (int)bsize, &nextbn, + error = buf_breadn(vp, bn, (int)bsize, &nextbn, (int *)&bsize, 1, NOCRED, &bp); } else - error = bread(vp, bn, (int)bsize, NOCRED, &bp); + error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp); - vp->v_lastr = bn; - n = bsize - bp->b_resid; + vnode_lock(vp); + vp->v_speclastr = bn; + vnode_unlock(vp); + + n = bsize - buf_resid(bp); if ((on > n) || error) { if (!error) error = EINVAL; - brelse(bp); + buf_brelse(bp); return (error); } - n = min((unsigned)(n - on), uio->uio_resid); + // LP64todo - fix this! 
+ n = min((unsigned)(n - on), uio_resid(uio)); - error = uiomove((char *)bp->b_data + on, n, uio); + error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); if (n + on == bsize) - bp->b_flags |= B_AGE; - brelse(bp); - } while (error == 0 && uio->uio_resid > 0 && n != 0); + buf_markaged(bp); + buf_brelse(bp); + } while (error == 0 && uio_resid(uio) > 0 && n != 0); return (error); default: panic("spec_read type"); } /* NOTREACHED */ + + return (0); } /* * Vnode op for write */ -/* ARGSUSED */ -spec_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +int +spec_write(struct vnop_write_args *ap) { - register struct vnode *vp = ap->a_vp; - register struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; struct buf *bp; - daddr_t bn; + daddr64_t bn; int bsize, blkmask, bscale; - register int io_sync; - register int io_size; + int io_sync; + int io_size; int devBlockSize=0; - register int n, on; + int n, on; int error = 0; dev_t dev; #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("spec_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc()) + if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) panic("spec_write proc"); #endif switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp, 0, p); error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: - if (uio->uio_resid == 0) + if (uio_resid(uio) == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); io_sync = (ap->a_ioflag & IO_SYNC); - io_size = uio->uio_resid; + // LP64todo - fix this! 
+ io_size = uio_resid(uio); dev = (vp->v_rdev); @@ -430,92 +451,106 @@ spec_write(ap) do { - bn = (uio->uio_offset / devBlockSize) &~ blkmask; + bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask); on = uio->uio_offset % bsize; - n = min((unsigned)(bsize - on), uio->uio_resid); + // LP64todo - fix this! + n = min((unsigned)(bsize - on), uio_resid(uio)); + + /* + * Use buf_getblk() as an optimization IFF: + * + * 1) We are reading exactly a block on a block + * aligned boundary + * 2) We know the size of the device from spec_open + * 3) The read doesn't span the end of the device + * + * Otherwise, we fall back on buf_bread(). + */ + if (n == bsize && + vp->v_specdevsize != (u_int64_t)0 && + (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) { + /* reduce the size of the read to what is there */ + n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize; + } if (n == bsize) - bp = getblk(vp, bn, bsize, 0, 0, BLK_WRITE); + bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE); else - error = bread(vp, bn, bsize, NOCRED, &bp); + error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp); + /* Translate downstream error for upstream, if needed */ + if (!error) + error = (int)buf_error(bp); if (error) { - brelse(bp); + buf_brelse(bp); return (error); } - n = min(n, bsize - bp->b_resid); - - error = uiomove((char *)bp->b_data + on, n, uio); + n = min(n, bsize - buf_resid(bp)); - bp->b_flags |= B_AGE; + error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); + if (error) { + buf_brelse(bp); + return (error); + } + buf_markaged(bp); if (io_sync) - bwrite(bp); + error = buf_bwrite(bp); else { if ((n + on) == bsize) - bawrite(bp); + error = buf_bawrite(bp); else - bdwrite(bp); + error = buf_bdwrite(bp); } - } while (error == 0 && uio->uio_resid > 0 && n != 0); + } while (error == 0 && uio_resid(uio) > 0 && n != 0); return (error); default: panic("spec_write type"); } /* NOTREACHED */ + + return (0); } /* * Device ioctl operation. 
*/ -/* ARGSUSED */ -spec_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +int +spec_ioctl(struct vnop_ioctl_args *ap) { + proc_t p = vfs_context_proc(ap->a_context); dev_t dev = ap->a_vp->v_rdev; switch (ap->a_vp->v_type) { case VCHR: return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_p)); + ap->a_fflag, p)); case VBLK: - if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) + if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) { if (bdevsw[major(dev)].d_type == D_TAPE) return (0); else return (1); + } return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, - ap->a_fflag, ap->a_p)); + ap->a_fflag, p)); default: panic("spec_ioctl"); /* NOTREACHED */ } + return (0); } -/* ARGSUSED */ -spec_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; - int a_which; - int a_fflags; - struct ucred *a_cred; - void * a_wql; - struct proc *a_p; - } */ *ap; +int +spec_select(struct vnop_select_args *ap) { - register dev_t dev; + proc_t p = vfs_context_proc(ap->a_context); + dev_t dev; switch (ap->a_vp->v_type) { @@ -524,130 +559,299 @@ spec_select(ap) case VCHR: dev = ap->a_vp->v_rdev; - return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, ap->a_p); + return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p); } } + /* * Synch buffers associated with a block device */ -/* ARGSUSED */ int -spec_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - struct ucred *a_cred; - int a_waitfor; - struct proc *a_p; - } */ *ap; +spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context) { - register struct vnode *vp = ap->a_vp; - register struct buf *bp; - struct buf *nbp; - int s; - if (vp->v_type == VCHR) return (0); /* * Flush all dirty buffers associated with a block device. 
*/ -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) - continue; - if ((bp->b_flags & B_DELWRI) == 0) - panic("spec_fsync: not dirty"); - bremfree(bp); - bp->b_flags |= B_BUSY; - splx(s); - bawrite(bp); - goto loop; - } - if (ap->a_waitfor == MNT_WAIT) { - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spec_fsync", 0); - } -#if DIAGNOSTIC - if (vp->v_dirtyblkhd.lh_first) { - vprint("spec_fsync: dirty", vp); - splx(s); - goto loop; - } -#endif - } - splx(s); + buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync"); + return (0); } +int +spec_fsync(struct vnop_fsync_args *ap) +{ + return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context); +} + /* * Just call the device strategy routine */ -spec_strategy(ap) - struct vop_strategy_args /* { - struct buf *a_bp; - } */ *ap; +extern int hard_throttle_on_root; +void IOSleep(int); + +// the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond +#define LOWPRI_INITIAL_WINDOW_MSECS 100 +#define LOWPRI_WINDOW_MSECS_INC 50 +#define LOWPRI_MAX_WINDOW_MSECS 200 +#define LOWPRI_MAX_WAITING_MSECS 200 +#define LOWPRI_SLEEP_INTERVAL 5 + +struct _throttle_io_info_t { + struct timeval last_normal_IO_timestamp; + struct timeval last_IO_timestamp; + SInt32 numthreads_throttling; +}; + +struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; +int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS; +int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC; +int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS; +int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS; + +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); 
+SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); + +void +throttle_info_get_last_io_time(mount_t mp, struct timeval *tv) { - (*bdevsw[major(ap->a_bp->b_dev)].d_strategy)(ap->a_bp); - return (0); + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; + + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + *tv = _throttle_io_info[devbsdunit].last_IO_timestamp; + } else { + memset(tv, 0, sizeof(*tv)); + } } -/* - * This is a noop, simply returning what one has been given. - */ -spec_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; +void +update_last_io_time(mount_t mp) { + size_t devbsdunit; + + devbsdunit = mp->mnt_devbsdunit; - if (ap->a_vpp != NULL) - *ap->a_vpp = ap->a_vp; - if (ap->a_bnp != NULL) - *ap->a_bnp = ap->a_bn * (PAGE_SIZE / ap->a_vp->v_specsize); - if (ap->a_runp != NULL) - *ap->a_runp = (MAXPHYSIO / PAGE_SIZE) - 1; - return (0); + if (devbsdunit < LOWPRI_MAX_NUM_DEV) { + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } +} + +int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit) +{ + struct timeval elapsed; + int elapsed_msecs; + + microuptime(&elapsed); + timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp); + elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; + + if (lowpri_window_msecs == -1) // use the max waiting time + lowpri_window_msecs = lowpri_max_waiting_msecs; + + return elapsed_msecs < lowpri_window_msecs; } +void throttle_lowpri_io(boolean_t ok_to_sleep) +{ + int i; + int max_try_num; + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + + if (ut->uu_lowpri_window == 0) + return; + + max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, 
_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, + ut->uu_lowpri_window, 0, 0, 0, 0); + + if (ok_to_sleep == TRUE) { + for (i=0; i<max_try_num; i++) { + if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) { + IOSleep(LOWPRI_SLEEP_INTERVAL); + } else { + break; + } + } + } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, + ut->uu_lowpri_window, i*5, 0, 0, 0); + SInt32 oldValue; + oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + ut->uu_lowpri_window = 0; + + if (oldValue <= 0) { + panic("%s: numthreads negative", __func__); + } +} + +int throttle_get_io_policy(struct uthread **ut) +{ + int policy = IOPOL_DEFAULT; + proc_t p = current_proc(); + + *ut = get_bsdthread_info(current_thread()); + + if (p != NULL) + policy = p->p_iopol_disk; + + if (*ut != NULL) { + // the I/O policy of the thread overrides that of the process + // unless the I/O policy of the thread is default + if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) + policy = (*ut)->uu_iopol_disk; + } + return policy; +} + +int +spec_strategy(struct vnop_strategy_args *ap) +{ + buf_t bp; + int bflags; + dev_t bdev; + + bp = ap->a_bp; + bdev = buf_device(bp); + bflags = buf_flags(bp); + + if (kdebug_enable) { + int code = 0; + + if (bflags & B_READ) + code |= DKIO_READ; + if (bflags & B_ASYNC) + code |= DKIO_ASYNC; + + if (bflags & B_META) + code |= DKIO_META; + else if (bflags & B_PAGEIO) + code |= DKIO_PAGING; + + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, + (unsigned int)bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0); + } + if (((bflags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && + (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) + hard_throttle_on_root = 1; + + if (lowpri_IO_initial_window_msecs) { + struct uthread *ut; + int policy; + int is_throttleable_io = 0; + int is_passive_io = 0; + size_t devbsdunit; + SInt32 oldValue; + + policy = throttle_get_io_policy(&ut); + + switch (policy) { 
+ case IOPOL_DEFAULT: + case IOPOL_NORMAL: + break; + case IOPOL_THROTTLE: + is_throttleable_io = 1; + break; + case IOPOL_PASSIVE: + is_passive_io = 1; + break; + default: + printf("unknown I/O policy %d", policy); + break; + } + + if (!is_throttleable_io && ISSET(bflags, B_PASSIVE)) + is_passive_io |= 1; + + if (buf_vnode(bp)->v_mount != NULL) + devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + if (!is_throttleable_io) { + if (!is_passive_io){ + microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp); + } + } else { + /* + * I'd really like to do the IOSleep here, but + * we may be holding all kinds of filesystem related locks + * and the pages for this I/O marked 'busy'... + * we don't want to cause a normal task to block on + * one of these locks while we're throttling a task marked + * for low priority I/O... we'll mark the uthread and + * do the delay just before we return from the system + * call that triggered this I/O or from vnode_pagein + */ + if (ut->uu_lowpri_window == 0) { + ut->uu_devbsdunit = devbsdunit; + oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling); + if (oldValue < 0) { + panic("%s: numthreads negative", __func__); + } + ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; + } else { + if (ut->uu_devbsdunit != devbsdunit) { // the thread sends I/Os to different devices within the same system call + // keep track of the numthreads in the right device + OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling); + OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling); + ut->uu_devbsdunit = devbsdunit; + } + int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling); + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads; + if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads) + ut->uu_lowpri_window = 
lowpri_max_window_msecs * numthreads; + } + } + } + + if ((bflags & B_READ) == 0) { + size_t devbsdunit; + + if (buf_vnode(bp)->v_mount != NULL) + devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit; + else + devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + + microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + } + + (*bdevsw[major(bdev)].d_strategy)(bp); + + return (0); +} + + /* * This is a noop, simply returning what one has been given. */ -spec_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; - } */ *ap; +int +spec_blockmap(__unused struct vnop_blockmap_args *ap) { - return (EOPNOTSUPP); + return (ENOTSUP); } /* * Device close routine */ -/* ARGSUSED */ -spec_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +int +spec_close(struct vnop_close_args *ap) { - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; - int (*devclose) __P((dev_t, int, int, struct proc *)); + int (*devclose)(dev_t, int, int, struct proc *); int mode, error; + int flags = ap->a_fflag; + struct proc *p = vfs_context_proc(ap->a_context); + struct session *sessp; switch (vp->v_type) { @@ -661,20 +865,30 @@ spec_close(ap) * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. 
*/ - if (vcount(vp) == 2 && ap->a_p && - vp == ap->a_p->p_session->s_ttyvp) { - vrele(vp); - ap->a_p->p_session->s_ttyvp = NULL; + sessp = proc_session(p); + if (sessp != SESSION_NULL) { + if ((vcount(vp) == 2) && + (vp == sessp->s_ttyvp)) { + session_lock(sessp); + sessp->s_ttyvp = NULL; + sessp->s_ttyvid = 0; + sessp->s_ttyp = NULL; + sessp->s_ttypgrpid = NO_PID; + session_unlock(sessp); + vnode_rele(vp); + } + session_rele(sessp); } + + devclose = cdevsw[major(dev)].d_close; + mode = S_IFCHR; /* - * If the vnode is locked, then we are in the midst - * of forcably closing the device, otherwise we only - * close on last reference. + * close on last reference or on vnode revoke call */ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if ((flags & IO_REVOKE) != 0) + break; + if (vcount(vp) > 1) return (0); - devclose = cdevsw[major(dev)].d_close; - mode = S_IFCHR; break; case VBLK: @@ -684,33 +898,30 @@ spec_close(ap) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); - VOP_UNLOCK(vp, 0, ap->a_p); + if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) + return (error); + + error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); if (error) return (error); /* - * We do not want to really close the device if it - * is still in use unless we are trying to close it - * forcibly. Since every use (buffer, vnode, swap, cmap) + * Since every use (buffer, vnode, swap, blockmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. 
*/ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if (vcount(vp) > 0) return (0); #else /* DEVFS_IMPLEMENTS_LOCKING */ /* - * We do not want to really close the device if it - * is still in use unless we are trying to close it - * forcibly. Since every use (buffer, vnode, swap, cmap) + * Since every use (buffer, vnode, swap, blockmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ - if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + if (vcount(vp) > 0) return (0); /* @@ -718,7 +929,10 @@ spec_close(ap) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ - error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); + if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) + return (error); + + error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); if (error) return (error); #endif /* DEVFS_IMPLEMENTS_LOCKING */ @@ -728,33 +942,17 @@ spec_close(ap) default: panic("spec_close: not special"); + return(EBADF); } - return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); -} - -/* - * Print out the contents of a special device vnode. - */ -spec_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - - printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), - minor(ap->a_vp->v_rdev)); + return ((*devclose)(dev, flags, mode, p)); } /* * Return POSIX pathconf information applicable to special devices. 
*/ -spec_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - } */ *ap; +int +spec_pathconf(struct vnop_pathconf_args *ap) { switch (ap->a_name) { @@ -771,7 +969,7 @@ spec_pathconf(ap) *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: - *ap->a_retval = 1; + *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */ return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; @@ -782,86 +980,60 @@ spec_pathconf(ap) /* NOTREACHED */ } -int -spec_devblocksize(ap) - struct vop_devblocksize_args /* { - struct vnode *a_vp; - int *a_retval; - } */ *ap; -{ - *ap->a_retval = (ap->a_vp->v_specsize); - return (0); -} - /* * Special device failed operation */ -spec_ebadf() +int +spec_ebadf(__unused void *dummy) { return (EBADF); } -/* - * Special device bad operation - */ -spec_badop() -{ - - panic("spec_badop called"); - /* NOTREACHED */ -} - /* Blktooff derives file offset from logical block number */ int -spec_blktooff(ap) - struct vop_blktooff_args /* { - struct vnode *a_vp; - daddr_t a_lblkno; - off_t *a_offset; - } */ *ap; +spec_blktooff(struct vnop_blktooff_args *ap) { - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VCHR: *ap->a_offset = (off_t)-1; /* failure */ - return (EOPNOTSUPP); + return (ENOTSUP); case VBLK: printf("spec_blktooff: not implemented for VBLK\n"); *ap->a_offset = (off_t)-1; /* failure */ - return (EOPNOTSUPP); + return (ENOTSUP); default: panic("spec_blktooff type"); } /* NOTREACHED */ + + return (0); } /* Offtoblk derives logical block number from file offset */ int -spec_offtoblk(ap) - struct vop_offtoblk_args /* { - struct vnode *a_vp; - off_t a_offset; - daddr_t *a_lblkno; - } */ *ap; +spec_offtoblk(struct vnop_offtoblk_args *ap) { - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; switch (vp->v_type) { case VCHR: - *ap->a_lblkno = (daddr_t)-1; /* failure */ - return (EOPNOTSUPP); + *ap->a_lblkno = (daddr64_t)-1; /* 
failure */ + return (ENOTSUP); case VBLK: printf("spec_offtoblk: not implemented for VBLK\n"); - *ap->a_lblkno = (daddr_t)-1; /* failure */ - return (EOPNOTSUPP); + *ap->a_lblkno = (daddr64_t)-1; /* failure */ + return (ENOTSUP); default: panic("spec_offtoblk type"); } /* NOTREACHED */ + + return (0); }