git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1989, 1993, 1995
	31	* The Regents of the University of California. All rights reserved.
	32	*
	33	* Redistribution and use in source and binary forms, with or without
	34	* modification, are permitted provided that the following conditions
	35	* are met:
	36	* 1. Redistributions of source code must retain the above copyright
	37	* notice, this list of conditions and the following disclaimer.
	38	* 2. Redistributions in binary form must reproduce the above copyright
	39	* notice, this list of conditions and the following disclaimer in the
	40	* documentation and/or other materials provided with the distribution.
	41	* 3. All advertising materials mentioning features or use of this software
	42	* must display the following acknowledgement:
	43	* This product includes software developed by the University of
	44	* California, Berkeley and its contributors.
	45	* 4. Neither the name of the University nor the names of its contributors
	46	* may be used to endorse or promote products derived from this software
	47	* without specific prior written permission.
	48	*
	49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	59	* SUCH DAMAGE.
	60	*
	61	* @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
	62	*/
	63
	64	#include <sys/param.h>
	65	#include <sys/proc_internal.h>
	66	#include <sys/kauth.h>
	67	#include <sys/systm.h>
	68	#include <sys/kernel.h>
	69	#include <sys/conf.h>
	70	#include <sys/buf_internal.h>
	71	#include <sys/mount_internal.h>
	72	#include <sys/vnode_internal.h>
	73	#include <sys/file_internal.h>
	74	#include <sys/namei.h>
	75	#include <sys/stat.h>
	76	#include <sys/errno.h>
	77	#include <sys/ioctl.h>
	78	#include <sys/file.h>
	79	#include <sys/user.h>
	80	#include <sys/malloc.h>
	81	#include <sys/disk.h>
	82	#include <sys/uio_internal.h>
	83	#include <sys/resource.h>
	84	#include <miscfs/specfs/specdev.h>
	85	#include <vfs/vfs_support.h>
	86	#include <kern/assert.h>
	87	#include <kern/task.h>
	88	#include <pexpert/pexpert.h>
	89
	90	#include <sys/kdebug.h>
	91
	92	/* XXX following three prototypes should be in a header file somewhere */
	93	extern dev_t chrtoblk(dev_t dev);
	94	extern boolean_t iskmemdev(dev_t dev);
	95	extern int bpfkqfilter(dev_t dev, struct knote *kn);
	96	extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
	97
	98	extern int ignore_is_ssd;
	99
	100	struct vnode *speclisth[SPECHSZ];
	101
	102	/* symbolic sleep message strings for devices */
	103	char devopn[] = "devopn";
	104	char devio[] = "devio";
	105	char devwait[] = "devwait";
	106	char devin[] = "devin";
	107	char devout[] = "devout";
	108	char devioc[] = "devioc";
	109	char devcls[] = "devcls";
	110
	111	#define VOPFUNC int ()(void )
	112
	113	int (*spec_vnodeop_p)(void );
	114	struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	115	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	116	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
	117	{ &vnop_create_desc, (VOPFUNC)err_create }, /* create */
	118	{ &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
	119	{ &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
	120	{ &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
	121	{ &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
	122	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
	123	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
	124	{ &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
	125	{ &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
	126	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
	127	{ &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
	128	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
	129	{ &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
	130	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
	131	{ &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
	132	{ &vnop_link_desc, (VOPFUNC)err_link }, /* link */
	133	{ &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
	134	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
	135	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
	136	{ &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
	137	{ &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
	138	{ &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
	139	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
	140	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
	141	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
	142	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
	143	{ &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
	144	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
	145	{ &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
	146	{ &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
	147	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
	148	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
	149	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
	150	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
	151	{ (struct vnodeop_desc)NULL, (int()())NULL }
	152	};
	153	struct vnodeopv_desc spec_vnodeop_opv_desc =
	154	{ &spec_vnodeop_p, spec_vnodeop_entries };
	155
	156
	157	static void set_blocksize(vnode_t, dev_t);
	158
	159	#define LOWPRI_TIER1_WINDOW_MSECS 25
	160	#define LOWPRI_TIER2_WINDOW_MSECS 100
	161	#define LOWPRI_TIER3_WINDOW_MSECS 500
	162
	163	#define LOWPRI_TIER1_IO_PERIOD_MSECS 15
	164	#define LOWPRI_TIER2_IO_PERIOD_MSECS 50
	165	#define LOWPRI_TIER3_IO_PERIOD_MSECS 200
	166
	167	#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS 5
	168	#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS 15
	169	#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS 25
	170
	171
	172	int throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
	173	0,
	174	LOWPRI_TIER1_WINDOW_MSECS,
	175	LOWPRI_TIER2_WINDOW_MSECS,
	176	LOWPRI_TIER3_WINDOW_MSECS,
	177	};
	178
	179	int throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
	180	0,
	181	LOWPRI_TIER1_IO_PERIOD_MSECS,
	182	LOWPRI_TIER2_IO_PERIOD_MSECS,
	183	LOWPRI_TIER3_IO_PERIOD_MSECS,
	184	};
	185
	186	int throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
	187	0,
	188	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
	189	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
	190	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
	191	};
	192
	193
	194	int throttled_count[THROTTLE_LEVEL_END + 1];
	195
	196	struct _throttle_io_info_t {
	197	lck_mtx_t throttle_lock;
	198
	199	struct timeval throttle_last_write_timestamp;
	200	struct timeval throttle_min_timer_deadline;
	201	struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1];
	202	struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	203	pid_t throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
	204	struct timeval throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
	205
	206	TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1]; /* Lists of throttled uthreads */
	207	int throttle_next_wake_level;
	208
	209	thread_call_t throttle_timer_call;
	210	int32_t throttle_timer_ref;
	211	int32_t throttle_timer_active;
	212
	213	int32_t throttle_io_count;
	214	int32_t throttle_io_count_begin;
	215	int *throttle_io_periods;
	216	uint32_t throttle_io_period_num;
	217
	218	int32_t throttle_refcnt;
	219	int32_t throttle_alloc;
	220	int32_t throttle_disabled;
	221	int32_t throttle_is_fusion_with_priority;
	222	};
	223
	224	struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
	225
	226
	227	int lowpri_throttle_enabled = 1;
	228
	229
	230
	231	static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd);
	232	static int throttle_get_thread_throttle_level(uthread_t ut);
	233
	234	/*
	235	* Trivial lookup routine that always fails.
	236	*/
	237	int
	238	spec_lookup(struct vnop_lookup_args *ap)
	239	{
	240
	241	*ap->a_vpp = NULL;
	242	return (ENOTDIR);
	243	}
	244
	245	static void
	246	set_blocksize(struct vnode *vp, dev_t dev)
	247	{
	248	int (*size)(dev_t);
	249	int rsize;
	250
	251	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
	252	rsize = (*size)(dev);
	253	if (rsize <= 0) /* did size fail? */
	254	vp->v_specsize = DEV_BSIZE;
	255	else
	256	vp->v_specsize = rsize;
	257	}
	258	else
	259	vp->v_specsize = DEV_BSIZE;
	260	}
	261
	262	void
	263	set_fsblocksize(struct vnode *vp)
	264	{
	265
	266	if (vp->v_type == VBLK) {
	267	dev_t dev = (dev_t)vp->v_rdev;
	268	int maj = major(dev);
	269
	270	if ((u_int)maj >= (u_int)nblkdev)
	271	return;
	272
	273	vnode_lock(vp);
	274	set_blocksize(vp, dev);
	275	vnode_unlock(vp);
	276	}
	277
	278	}
	279
	280
	281	/*
	282	* Open a special file.
	283	*/
	284	int
	285	spec_open(struct vnop_open_args *ap)
	286	{
	287	struct proc *p = vfs_context_proc(ap->a_context);
	288	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	289	struct vnode *vp = ap->a_vp;
	290	dev_t bdev, dev = (dev_t)vp->v_rdev;
	291	int maj = major(dev);
	292	int error;
	293
	294	/*
	295	* Don't allow open if fs is mounted -nodev.
	296	*/
	297	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
	298	return (ENXIO);
	299
	300	switch (vp->v_type) {
	301
	302	case VCHR:
	303	if ((u_int)maj >= (u_int)nchrdev)
	304	return (ENXIO);
	305	if (cred != FSCRED && (ap->a_mode & FWRITE)) {
	306	/*
	307	* When running in very secure mode, do not allow
	308	* opens for writing of any disk character devices.
	309	*/
	310	if (securelevel >= 2 && isdisk(dev, VCHR))
	311	return (EPERM);
	312
	313	/* Never allow writing to /dev/mem or /dev/kmem */
	314	if (iskmemdev(dev))
	315	return (EPERM);
	316	/*
	317	* When running in secure mode, do not allow opens for
	318	* writing of character devices whose corresponding block
	319	* devices are currently mounted.
	320	*/
	321	if (securelevel >= 1) {
	322	if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
	323	return (error);
	324	}
	325	}
	326
	327	devsw_lock(dev, S_IFCHR);
	328	error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
	329
	330	if (error == 0) {
	331	vp->v_specinfo->si_opencount++;
	332	}
	333
	334	devsw_unlock(dev, S_IFCHR);
	335
	336	if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
	337	int isssd = 0;
	338	uint64_t throttle_mask = 0;
	339	uint32_t devbsdunit = 0;
	340
	341	if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
	342
	343	if (throttle_mask != 0 &&
	344	VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
	345	/*
	346	* as a reasonable approximation, only use the lowest bit of the mask
	347	* to generate a disk unit number
	348	*/
	349	devbsdunit = num_trailing_0(throttle_mask);
	350
	351	vnode_lock(vp);
	352
	353	vp->v_un.vu_specinfo->si_isssd = isssd;
	354	vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
	355	vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
	356	vp->v_un.vu_specinfo->si_throttleable = 1;
	357	vp->v_un.vu_specinfo->si_initted = 1;
	358
	359	vnode_unlock(vp);
	360	}
	361	}
	362	if (vp->v_un.vu_specinfo->si_initted == 0) {
	363	vnode_lock(vp);
	364	vp->v_un.vu_specinfo->si_initted = 1;
	365	vnode_unlock(vp);
	366	}
	367	}
	368	return (error);
	369
	370	case VBLK:
	371	if ((u_int)maj >= (u_int)nblkdev)
	372	return (ENXIO);
	373	/*
	374	* When running in very secure mode, do not allow
	375	* opens for writing of any disk block devices.
	376	*/
	377	if (securelevel >= 2 && cred != FSCRED &&
	378	(ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
	379	return (EPERM);
	380	/*
	381	* Do not allow opens of block devices that are
	382	* currently mounted.
	383	*/
	384	if ( (error = vfs_mountedon(vp)) )
	385	return (error);
	386
	387	devsw_lock(dev, S_IFBLK);
	388	error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
	389	if (!error) {
	390	vp->v_specinfo->si_opencount++;
	391	}
	392	devsw_unlock(dev, S_IFBLK);
	393
	394	if (!error) {
	395	u_int64_t blkcnt;
	396	u_int32_t blksize;
	397	int setsize = 0;
	398	u_int32_t size512 = 512;
	399
	400
	401	if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
	402	/* Switch to 512 byte sectors (temporarily) */
	403
	404	if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
	405	/* Get the number of 512 byte physical blocks. */
	406	if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
	407	setsize = 1;
	408	}
	409	}
	410	/* If it doesn't set back, we can't recover */
	411	if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
	412	error = ENXIO;
	413	}
	414
	415
	416	vnode_lock(vp);
	417	set_blocksize(vp, dev);
	418
	419	/*
	420	* Cache the size in bytes of the block device for later
	421	* use by spec_write().
	422	*/
	423	if (setsize)
	424	vp->v_specdevsize = blkcnt * (u_int64_t)size512;
	425	else
	426	vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
	427
	428	vnode_unlock(vp);
	429
	430	}
	431	return(error);
	432	default:
	433	panic("spec_open type");
	434	}
	435	return (0);
	436	}
	437
	438	/*
	439	* Vnode op for read
	440	*/
	441	int
	442	spec_read(struct vnop_read_args *ap)
	443	{
	444	struct vnode *vp = ap->a_vp;
	445	struct uio *uio = ap->a_uio;
	446	struct buf *bp;
	447	daddr64_t bn, nextbn;
	448	long bsize, bscale;
	449	int devBlockSize=0;
	450	int n, on;
	451	int error = 0;
	452	dev_t dev;
	453
	454	#if DIAGNOSTIC
	455	if (uio->uio_rw != UIO_READ)
	456	panic("spec_read mode");
	457	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
	458	panic("spec_read proc");
	459	#endif
	460	if (uio_resid(uio) == 0)
	461	return (0);
	462
	463	switch (vp->v_type) {
	464
	465	case VCHR:
	466	if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
	467	struct _throttle_io_info_t *throttle_info;
	468
	469	throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
	470	throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd);
	471	}
	472	error = (*cdevsw[major(vp->v_rdev)].d_read)
	473	(vp->v_rdev, uio, ap->a_ioflag);
	474
	475	return (error);
	476
	477	case VBLK:
	478	if (uio->uio_offset < 0)
	479	return (EINVAL);
	480
	481	dev = vp->v_rdev;
	482
	483	devBlockSize = vp->v_specsize;
	484
	485	if (devBlockSize > PAGE_SIZE)
	486	return (EINVAL);
	487
	488	bscale = PAGE_SIZE / devBlockSize;
	489	bsize = bscale * devBlockSize;
	490
	491	do {
	492	on = uio->uio_offset % bsize;
	493
	494	bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
	495
	496	if (vp->v_speclastr + bscale == bn) {
	497	nextbn = bn + bscale;
	498	error = buf_breadn(vp, bn, (int)bsize, &nextbn,
	499	(int *)&bsize, 1, NOCRED, &bp);
	500	} else
	501	error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
	502
	503	vnode_lock(vp);
	504	vp->v_speclastr = bn;
	505	vnode_unlock(vp);
	506
	507	n = bsize - buf_resid(bp);
	508	if ((on > n) \|\| error) {
	509	if (!error)
	510	error = EINVAL;
	511	buf_brelse(bp);
	512	return (error);
	513	}
	514	n = min((unsigned)(n - on), uio_resid(uio));
	515
	516	error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
	517	if (n + on == bsize)
	518	buf_markaged(bp);
	519	buf_brelse(bp);
	520	} while (error == 0 && uio_resid(uio) > 0 && n != 0);
	521	return (error);
	522
	523	default:
	524	panic("spec_read type");
	525	}
	526	/* NOTREACHED */
	527
	528	return (0);
	529	}
	530
	531	/*
	532	* Vnode op for write
	533	*/
	534	int
	535	spec_write(struct vnop_write_args *ap)
	536	{
	537	struct vnode *vp = ap->a_vp;
	538	struct uio *uio = ap->a_uio;
	539	struct buf *bp;
	540	daddr64_t bn;
	541	int bsize, blkmask, bscale;
	542	int io_sync;
	543	int devBlockSize=0;
	544	int n, on;
	545	int error = 0;
	546	dev_t dev;
	547
	548	#if DIAGNOSTIC
	549	if (uio->uio_rw != UIO_WRITE)
	550	panic("spec_write mode");
	551	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
	552	panic("spec_write proc");
	553	#endif
	554
	555	switch (vp->v_type) {
	556
	557	case VCHR:
	558	if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
	559	struct _throttle_io_info_t *throttle_info;
	560
	561	throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
	562
	563	throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd);
	564
	565	microuptime(&throttle_info->throttle_last_write_timestamp);
	566	}
	567	error = (*cdevsw[major(vp->v_rdev)].d_write)
	568	(vp->v_rdev, uio, ap->a_ioflag);
	569
	570	return (error);
	571
	572	case VBLK:
	573	if (uio_resid(uio) == 0)
	574	return (0);
	575	if (uio->uio_offset < 0)
	576	return (EINVAL);
	577
	578	io_sync = (ap->a_ioflag & IO_SYNC);
	579
	580	dev = (vp->v_rdev);
	581
	582	devBlockSize = vp->v_specsize;
	583	if (devBlockSize > PAGE_SIZE)
	584	return(EINVAL);
	585
	586	bscale = PAGE_SIZE / devBlockSize;
	587	blkmask = bscale - 1;
	588	bsize = bscale * devBlockSize;
	589
	590
	591	do {
	592	bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
	593	on = uio->uio_offset % bsize;
	594
	595	n = min((unsigned)(bsize - on), uio_resid(uio));
	596
	597	/*
	598	* Use buf_getblk() as an optimization IFF:
	599	*
	600	* 1) We are reading exactly a block on a block
	601	* aligned boundary
	602	* 2) We know the size of the device from spec_open
	603	* 3) The read doesn't span the end of the device
	604	*
	605	* Otherwise, we fall back on buf_bread().
	606	*/
	607	if (n == bsize &&
	608	vp->v_specdevsize != (u_int64_t)0 &&
	609	(uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
	610	/* reduce the size of the read to what is there */
	611	n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
	612	}
	613
	614	if (n == bsize)
	615	bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
	616	else
	617	error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
	618
	619	/* Translate downstream error for upstream, if needed */
	620	if (!error)
	621	error = (int)buf_error(bp);
	622	if (error) {
	623	buf_brelse(bp);
	624	return (error);
	625	}
	626	n = min(n, bsize - buf_resid(bp));
	627
	628	error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
	629	if (error) {
	630	buf_brelse(bp);
	631	return (error);
	632	}
	633	buf_markaged(bp);
	634
	635	if (io_sync)
	636	error = buf_bwrite(bp);
	637	else {
	638	if ((n + on) == bsize)
	639	error = buf_bawrite(bp);
	640	else
	641	error = buf_bdwrite(bp);
	642	}
	643	} while (error == 0 && uio_resid(uio) > 0 && n != 0);
	644	return (error);
	645
	646	default:
	647	panic("spec_write type");
	648	}
	649	/* NOTREACHED */
	650
	651	return (0);
	652	}
	653
	654	/*
	655	* Device ioctl operation.
	656	*/
	657	int
	658	spec_ioctl(struct vnop_ioctl_args *ap)
	659	{
	660	proc_t p = vfs_context_proc(ap->a_context);
	661	dev_t dev = ap->a_vp->v_rdev;
	662	int retval = 0;
	663
	664	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) \| DBG_FUNC_START,
	665	dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
	666
	667	switch (ap->a_vp->v_type) {
	668
	669	case VCHR:
	670	retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
	671	ap->a_fflag, p);
	672	break;
	673
	674	case VBLK:
	675	if (kdebug_enable) {
	676	if (ap->a_command == DKIOCUNMAP) {
	677	dk_unmap_t *unmap;
	678	dk_extent_t *extent;
	679	uint32_t i;
	680
	681	unmap = (dk_unmap_t *)ap->a_data;
	682	extent = unmap->extents;
	683
	684	for (i = 0; i < unmap->extentsCount; i++, extent++) {
	685	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) \| DBG_FUNC_NONE, dev,
	686	extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);
	687	}
	688	} else if (ap->a_command == DKIOCSYNCHRONIZE) {
	689	dk_synchronize_t *synch;
	690	synch = (dk_synchronize_t *)ap->a_data;
	691	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) \| DBG_FUNC_NONE, dev, ap->a_command,
	692	synch->options, 0, 0);
	693	}
	694	}
	695	retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
	696	break;
	697
	698	default:
	699	panic("spec_ioctl");
	700	/* NOTREACHED */
	701	}
	702	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) \| DBG_FUNC_END,
	703	dev, ap->a_command, ap->a_fflag, retval, 0);
	704
	705	return (retval);
	706	}
	707
	708	int
	709	spec_select(struct vnop_select_args *ap)
	710	{
	711	proc_t p = vfs_context_proc(ap->a_context);
	712	dev_t dev;
	713
	714	switch (ap->a_vp->v_type) {
	715
	716	default:
	717	return (1); /* XXX */
	718
	719	case VCHR:
	720	dev = ap->a_vp->v_rdev;
	721	return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	722	}
	723	}
	724
	725	static int filt_specattach(struct knote *kn);
	726
	727	int
	728	spec_kqfilter(vnode_t vp, struct knote *kn)
	729	{
	730	dev_t dev;
	731	int err;
	732
	733	assert(vnode_ischr(vp));
	734
	735	dev = vnode_specrdev(vp);
	736
	737	#if NETWORKING
	738	/* Try a bpf device, as defined in bsd/net/bpf.c */
	739	if ((err = bpfkqfilter(dev, kn)) == 0) {
	740	return err;
	741	}
	742	#endif
	743	/* Try to attach to other char special devices */
	744	err = filt_specattach(kn);
	745
	746	return err;
	747	}
	748
	749	/*
	750	* Synch buffers associated with a block device
	751	*/
	752	int
	753	spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
	754	{
	755	if (vp->v_type == VCHR)
	756	return (0);
	757	/*
	758	* Flush all dirty buffers associated with a block device.
	759	*/
	760	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT \|\| waitfor == MNT_DWAIT), 0, "spec_fsync");
	761
	762	return (0);
	763	}
	764
	765	int
	766	spec_fsync(struct vnop_fsync_args *ap)
	767	{
	768	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
	769	}
	770
	771
	772	/*
	773	* Just call the device strategy routine
	774	*/
	775	void throttle_init(void);
	776
	777
	778	#if 0
	779	#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
	780	do { \
	781	if ((debug_info)->alloc) \
	782	printf("%s: "format, __FUNCTION__, ## args); \
	783	} while(0)
	784
	785	#else
	786	#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
	787	#endif
	788
	789
	790	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
	791	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
	792	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");
	793
	794	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
	795	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
	796	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");
	797
	798	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
	799	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
	800	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");
	801
	802	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW \| CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
	803
	804
	805	static lck_grp_t *throttle_mtx_grp;
	806	static lck_attr_t *throttle_mtx_attr;
	807	static lck_grp_attr_t *throttle_mtx_grp_attr;
	808
	809
	810	/*
	811	* throttled I/O helper function
	812	* convert the index of the lowest set bit to a device index
	813	*/
	814	int
	815	num_trailing_0(uint64_t n)
	816	{
	817	/*
	818	* since in most cases the number of trailing 0s is very small,
	819	* we simply counting sequentially from the lowest bit
	820	*/
	821	if (n == 0)
	822	return sizeof(n) * 8;
	823	int count = 0;
	824	while (!ISSET(n, 1)) {
	825	n >>= 1;
	826	++count;
	827	}
	828	return count;
	829	}
	830
	831
	832	/*
	833	* Release the reference and if the item was allocated and this is the last
	834	* reference then free it.
	835	*
	836	* This routine always returns the old value.
	837	*/
	838	static int
	839	throttle_info_rel(struct _throttle_io_info_t *info)
	840	{
	841	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
	842
	843	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	844	info, (int)(oldValue -1), info );
	845
	846	/* The reference count just went negative, very bad */
	847	if (oldValue == 0)
	848	panic("throttle info ref cnt went negative!");
	849
	850	/*
	851	* Once reference count is zero, no one else should be able to take a
	852	* reference
	853	*/
	854	if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
	855	DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
	856
	857	lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);
	858	FREE(info, M_TEMP);
	859	}
	860	return oldValue;
	861	}
	862
	863
	864	/*
	865	* Just take a reference on the throttle info structure.
	866	*
	867	* This routine always returns the old value.
	868	*/
	869	static SInt32
	870	throttle_info_ref(struct _throttle_io_info_t *info)
	871	{
	872	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
	873
	874	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
	875	info, (int)(oldValue -1), info );
	876	/* Allocated items should never have a reference of zero */
	877	if (info->throttle_alloc && (oldValue == 0))
	878	panic("Taking a reference without calling create throttle info!\n");
	879
	880	return oldValue;
	881	}
	882
	883	/*
	884	* on entry the throttle_lock is held...
	885	* this function is responsible for taking
	886	* and dropping the reference on the info
	887	* structure which will keep it from going
	888	* away while the timer is running if it
	889	* happens to have been dynamically allocated by
	890	* a network fileystem kext which is now trying
	891	* to free it
	892	*/
	893	static uint32_t
	894	throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
	895	{
	896	struct timeval elapsed;
	897	struct timeval now;
	898	struct timeval period;
	899	uint64_t elapsed_msecs;
	900	int throttle_level;
	901	int level;
	902	int msecs;
	903	boolean_t throttled = FALSE;
	904	boolean_t need_timer = FALSE;
	905
	906	microuptime(&now);
	907
	908	if (update_io_count == TRUE) {
	909	info->throttle_io_count_begin = info->throttle_io_count;
	910	info->throttle_io_period_num++;
	911
	912	while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
	913	info->throttle_start_IO_period_timestamp[wakelevel--] = now;
	914
	915	info->throttle_min_timer_deadline = now;
	916
	917	msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
	918	period.tv_sec = msecs / 1000;
	919	period.tv_usec = (msecs % 1000) * 1000;
	920
	921	timevaladd(&info->throttle_min_timer_deadline, &period);
	922	}
	923	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
	924
	925	elapsed = now;
	926	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
	927	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
	928
	929	for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
	930
	931	if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
	932
	933	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level]) {
	934	/*
	935	* we had an I/O occur at a higher priority tier within
	936	* this tier's throttle window
	937	*/
	938	throttled = TRUE;
	939	}
	940	/*
	941	* we assume that the windows are the same or longer
	942	* as we drop through the throttling tiers... thus
	943	* we can stop looking once we run into a tier with
	944	* threads to schedule regardless of whether it's
	945	* still in its throttling window or not
	946	*/
	947	break;
	948	}
	949	}
	950	if (throttled == TRUE)
	951	break;
	952	}
	953	if (throttled == TRUE) {
	954	uint64_t deadline = 0;
	955	struct timeval target;
	956	struct timeval min_target;
	957
	958	/*
	959	* we've got at least one tier still in a throttled window
	960	* so we need a timer running... compute the next deadline
	961	* and schedule it
	962	*/
	963	for (level = throttle_level+1; level <= THROTTLE_LEVEL_END; level++) {
	964
	965	if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
	966	continue;
	967
	968	target = info->throttle_start_IO_period_timestamp[level];
	969
	970	msecs = info->throttle_io_periods[level];
	971	period.tv_sec = msecs / 1000;
	972	period.tv_usec = (msecs % 1000) * 1000;
	973
	974	timevaladd(&target, &period);
	975
	976	if (need_timer == FALSE \|\| timevalcmp(&target, &min_target, <)) {
	977	min_target = target;
	978	need_timer = TRUE;
	979	}
	980	}
	981	if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
	982	if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
	983	min_target = info->throttle_min_timer_deadline;
	984	}
	985
	986	if (info->throttle_timer_active) {
	987	if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
	988	/*
	989	* couldn't kill the timer because it's already
	990	* been dispatched, so don't try to start a new
	991	* one... once we drop the lock, the timer will
	992	* proceed and eventually re-run this function
	993	*/
	994	need_timer = FALSE;
	995	} else
	996	info->throttle_timer_active = 0;
	997	}
	998	if (need_timer == TRUE) {
	999	/*
	1000	* This is defined as an int (32-bit) rather than a 64-bit
	1001	* value because it would need a really big period in the
	1002	* order of ~500 days to overflow this. So, we let this be
	1003	* 32-bit which allows us to use the clock_interval_to_deadline()
	1004	* routine.
	1005	*/
	1006	int target_msecs;
	1007
	1008	if (info->throttle_timer_ref == 0) {
	1009	/*
	1010	* take a reference for the timer
	1011	*/
	1012	throttle_info_ref(info);
	1013
	1014	info->throttle_timer_ref = 1;
	1015	}
	1016	elapsed = min_target;
	1017	timevalsub(&elapsed, &now);
	1018	target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
	1019
	1020	if (target_msecs <= 0) {
	1021	/*
	1022	* we may have computed a deadline slightly in the past
	1023	* due to various factors... if so, just set the timer
	1024	* to go off in the near future (we don't need to be precise)
	1025	*/
	1026	target_msecs = 1;
	1027	}
	1028	clock_interval_to_deadline(target_msecs, 1000000, &deadline);
	1029
	1030	thread_call_enter_delayed(info->throttle_timer_call, deadline);
	1031	info->throttle_timer_active = 1;
	1032	}
	1033	}
	1034	return (throttle_level);
	1035	}
	1036
	1037
	1038	static void
	1039	throttle_timer(struct _throttle_io_info_t *info)
	1040	{
	1041	uthread_t ut, utlist;
	1042	struct timeval elapsed;
	1043	struct timeval now;
	1044	uint64_t elapsed_msecs;
	1045	int throttle_level;
	1046	int level;
	1047	int wake_level;
	1048	caddr_t wake_address = NULL;
	1049	boolean_t update_io_count = FALSE;
	1050	boolean_t need_wakeup = FALSE;
	1051	boolean_t need_release = FALSE;
	1052
	1053	ut = NULL;
	1054	lck_mtx_lock(&info->throttle_lock);
	1055
	1056	info->throttle_timer_active = 0;
	1057	microuptime(&now);
	1058
	1059	elapsed = now;
	1060	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
	1061	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
	1062
	1063	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
	1064
	1065	wake_level = info->throttle_next_wake_level;
	1066
	1067	for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
	1068
	1069	elapsed = now;
	1070	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
	1071	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
	1072
	1073	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
	1074	/*
	1075	* we're closing out the current IO period...
	1076	* if we have a waiting thread, wake it up
	1077	* after we have reset the I/O window info
	1078	*/
	1079	need_wakeup = TRUE;
	1080	update_io_count = TRUE;
	1081
	1082	info->throttle_next_wake_level = wake_level - 1;
	1083
	1084	if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
	1085	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
	1086
	1087	break;
	1088	}
	1089	wake_level--;
	1090
	1091	if (wake_level == THROTTLE_LEVEL_START)
	1092	wake_level = THROTTLE_LEVEL_END;
	1093	}
	1094	}
	1095	if (need_wakeup == TRUE) {
	1096	if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
	1097
	1098	ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
	1099	TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
	1100	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	1101
	1102	wake_address = (caddr_t)&ut->uu_on_throttlelist;
	1103	}
	1104	} else
	1105	wake_level = THROTTLE_LEVEL_START;
	1106
	1107	throttle_level = throttle_timer_start(info, update_io_count, wake_level);
	1108
	1109	if (wake_address != NULL)
	1110	wakeup(wake_address);
	1111
	1112	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
	1113
	1114	TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
	1115
	1116	TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
	1117	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	1118
	1119	wakeup(&ut->uu_on_throttlelist);
	1120	}
	1121	}
	1122	if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
	1123	info->throttle_timer_ref = 0;
	1124	need_release = TRUE;
	1125	}
	1126	lck_mtx_unlock(&info->throttle_lock);
	1127
	1128	if (need_release == TRUE)
	1129	throttle_info_rel(info);
	1130	}
	1131
	1132
	1133	static int
	1134	throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
	1135	{
	1136	boolean_t start_timer = FALSE;
	1137	int level = THROTTLE_LEVEL_START;
	1138
	1139	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
	1140	info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
	1141	start_timer = TRUE;
	1142	}
	1143
	1144	if (insert_tail == TRUE)
	1145	TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	1146	else
	1147	TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	1148
	1149	ut->uu_on_throttlelist = mylevel;
	1150
	1151	if (start_timer == TRUE) {
	1152	/* we may need to start or rearm the timer */
	1153	level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
	1154
	1155	if (level == THROTTLE_LEVEL_END) {
	1156	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
	1157	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
	1158
	1159	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	1160	}
	1161	}
	1162	}
	1163	return (level);
	1164	}
	1165
	1166	static void
	1167	throttle_init_throttle_window(void)
	1168	{
	1169	int throttle_window_size;
	1170
	1171	/*
	1172	* The hierarchy of throttle window values is as follows:
	1173	* - Global defaults
	1174	* - Device tree properties
	1175	* - Boot-args
	1176	* All values are specified in msecs.
	1177	*/
	1178
	1179	/* Override global values with device-tree properties */
	1180	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
	1181	throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
	1182
	1183	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
	1184	throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
	1185
	1186	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
	1187	throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
	1188
	1189	/* Override with boot-args */
	1190	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
	1191	throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
	1192
	1193	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
	1194	throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
	1195
	1196	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
	1197	throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
	1198	}
	1199
	1200	static void
	1201	throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
	1202	{
	1203	int throttle_period_size;
	1204
	1205	/*
	1206	* The hierarchy of throttle period values is as follows:
	1207	* - Global defaults
	1208	* - Device tree properties
	1209	* - Boot-args
	1210	* All values are specified in msecs.
	1211	*/
	1212
	1213	/* Assign global defaults */
	1214	if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0))
	1215	info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
	1216	else
	1217	info->throttle_io_periods = &throttle_io_period_msecs[0];
	1218
	1219	/* Override global values with device-tree properties */
	1220	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
	1221	info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
	1222
	1223	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
	1224	info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
	1225
	1226	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
	1227	info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
	1228
	1229	/* Override with boot-args */
	1230	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
	1231	info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
	1232
	1233	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
	1234	info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
	1235
	1236	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
	1237	info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
	1238
	1239	}
	1240
	1241	#if CONFIG_IOSCHED
	1242	extern void vm_io_reprioritize_init(void);
	1243	int iosched_enabled = 1;
	1244	#endif
	1245
	1246	void
	1247	throttle_init(void)
	1248	{
	1249	struct _throttle_io_info_t *info;
	1250	int i;
	1251	int level;
	1252	#if CONFIG_IOSCHED
	1253	int iosched;
	1254	#endif
	1255	/*
	1256	* allocate lock group attribute and group
	1257	*/
	1258	throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
	1259	throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);
	1260
	1261	/* Update throttle parameters based on device tree configuration */
	1262	throttle_init_throttle_window();
	1263
	1264	/*
	1265	* allocate the lock attribute
	1266	*/
	1267	throttle_mtx_attr = lck_attr_alloc_init();
	1268
	1269	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
	1270	info = &_throttle_io_info[i];
	1271
	1272	lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
	1273	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
	1274
	1275	for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
	1276	TAILQ_INIT(&info->throttle_uthlist[level]);
	1277	info->throttle_last_IO_pid[level] = 0;
	1278	}
	1279	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
	1280	info->throttle_disabled = 0;
	1281	info->throttle_is_fusion_with_priority = 0;
	1282	}
	1283	#if CONFIG_IOSCHED
	1284	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
	1285	iosched_enabled = iosched;
	1286	}
	1287	if (iosched_enabled) {
	1288	/* Initialize I/O Reprioritization mechanism */
	1289	vm_io_reprioritize_init();
	1290	}
	1291	#endif
	1292	}
	1293
	1294	void
	1295	sys_override_io_throttle(int flag)
	1296	{
	1297	if (flag == THROTTLE_IO_ENABLE)
	1298	lowpri_throttle_enabled = 1;
	1299
	1300	if (flag == THROTTLE_IO_DISABLE)
	1301	lowpri_throttle_enabled = 0;
	1302	}
	1303
	1304	int rethrottle_removed_from_list = 0;
	1305	int rethrottle_moved_to_new_list = 0;
	1306
	1307	/*
	1308	* move a throttled thread to the appropriate state based
	1309	* on it's new throttle level... throttle_add_to_list will
	1310	* reset the timer deadline if necessary... it may also
	1311	* leave the thread off of the queue if we're already outside
	1312	* the throttle window for the new level
	1313	* takes a valid uthread (which may or may not be on the
	1314	* throttle queue) as input
	1315	*
	1316	* NOTE: This is called with the task lock held.
	1317	*/
	1318
	1319	void
	1320	rethrottle_thread(uthread_t ut)
	1321	{
	1322	struct _throttle_io_info_t *info;
	1323	int my_new_level;
	1324
	1325	if ((info = ut->uu_throttle_info) == NULL)
	1326	return;
	1327
	1328	lck_mtx_lock(&info->throttle_lock);
	1329
	1330	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
	1331
	1332	my_new_level = throttle_get_thread_throttle_level(ut);
	1333
	1334	if (my_new_level != ut->uu_on_throttlelist) {
	1335
	1336	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
	1337	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	1338
	1339	if (my_new_level >= THROTTLE_LEVEL_THROTTLED) {
	1340	throttle_add_to_list(info, ut, my_new_level, TRUE);
	1341	rethrottle_moved_to_new_list++;
	1342	}
	1343
	1344	/* Thread no longer in window, need to wake it up */
	1345	if (ut->uu_on_throttlelist == THROTTLE_LEVEL_NONE) {
	1346	wakeup(&ut->uu_on_throttlelist);
	1347	rethrottle_removed_from_list++;
	1348	}
	1349	}
	1350	}
	1351
	1352	lck_mtx_unlock(&info->throttle_lock);
	1353	}
	1354
	1355
	1356	/*
	1357	* KPI routine
	1358	*
	1359	* Create and take a reference on a throttle info structure and return a
	1360	* pointer for the file system to use when calling throttle_info_update.
	1361	* Calling file system must have a matching release for every create.
	1362	*/
	1363	void *
	1364	throttle_info_create(void)
	1365	{
	1366	struct _throttle_io_info_t *info;
	1367	int level;
	1368
	1369	MALLOC(info, struct _throttle_io_info_t , sizeof(info), M_TEMP, M_ZERO \| M_WAITOK);
	1370	/* Should never happen but just in case */
	1371	if (info == NULL)
	1372	return NULL;
	1373	/* Mark that this one was allocated and needs to be freed */
	1374	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
	1375	info->throttle_alloc = TRUE;
	1376
	1377	lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
	1378	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
	1379
	1380	for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
	1381	TAILQ_INIT(&info->throttle_uthlist[level]);
	1382	}
	1383	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
	1384
	1385	/* Take a reference */
	1386	OSIncrementAtomic(&info->throttle_refcnt);
	1387	return info;
	1388	}
	1389
	1390	/*
	1391	* KPI routine
	1392	*
	1393	* Release the throttle info pointer if all the reference are gone. Should be
	1394	* called to release reference taken by throttle_info_create
	1395	*/
	1396	void
	1397	throttle_info_release(void *throttle_info)
	1398	{
	1399	DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
	1400	(struct _throttle_io_info_t *)throttle_info,
	1401	(struct _throttle_io_info_t *)throttle_info);
	1402	if (throttle_info) /* Just to be careful */
	1403	throttle_info_rel(throttle_info);
	1404	}
	1405
	1406	/*
	1407	* KPI routine
	1408	*
	1409	* File Systems that create an info structure, need to call this routine in
	1410	* their mount routine (used by cluster code). File Systems that call this in
	1411	* their mount routines must call throttle_info_mount_rel in their unmount
	1412	* routines.
	1413	*/
	1414	void
	1415	throttle_info_mount_ref(mount_t mp, void *throttle_info)
	1416	{
	1417	if ((throttle_info == NULL) \|\| (mp == NULL))
	1418	return;
	1419	throttle_info_ref(throttle_info);
	1420
	1421	/*
	1422	* We already have a reference release it before adding the new one
	1423	*/
	1424	if (mp->mnt_throttle_info)
	1425	throttle_info_rel(mp->mnt_throttle_info);
	1426	mp->mnt_throttle_info = throttle_info;
	1427	}
	1428
	1429	/*
	1430	* Private KPI routine
	1431	*
	1432	* return a handle for accessing throttle_info given a throttle_mask. The
	1433	* handle must be released by throttle_info_rel_by_mask
	1434	*/
	1435	int
	1436	throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
	1437	{
	1438	int dev_index;
	1439	struct _throttle_io_info_t *info;
	1440
	1441	if (throttle_info_handle == NULL)
	1442	return EINVAL;
	1443
	1444	dev_index = num_trailing_0(throttle_mask);
	1445	info = &_throttle_io_info[dev_index];
	1446	throttle_info_ref(info);
	1447	(struct _throttle_io_info_t*)throttle_info_handle = info;
	1448
	1449	return 0;
	1450	}
	1451
	1452	/*
	1453	* Private KPI routine
	1454	*
	1455	* release the handle obtained by throttle_info_ref_by_mask
	1456	*/
	1457	void
	1458	throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
	1459	{
	1460	/*
	1461	* for now the handle is just a pointer to _throttle_io_info_t
	1462	*/
	1463	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
	1464	}
	1465
	1466	/*
	1467	* KPI routine
	1468	*
	1469	* File Systems that throttle_info_mount_ref, must call this routine in their
	1470	* umount routine.
	1471	*/
	1472	void
	1473	throttle_info_mount_rel(mount_t mp)
	1474	{
	1475	if (mp->mnt_throttle_info)
	1476	throttle_info_rel(mp->mnt_throttle_info);
	1477	mp->mnt_throttle_info = NULL;
	1478	}
	1479
	1480	void
	1481	throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
	1482	{
	1483	struct _throttle_io_info_t *info;
	1484
	1485	if (mp == NULL)
	1486	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	1487	else if (mp->mnt_throttle_info == NULL)
	1488	info = &_throttle_io_info[mp->mnt_devbsdunit];
	1489	else
	1490	info = mp->mnt_throttle_info;
	1491
	1492	*tv = info->throttle_last_write_timestamp;
	1493	}
	1494
	1495	void
	1496	update_last_io_time(mount_t mp)
	1497	{
	1498	struct _throttle_io_info_t *info;
	1499
	1500	if (mp == NULL)
	1501	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	1502	else if (mp->mnt_throttle_info == NULL)
	1503	info = &_throttle_io_info[mp->mnt_devbsdunit];
	1504	else
	1505	info = mp->mnt_throttle_info;
	1506
	1507	microuptime(&info->throttle_last_write_timestamp);
	1508	if (mp != NULL)
	1509	mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
	1510	}
	1511
	1512
	1513	int
	1514	throttle_get_io_policy(uthread_t *ut)
	1515	{
	1516	if (ut != NULL)
	1517	*ut = get_bsdthread_info(current_thread());
	1518
	1519	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO));
	1520	}
	1521
	1522	int
	1523	throttle_get_passive_io_policy(uthread_t *ut)
	1524	{
	1525	if (ut != NULL)
	1526	*ut = get_bsdthread_info(current_thread());
	1527
	1528	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO));
	1529	}
	1530
	1531
	1532	static int
	1533	throttle_get_thread_throttle_level(uthread_t ut)
	1534	{
	1535	int thread_throttle_level;
	1536
	1537	if (ut == NULL)
	1538	ut = get_bsdthread_info(current_thread());
	1539
	1540	thread_throttle_level = proc_get_effective_thread_policy(ut->uu_thread, TASK_POLICY_IO);
	1541
	1542	/* Bootcache misses should always be throttled */
	1543	if (ut->uu_throttle_bc == TRUE)
	1544	thread_throttle_level = THROTTLE_LEVEL_TIER3;
	1545
	1546	return (thread_throttle_level);
	1547	}
	1548
	1549
	1550	static int
	1551	throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
	1552	{
	1553	struct _throttle_io_info_t *info = throttle_info;
	1554	struct timeval elapsed;
	1555	uint64_t elapsed_msecs;
	1556	int thread_throttle_level;
	1557	int throttle_level;
	1558
	1559	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED)
	1560	return (THROTTLE_DISENGAGED);
	1561
	1562	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
	1563
	1564	microuptime(&elapsed);
	1565	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
	1566	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
	1567
	1568	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
	1569	break;
	1570	}
	1571	if (throttle_level >= thread_throttle_level) {
	1572	/*
	1573	* we're beyond all of the throttle windows
	1574	* that affect the throttle level of this thread,
	1575	* so go ahead and treat as normal I/O
	1576	*/
	1577	return (THROTTLE_DISENGAGED);
	1578	}
	1579	if (mylevel)
	1580	*mylevel = thread_throttle_level;
	1581	if (throttling_level)
	1582	*throttling_level = throttle_level;
	1583
	1584	if (info->throttle_io_count != info->throttle_io_count_begin) {
	1585	/*
	1586	* we've already issued at least one throttleable I/O
	1587	* in the current I/O window, so avoid issuing another one
	1588	*/
	1589	return (THROTTLE_NOW);
	1590	}
	1591	/*
	1592	* we're in the throttle window, so
	1593	* cut the I/O size back
	1594	*/
	1595	return (THROTTLE_ENGAGED);
	1596	}
	1597
	1598	/*
	1599	* If we have a mount point and it has a throttle info pointer then
	1600	* use it to do the check, otherwise use the device unit number to find
	1601	* the correct throttle info array element.
	1602	*/
	1603	int
	1604	throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
	1605	{
	1606	struct _throttle_io_info_t *info;
	1607
	1608	/*
	1609	* Should we just return zero if no mount point
	1610	*/
	1611	if (mp == NULL)
	1612	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	1613	else if (mp->mnt_throttle_info == NULL)
	1614	info = &_throttle_io_info[mp->mnt_devbsdunit];
	1615	else
	1616	info = mp->mnt_throttle_info;
	1617
	1618	if (info->throttle_is_fusion_with_priority) {
	1619	uthread_t ut = get_bsdthread_info(current_thread());
	1620	if (ut->uu_lowpri_window == 0)
	1621	return (THROTTLE_DISENGAGED);
	1622	}
	1623
	1624	if (info->throttle_disabled)
	1625	return (THROTTLE_DISENGAGED);
	1626	else
	1627	return throttle_io_will_be_throttled_internal(info, NULL, NULL);
	1628	}
	1629
	1630	/*
	1631	* Routine to increment I/O throttling counters maintained in the proc
	1632	*/
	1633
	1634	static void
	1635	throttle_update_proc_stats(pid_t throttling_pid, int count)
	1636	{
	1637	proc_t throttling_proc;
	1638	proc_t throttled_proc = current_proc();
	1639
	1640	/* The throttled_proc is always the current proc; so we are not concerned with refs */
	1641	OSAddAtomic64(count, &(throttled_proc->was_throttled));
	1642
	1643	/* The throttling pid might have exited by now */
	1644	throttling_proc = proc_find(throttling_pid);
	1645	if (throttling_proc != PROC_NULL) {
	1646	OSAddAtomic64(count, &(throttling_proc->did_throttle));
	1647	proc_rele(throttling_proc);
	1648	}
	1649	}
	1650
	1651	/*
	1652	* Block until woken up by the throttle timer or by a rethrottle call.
	1653	* As long as we hold the throttle_lock while querying the throttle tier, we're
	1654	* safe against seeing an old throttle tier after a rethrottle.
	1655	*/
	1656	uint32_t
	1657	throttle_lowpri_io(int sleep_amount)
	1658	{
	1659	uthread_t ut;
	1660	struct _throttle_io_info_t *info;
	1661	int throttle_type = 0;
	1662	int mylevel = 0;
	1663	int throttling_level = THROTTLE_LEVEL_NONE;
	1664	int sleep_cnt = 0;
	1665	uint32_t throttle_io_period_num = 0;
	1666	boolean_t insert_tail = TRUE;
	1667
	1668	ut = get_bsdthread_info(current_thread());
	1669
	1670	if (ut->uu_lowpri_window == 0)
	1671	return (0);
	1672
	1673	info = ut->uu_throttle_info;
	1674
	1675	if (info == NULL) {
	1676	ut->uu_throttle_bc = FALSE;
	1677	ut->uu_lowpri_window = 0;
	1678	return (0);
	1679	}
	1680
	1681	lck_mtx_lock(&info->throttle_lock);
	1682
	1683	if (sleep_amount == 0)
	1684	goto done;
	1685
	1686	if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
	1687	sleep_amount = 0;
	1688
	1689	throttle_io_period_num = info->throttle_io_period_num;
	1690
	1691	while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) {
	1692
	1693	if (throttle_type == THROTTLE_ENGAGED) {
	1694	if (sleep_amount == 0)
	1695	break;
	1696	if (info->throttle_io_period_num < throttle_io_period_num)
	1697	break;
	1698	if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
	1699	break;
	1700	}
	1701	if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
	1702	if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END)
	1703	goto done;
	1704	}
	1705	assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
	1706	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) \| DBG_FUNC_NONE,
	1707	info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);
	1708
	1709
	1710	if (sleep_cnt == 0) {
	1711	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_START,
	1712	throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
	1713	throttled_count[mylevel]++;
	1714	}
	1715	msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);
	1716
	1717	sleep_cnt++;
	1718
	1719	if (sleep_amount == 0)
	1720	insert_tail = FALSE;
	1721	else if (info->throttle_io_period_num < throttle_io_period_num \|\|
	1722	(info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
	1723	insert_tail = FALSE;
	1724	sleep_amount = 0;
	1725	}
	1726	}
	1727	done:
	1728	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
	1729	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
	1730	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	1731	}
	1732
	1733	lck_mtx_unlock(&info->throttle_lock);
	1734
	1735	if (sleep_cnt) {
	1736	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_END,
	1737	throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
	1738	/*
	1739	* We update the stats for the last pid which opened a throttle window for the throttled thread.
	1740	* This might not be completely accurate since the multiple throttles seen by the lower tier pid
	1741	* might have been caused by various higher prio pids. However, updating these stats accurately
	1742	* means doing a proc_find while holding the throttle lock which leads to deadlock.
	1743	*/
	1744	throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
	1745	}
	1746
	1747	throttle_info_rel(info);
	1748
	1749	ut->uu_throttle_info = NULL;
	1750	ut->uu_throttle_bc = FALSE;
	1751	ut->uu_lowpri_window = 0;
	1752
	1753	return (sleep_cnt);
	1754	}
	1755
	1756	/*
	1757	* KPI routine
	1758	*
	1759	* set a kernel thread's IO policy. policy can be:
	1760	* IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
	1761	*
	1762	* explanations about these policies are in the man page of setiopolicy_np
	1763	*/
	1764	void throttle_set_thread_io_policy(int policy)
	1765	{
	1766	proc_set_task_policy(current_task(), current_thread(),
	1767	TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL,
	1768	policy);
	1769	}
	1770
	1771
	1772	void throttle_info_reset_window(uthread_t ut)
	1773	{
	1774	struct _throttle_io_info_t *info;
	1775
	1776	if (ut == NULL)
	1777	ut = get_bsdthread_info(current_thread());
	1778
	1779	if ( (info = ut->uu_throttle_info) ) {
	1780	throttle_info_rel(info);
	1781
	1782	ut->uu_throttle_info = NULL;
	1783	ut->uu_lowpri_window = 0;
	1784	ut->uu_throttle_bc = FALSE;
	1785	}
	1786	}
	1787
	1788	static
	1789	void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
	1790	{
	1791	if (lowpri_throttle_enabled == 0 \|\| info->throttle_disabled)
	1792	return;
	1793
	1794	if (info->throttle_io_periods == 0) {
	1795	throttle_init_throttle_period(info, isssd);
	1796	}
	1797	if (ut->uu_throttle_info == NULL) {
	1798
	1799	ut->uu_throttle_info = info;
	1800	throttle_info_ref(info);
	1801	DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
	1802
	1803	ut->uu_lowpri_window = 1;
	1804	ut->uu_throttle_bc = BC_throttle;
	1805	}
	1806	}
	1807
	1808
	1809	static
	1810	void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd)
	1811	{
	1812	int thread_throttle_level;
	1813
	1814	if (lowpri_throttle_enabled == 0 \|\| info->throttle_disabled)
	1815	return;
	1816
	1817	if (ut == NULL)
	1818	ut = get_bsdthread_info(current_thread());
	1819
	1820	thread_throttle_level = throttle_get_thread_throttle_level(ut);
	1821
	1822	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
	1823	if(!ISSET(flags, B_PASSIVE)) {
	1824	microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
	1825	info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
	1826	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) \| DBG_FUNC_NONE,
	1827	current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
	1828	}
	1829	microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
	1830	}
	1831
	1832
	1833	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
	1834	/*
	1835	* I'd really like to do the IOSleep here, but
	1836	* we may be holding all kinds of filesystem related locks
	1837	* and the pages for this I/O marked 'busy'...
	1838	* we don't want to cause a normal task to block on
	1839	* one of these locks while we're throttling a task marked
	1840	* for low priority I/O... we'll mark the uthread and
	1841	* do the delay just before we return from the system
	1842	* call that triggered this I/O or from vnode_pagein
	1843	*/
	1844	OSAddAtomic(1, &info->throttle_io_count);
	1845
	1846	throttle_info_set_initial_window(ut, info, FALSE, isssd);
	1847	}
	1848	}
	1849
	1850	void *throttle_info_update_by_mount(mount_t mp)
	1851	{
	1852	struct _throttle_io_info_t *info;
	1853	uthread_t ut;
	1854	boolean_t isssd = FALSE;
	1855
	1856	ut = get_bsdthread_info(current_thread());
	1857
	1858	if (mp != NULL) {
	1859	if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
	1860	isssd = TRUE;
	1861	info = &_throttle_io_info[mp->mnt_devbsdunit];
	1862	} else
	1863	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	1864
	1865	if (!ut->uu_lowpri_window)
	1866	throttle_info_set_initial_window(ut, info, FALSE, isssd);
	1867
	1868	return info;
	1869	}
	1870
	1871
	1872	/*
	1873	* KPI routine
	1874	*
	1875	* this is usually called before every I/O, used for throttled I/O
	1876	* book keeping. This routine has low overhead and does not sleep
	1877	*/
	1878	void throttle_info_update(void *throttle_info, int flags)
	1879	{
	1880	if (throttle_info)
	1881	throttle_info_update_internal(throttle_info, NULL, flags, FALSE);
	1882	}
	1883
	1884	/*
	1885	* KPI routine
	1886	*
	1887	* this is usually called before every I/O, used for throttled I/O
	1888	* book keeping. This routine has low overhead and does not sleep
	1889	*/
	1890	void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
	1891	{
	1892	void *throttle_info = throttle_info_handle;
	1893
	1894	/*
	1895	* for now we only use the lowest bit of the throttle mask, so the
	1896	* handle is the same as the throttle_info. Later if we store a
	1897	* set of throttle infos in the handle, we will want to loop through
	1898	* them and call throttle_info_update in a loop
	1899	*/
	1900	throttle_info_update(throttle_info, flags);
	1901	}
	1902	/*
	1903	* KPI routine
	1904	*
	1905	* This routine marks the throttle info as disabled. Used for mount points which
	1906	* support I/O scheduling.
	1907	*/
	1908
	1909	void throttle_info_disable_throttle(int devno, boolean_t isfusion)
	1910	{
	1911	struct _throttle_io_info_t *info;
	1912
	1913	if (devno < 0 \|\| devno >= LOWPRI_MAX_NUM_DEV)
	1914	panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
	1915
	1916	info = &_throttle_io_info[devno];
	1917	// don't disable software throttling on devices that are part of a fusion device
	1918	// and override the software throttle periods to use HDD periods
	1919	if (isfusion) {
	1920	info->throttle_is_fusion_with_priority = isfusion;
	1921	throttle_init_throttle_period(info, FALSE);
	1922	}
	1923	info->throttle_disabled = !info->throttle_is_fusion_with_priority;
	1924	return;
	1925	}
	1926
	1927
	1928	/*
	1929	* KPI routine (private)
	1930	* Called to determine if this IO is being throttled to this level so that it can be treated specially
	1931	*/
	1932	int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
	1933	{
	1934	struct _throttle_io_info_t *info = throttle_info;
	1935	struct timeval elapsed;
	1936	uint64_t elapsed_msecs;
	1937	int throttle_level;
	1938	int thread_throttle_level;
	1939
	1940	switch (policy) {
	1941
	1942	case IOPOL_THROTTLE:
	1943	thread_throttle_level = THROTTLE_LEVEL_TIER3;
	1944	break;
	1945	case IOPOL_UTILITY:
	1946	thread_throttle_level = THROTTLE_LEVEL_TIER2;
	1947	break;
	1948	case IOPOL_STANDARD:
	1949	thread_throttle_level = THROTTLE_LEVEL_TIER1;
	1950	break;
	1951	default:
	1952	thread_throttle_level = THROTTLE_LEVEL_TIER0;
	1953	break;
	1954	}
	1955	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
	1956
	1957	microuptime(&elapsed);
	1958	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
	1959	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
	1960
	1961	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
	1962	break;
	1963	}
	1964	if (throttle_level >= thread_throttle_level) {
	1965	/*
	1966	* we're beyond all of the throttle windows
	1967	* so go ahead and treat as normal I/O
	1968	*/
	1969	return (THROTTLE_DISENGAGED);
	1970	}
	1971	/*
	1972	* we're in the throttle window
	1973	*/
	1974	return (THROTTLE_ENGAGED);
	1975	}
	1976
	1977	int
	1978	spec_strategy(struct vnop_strategy_args *ap)
	1979	{
	1980	buf_t bp;
	1981	int bflags;
	1982	int io_tier;
	1983	int passive;
	1984	dev_t bdev;
	1985	uthread_t ut;
	1986	mount_t mp;
	1987	struct bufattr *bap;
	1988	int strategy_ret;
	1989	struct _throttle_io_info_t *throttle_info;
	1990	boolean_t isssd = FALSE;
	1991	int code = 0;
	1992
	1993	proc_t curproc = current_proc();
	1994
	1995	bp = ap->a_bp;
	1996	bdev = buf_device(bp);
	1997	mp = buf_vnode(bp)->v_mount;
	1998	bap = &bp->b_attr;
	1999
	2000	io_tier = throttle_get_io_policy(&ut);
	2001	passive = throttle_get_passive_io_policy(&ut);
	2002
	2003	if (bp->b_flags & B_META)
	2004	bap->ba_flags \|= BA_META;
	2005
	2006	#if CONFIG_IOSCHED
	2007	/*
	2008	* For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
	2009	* To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
	2010	* For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
	2011	* For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
	2012	*/
	2013	if (bap->ba_flags & BA_META) {
	2014	if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
	2015	if (bp->b_flags & B_READ) {
	2016	if (io_tier > IOSCHED_METADATA_TIER) {
	2017	io_tier = IOSCHED_METADATA_TIER;
	2018	passive = 1;
	2019	}
	2020	} else {
	2021	io_tier = IOSCHED_METADATA_TIER;
	2022	passive = 1;
	2023	}
	2024	}
	2025	}
	2026	#endif /* CONFIG_IOSCHED */
	2027
	2028	SET_BUFATTR_IO_TIER(bap, io_tier);
	2029
	2030	if (passive) {
	2031	bp->b_flags \|= B_PASSIVE;
	2032	bap->ba_flags \|= BA_PASSIVE;
	2033	}
	2034
	2035	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
	2036	bap->ba_flags \|= BA_DELAYIDLESLEEP;
	2037
	2038	bflags = bp->b_flags;
	2039
	2040	if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0))
	2041	bufattr_markquickcomplete(bap);
	2042
	2043	if (bflags & B_READ)
	2044	code \|= DKIO_READ;
	2045	if (bflags & B_ASYNC)
	2046	code \|= DKIO_ASYNC;
	2047	if (bflags & B_META)
	2048	code \|= DKIO_META;
	2049	else if (bflags & B_PAGEIO)
	2050	code \|= DKIO_PAGING;
	2051
	2052	if (io_tier != 0)
	2053	code \|= DKIO_THROTTLE;
	2054
	2055	code \|= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
	2056
	2057	if (bflags & B_PASSIVE)
	2058	code \|= DKIO_PASSIVE;
	2059
	2060	if (bap->ba_flags & BA_NOCACHE)
	2061	code \|= DKIO_NOCACHE;
	2062
	2063	if (kdebug_enable) {
	2064	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) \| DBG_FUNC_NONE,
	2065	buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	2066	}
	2067
	2068	thread_update_io_stats(current_thread(), buf_count(bp), code);
	2069
	2070	if (mp != NULL) {
	2071	if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
	2072	isssd = TRUE;
	2073	throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	2074	} else
	2075	throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	2076
	2077	throttle_info_update_internal(throttle_info, ut, bflags, isssd);
	2078
	2079	if ((bflags & B_READ) == 0) {
	2080	microuptime(&throttle_info->throttle_last_write_timestamp);
	2081
	2082	if (mp) {
	2083	mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
	2084	INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
	2085	}
	2086	} else if (mp) {
	2087	INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	2088	}
	2089	/*
	2090	* The BootCache may give us special information about
	2091	* the IO, so it returns special values that we check
	2092	* for here.
	2093	*
	2094	* IO_SATISFIED_BY_CACHE
	2095	* The read has been satisfied by the boot cache. Don't
	2096	* throttle the thread unnecessarily.
	2097	*
	2098	* IO_SHOULD_BE_THROTTLED
	2099	* The boot cache is playing back a playlist and this IO
	2100	* cut through. Throttle it so we're not cutting through
	2101	* the boot cache too often.
	2102	*
	2103	* Note that typical strategy routines are defined with
	2104	* a void return so we'll get garbage here. In the
	2105	* unlikely case the garbage matches our special return
	2106	* value, it's not a big deal since we're only adjusting
	2107	* the throttling delay.
	2108	*/
	2109	#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
	2110	#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
	2111	typedef int strategy_fcn_ret_t(struct buf *bp);
	2112
	2113	strategy_ret = ((strategy_fcn_ret_t)bdevsw[major(bdev)].d_strategy)(bp);
	2114
	2115	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
	2116	/*
	2117	* If this was a throttled IO satisfied by the boot cache,
	2118	* don't delay the thread.
	2119	*/
	2120	throttle_info_reset_window(ut);
	2121
	2122	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
	2123	/*
	2124	* If the boot cache indicates this IO should be throttled,
	2125	* delay the thread.
	2126	*/
	2127	throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
	2128	}
	2129	return (0);
	2130	}
	2131
	2132
	2133	/*
	2134	* This is a noop, simply returning what one has been given.
	2135	*/
	2136	int
	2137	spec_blockmap(__unused struct vnop_blockmap_args *ap)
	2138	{
	2139	return (ENOTSUP);
	2140	}
	2141
	2142
	2143	/*
	2144	* Device close routine
	2145	*/
	2146	int
	2147	spec_close(struct vnop_close_args *ap)
	2148	{
	2149	struct vnode *vp = ap->a_vp;
	2150	dev_t dev = vp->v_rdev;
	2151	int error = 0;
	2152	int flags = ap->a_fflag;
	2153	struct proc *p = vfs_context_proc(ap->a_context);
	2154	struct session *sessp;
	2155
	2156	switch (vp->v_type) {
	2157
	2158	case VCHR:
	2159	/*
	2160	* Hack: a tty device that is a controlling terminal
	2161	* has a reference from the session structure.
	2162	* We cannot easily tell that a character device is
	2163	* a controlling terminal, unless it is the closing
	2164	* process' controlling terminal. In that case,
	2165	* if the reference count is 1 (this is the very
	2166	* last close)
	2167	*/
	2168	sessp = proc_session(p);
	2169	devsw_lock(dev, S_IFCHR);
	2170	if (sessp != SESSION_NULL) {
	2171	if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
	2172	struct tty *tp = TTY_NULL;
	2173
	2174	devsw_unlock(dev, S_IFCHR);
	2175	session_lock(sessp);
	2176	if (vp == sessp->s_ttyvp) {
	2177	tp = SESSION_TP(sessp);
	2178	sessp->s_ttyvp = NULL;
	2179	sessp->s_ttyvid = 0;
	2180	sessp->s_ttyp = TTY_NULL;
	2181	sessp->s_ttypgrpid = NO_PID;
	2182	}
	2183	session_unlock(sessp);
	2184
	2185	if (tp != TTY_NULL) {
	2186	/*
	2187	* We may have won a race with a proc_exit
	2188	* of the session leader, the winner
	2189	* clears the flag (even if not set)
	2190	*/
	2191	tty_lock(tp);
	2192	ttyclrpgrphup(tp);
	2193	tty_unlock(tp);
	2194
	2195	ttyfree(tp);
	2196	}
	2197	devsw_lock(dev, S_IFCHR);
	2198	}
	2199	session_rele(sessp);
	2200	}
	2201
	2202	if (--vp->v_specinfo->si_opencount < 0)
	2203	panic("negative open count (c, %u, %u)", major(dev), minor(dev));
	2204
	2205	/*
	2206	* close on last reference or on vnode revoke call
	2207	*/
	2208	if (vcount(vp) == 0 \|\| (flags & IO_REVOKE) != 0)
	2209	error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
	2210
	2211	devsw_unlock(dev, S_IFCHR);
	2212	break;
	2213
	2214	case VBLK:
	2215	/*
	2216	* If there is more than one outstanding open, don't
	2217	* send the close to the device.
	2218	*/
	2219	devsw_lock(dev, S_IFBLK);
	2220	if (vcount(vp) > 1) {
	2221	vp->v_specinfo->si_opencount--;
	2222	devsw_unlock(dev, S_IFBLK);
	2223	return (0);
	2224	}
	2225	devsw_unlock(dev, S_IFBLK);
	2226
	2227	/*
	2228	* On last close of a block device (that isn't mounted)
	2229	* we must invalidate any in core blocks, so that
	2230	* we can, for instance, change floppy disks.
	2231	*/
	2232	if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
	2233	return (error);
	2234
	2235	error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
	2236	if (error)
	2237	return (error);
	2238
	2239	devsw_lock(dev, S_IFBLK);
	2240
	2241	if (--vp->v_specinfo->si_opencount < 0)
	2242	panic("negative open count (b, %u, %u)", major(dev), minor(dev));
	2243
	2244	if (vcount(vp) == 0)
	2245	error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
	2246
	2247	devsw_unlock(dev, S_IFBLK);
	2248	break;
	2249
	2250	default:
	2251	panic("spec_close: not special");
	2252	return(EBADF);
	2253	}
	2254
	2255	return error;
	2256	}
	2257
	2258	/*
	2259	* Return POSIX pathconf information applicable to special devices.
	2260	*/
	2261	int
	2262	spec_pathconf(struct vnop_pathconf_args *ap)
	2263	{
	2264
	2265	switch (ap->a_name) {
	2266	case _PC_LINK_MAX:
	2267	*ap->a_retval = LINK_MAX;
	2268	return (0);
	2269	case _PC_MAX_CANON:
	2270	*ap->a_retval = MAX_CANON;
	2271	return (0);
	2272	case _PC_MAX_INPUT:
	2273	*ap->a_retval = MAX_INPUT;
	2274	return (0);
	2275	case _PC_PIPE_BUF:
	2276	*ap->a_retval = PIPE_BUF;
	2277	return (0);
	2278	case _PC_CHOWN_RESTRICTED:
	2279	ap->a_retval = 200112; / _POSIX_CHOWN_RESTRICTED */
	2280	return (0);
	2281	case _PC_VDISABLE:
	2282	*ap->a_retval = _POSIX_VDISABLE;
	2283	return (0);
	2284	default:
	2285	return (EINVAL);
	2286	}
	2287	/* NOTREACHED */
	2288	}
	2289
	2290	/*
	2291	* Special device failed operation
	2292	*/
	2293	int
	2294	spec_ebadf(__unused void *dummy)
	2295	{
	2296
	2297	return (EBADF);
	2298	}
	2299
	2300	/* Blktooff derives file offset from logical block number */
	2301	int
	2302	spec_blktooff(struct vnop_blktooff_args *ap)
	2303	{
	2304	struct vnode *vp = ap->a_vp;
	2305
	2306	switch (vp->v_type) {
	2307	case VCHR:
	2308	ap->a_offset = (off_t)-1; / failure */
	2309	return (ENOTSUP);
	2310
	2311	case VBLK:
	2312	printf("spec_blktooff: not implemented for VBLK\n");
	2313	ap->a_offset = (off_t)-1; / failure */
	2314	return (ENOTSUP);
	2315
	2316	default:
	2317	panic("spec_blktooff type");
	2318	}
	2319	/* NOTREACHED */
	2320
	2321	return (0);
	2322	}
	2323
	2324	/* Offtoblk derives logical block number from file offset */
	2325	int
	2326	spec_offtoblk(struct vnop_offtoblk_args *ap)
	2327	{
	2328	struct vnode *vp = ap->a_vp;
	2329
	2330	switch (vp->v_type) {
	2331	case VCHR:
	2332	ap->a_lblkno = (daddr64_t)-1; / failure */
	2333	return (ENOTSUP);
	2334
	2335	case VBLK:
	2336	printf("spec_offtoblk: not implemented for VBLK\n");
	2337	ap->a_lblkno = (daddr64_t)-1; / failure */
	2338	return (ENOTSUP);
	2339
	2340	default:
	2341	panic("spec_offtoblk type");
	2342	}
	2343	/* NOTREACHED */
	2344
	2345	return (0);
	2346	}
	2347
	2348	static void filt_specdetach(struct knote *kn);
	2349	static int filt_spec(struct knote *kn, long hint);
	2350	static unsigned filt_specpeek(struct knote *kn);
	2351
	2352	struct filterops spec_filtops = {
	2353	.f_isfd = 1,
	2354	.f_attach = filt_specattach,
	2355	.f_detach = filt_specdetach,
	2356	.f_event = filt_spec,
	2357	.f_peek = filt_specpeek
	2358	};
	2359
	2360	static int
	2361	filter_to_seltype(int16_t filter)
	2362	{
	2363	switch (filter) {
	2364	case EVFILT_READ:
	2365	return FREAD;
	2366	case EVFILT_WRITE:
	2367	return FWRITE;
	2368	break;
	2369	default:
	2370	panic("filt_to_seltype(): invalid filter %d\n", filter);
	2371	return 0;
	2372	}
	2373	}
	2374
	2375	static int
	2376	filt_specattach(struct knote *kn)
	2377	{
	2378	vnode_t vp;
	2379	dev_t dev;
	2380
	2381	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
	2382
	2383	assert(vnode_ischr(vp));
	2384
	2385	dev = vnode_specrdev(vp);
	2386
	2387	if (major(dev) > nchrdev) {
	2388	return ENXIO;
	2389	}
	2390
	2391	/*
	2392	* For a few special kinds of devices, we can attach knotes with
	2393	* no restrictions because their "select" vectors return the amount
	2394	* of data available. Others require an explicit NOTE_LOWAT with
	2395	* data of 1, indicating that the caller doesn't care about actual
	2396	* data counts, just an indication that the device has data.
	2397	*/
	2398
	2399	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0 &&
	2400	((kn->kn_sfflags & NOTE_LOWAT) == 0 \|\| kn->kn_sdata != 1)) {
	2401	return EINVAL;
	2402	}
	2403
	2404	kn->kn_hook_data = 0;
	2405
	2406	kn->kn_fop = &spec_filtops;
	2407	kn->kn_hookid = vnode_vid(vp);
	2408
	2409	knote_markstayqueued(kn);
	2410
	2411	return 0;
	2412	}
	2413
	2414	static void
	2415	filt_specdetach(struct knote *kn)
	2416	{
	2417	knote_clearstayqueued(kn);
	2418
	2419	/*
	2420	* This is potentially tricky: the device's selinfo waitq that was
	2421	* tricked into being part of this knote's waitq set may not be a part
	2422	* of any other set, and the device itself may have revoked the memory
	2423	* in which the waitq was held. We use the knote's kn_hook_data field
	2424	* to keep the ID of the waitq's prepost table object. This
	2425	* object keeps a pointer back to the waitq, and gives us a safe way
	2426	* to decouple the dereferencing of driver allocated memory: if the
	2427	* driver goes away (taking the waitq with it) then the prepost table
	2428	* object will be invalidated. The waitq details are handled in the
	2429	* waitq API invoked here.
	2430	*/
	2431	if (kn->kn_hook_data) {
	2432	waitq_unlink_by_prepost_id(kn->kn_hook_data, kn->kn_kq->kq_wqs);
	2433	kn->kn_hook_data = 0;
	2434	}
	2435	}
	2436
	2437	static int
	2438	filt_spec(struct knote *kn, long hint)
	2439	{
	2440	vnode_t vp;
	2441	uthread_t uth;
	2442	struct waitq_set *old_wqs;
	2443	vfs_context_t ctx;
	2444	int selres;
	2445	int error;
	2446	int use_offset;
	2447	dev_t dev;
	2448	uint64_t flags;
	2449	uint64_t rsvd, rsvd_arg;
	2450	uint64_t *rlptr = NULL;
	2451
	2452	if (hint != 0) {
	2453	panic("filt_spec(): nonzero hint?");
	2454	}
	2455
	2456	uth = get_bsdthread_info(current_thread());
	2457	ctx = vfs_context_current();
	2458	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
	2459
	2460	error = vnode_getwithvid(vp, kn->kn_hookid);
	2461	if (error != 0) {
	2462	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
	2463	return 1;
	2464	}
	2465
	2466	dev = vnode_specrdev(vp);
	2467	flags = cdevsw_flags[major(dev)];
	2468	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	2469
	2470	/*
	2471	* This function may be called many times to link or re-link the
	2472	* underlying vnode to the kqueue. If we've already linked the two,
	2473	* we will have a valid kn_hook_data which ties us to the underlying
	2474	* device's waitq via a the waitq's prepost table object. However,
	2475	* devices can abort any select action by calling selthreadclear().
	2476	* This is OK because the table object will be invalidated by the
	2477	* driver (through a call to selthreadclear), so any attempt to access
	2478	* the associated waitq will fail because the table object is invalid.
	2479	*
	2480	* Even if we've already registered, we need to pass a pointer
	2481	* to a reserved link structure. Otherwise, selrecord() will
	2482	* infer that we're in the second pass of select() and won't
	2483	* actually do anything!
	2484	*/
	2485	rsvd = rsvd_arg = waitq_link_reserve(NULL);
	2486	rlptr = (void *)&rsvd_arg;
	2487
	2488	/*
	2489	* Trick selrecord() into hooking kqueue's wait queue set
	2490	* set into device's selinfo wait queue
	2491	*/
	2492	old_wqs = uth->uu_wqset;
	2493	uth->uu_wqset = kn->kn_kq->kq_wqs;
	2494	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter),
	2495	0, rlptr, ctx);
	2496	uth->uu_wqset = old_wqs;
	2497
	2498	/*
	2499	* make sure to cleanup the reserved link - this guards against
	2500	* drivers that may not actually call selrecord().
	2501	*/
	2502	waitq_link_release(rsvd);
	2503	if (rsvd != rsvd_arg) {
	2504	/* the driver / handler called selrecord() */
	2505	struct waitq *wq;
	2506	memcpy(&wq, rlptr, sizeof(void *));
	2507
	2508	/*
	2509	* The waitq_get_prepost_id() function will (potentially)
	2510	* allocate a prepost table object for the waitq and return
	2511	* the table object's ID to us. It will also set the
	2512	* waitq_prepost_id field within the waitq structure.
	2513	*
	2514	* We can just overwrite kn_hook_data because it's simply a
	2515	* table ID used to grab a reference when needed.
	2516	*
	2517	* We have a reference on the vnode, so we know that the
	2518	* device won't go away while we get this ID.
	2519	*/
	2520	kn->kn_hook_data = waitq_get_prepost_id(wq);
	2521	}
	2522
	2523	if (use_offset) {
	2524	if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
	2525	kn->kn_data = 0;
	2526	} else {
	2527	kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
	2528	}
	2529	} else {
	2530	kn->kn_data = selres;
	2531	}
	2532
	2533	vnode_put(vp);
	2534
	2535	if ((kn->kn_sfflags & NOTE_LOWAT) != 0)
	2536	return (kn->kn_data >= kn->kn_sdata);
	2537
	2538	return (kn->kn_data != 0);
	2539	}
	2540
	2541	static unsigned
	2542	filt_specpeek(struct knote *kn)
	2543	{
	2544	vnode_t vp;
	2545	uthread_t uth;
	2546	struct waitq_set *old_wqs;
	2547	vfs_context_t ctx;
	2548	int error, selres;
	2549	uint64_t rsvd, rsvd_arg;
	2550	uint64_t *rlptr = NULL;
	2551
	2552	uth = get_bsdthread_info(current_thread());
	2553	ctx = vfs_context_current();
	2554	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
	2555
	2556	error = vnode_getwithvid(vp, kn->kn_hookid);
	2557	if (error != 0) {
	2558	return 1; /* Just like VNOP_SELECT() on recycled vnode */
	2559	}
	2560
	2561	/*
	2562	* Even if we've already registered, we need to pass a pointer
	2563	* to a reserved link structure. Otherwise, selrecord() will
	2564	* infer that we're in the second pass of select() and won't
	2565	* actually do anything!
	2566	*/
	2567	rsvd = rsvd_arg = waitq_link_reserve(NULL);
	2568	rlptr = (void *)&rsvd_arg;
	2569
	2570	old_wqs = uth->uu_wqset;
	2571	uth->uu_wqset = kn->kn_kq->kq_wqs;
	2572	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter),
	2573	0, (void *)rlptr, ctx);
	2574	uth->uu_wqset = old_wqs;
	2575
	2576	/*
	2577	* make sure to cleanup the reserved link - this guards against
	2578	* drivers that may not actually call selrecord()
	2579	*/
	2580	waitq_link_release(rsvd);
	2581	if (rsvd != rsvd_arg) {
	2582	/* the driver / handler called selrecord() */
	2583	struct waitq *wq;
	2584	memcpy(&wq, rlptr, sizeof(void *));
	2585
	2586	/*
	2587	* The waitq_get_prepost_id() function will (potentially)
	2588	* allocate a prepost table object for the waitq and return
	2589	* the table object's ID to us. It will also set the
	2590	* waitq_prepost_id field within the waitq structure.
	2591	*
	2592	* We can just overwrite kn_hook_data because it's simply a
	2593	* table ID used to grab a reference when needed.
	2594	*
	2595	* We have a reference on the vnode, so we know that the
	2596	* device won't go away while we get this ID.
	2597	*/
	2598	kn->kn_hook_data = waitq_get_prepost_id(wq);
	2599	}
	2600
	2601	vnode_put(vp);
	2602	return selres;
	2603	}
	2604