/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>
/* XXX following prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);
struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";
#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc *)NULL, (int (*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };
static void set_blocksize(vnode_t, dev_t);
/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{
	*ap->a_vpp = NULL;
	return (ENOTDIR);
}
static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)			/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	} else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{
	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		set_blocksize(vp, dev);
	}
}
/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
			}
		}

		if (cdevsw[maj].d_type == D_TTY) {
			vp->v_flag |= VISTTY;
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFCHR);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (error == 0) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */
				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = ENXIO;
			}
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */
		}
		return (error);

	default:
		panic("spec_open type");
	}
	return (0);
}
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");

	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
					       (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vp->v_speclastr = bn;

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */
	return (0);
}
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;

		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are reading exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The read doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the read to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */
	return (0);
}
/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	case VBLK:
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}
int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}
static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return err;
}
/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

// the low priority process may wait for at most LOWPRI_MAX_DELAY milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS	100
#define LOWPRI_WINDOW_MSECS_INC		50
#define LOWPRI_MAX_WINDOW_MSECS		200
#define LOWPRI_MAX_WAITING_MSECS	200

#if CONFIG_EMBEDDED
#define LOWPRI_SLEEP_INTERVAL		5
#else
#define LOWPRI_SLEEP_INTERVAL		2
#endif

struct _throttle_io_info_t {
	struct timeval	last_normal_IO_timestamp;
	struct timeval	last_IO_timestamp;
	SInt32		numthreads_throttling;
	SInt32		refcnt;
	SInt32		alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int	lowpri_IO_initial_window_msecs	= LOWPRI_INITIAL_WINDOW_MSECS;
int	lowpri_IO_window_msecs_inc	= LOWPRI_WINDOW_MSECS_INC;
int	lowpri_max_window_msecs		= LOWPRI_MAX_WINDOW_MSECS;
int	lowpri_max_waiting_msecs	= LOWPRI_MAX_WAITING_MSECS;

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)	\
	do {							\
		if ((debug_info)->alloc)			\
			printf("%s: "format, __FUNCTION__, ## args); \
	} while (0)
#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit
	 */
	int count = 0;

	if (n == 0)
		return sizeof(n) * 8;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}
/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue - 1), info );

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once reference count is zero, no one else should be able to take a
	 * reference
	 */
	if ((info->refcnt == 0) && (info->alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
		FREE(info, M_TEMP);
	}
	return oldValue;
}
/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue - 1), info );
	/* Allocated items should never have a reference of zero */
	if (info->alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}
/*
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
	info->alloc = TRUE;
	/* Take a reference */
	OSIncrementAtomic(&info->refcnt);

	return info;
}
/*
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release reference taken by throttle_info_create
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
		(struct _throttle_io_info_t *)throttle_info,
		(struct _throttle_io_info_t *)throttle_info);
	if (throttle_info) /* Just to be careful */
		throttle_info_rel(throttle_info);
}
/*
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);
	/* We already have a reference; release it before adding the new one */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}
/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask.  The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask,
			  throttle_info_handle_t *throttle_info_handle)
{
	int dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t **)throttle_info_handle = info;

	return 0;
}
/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/* for now the handle is just a pointer to _throttle_io_info_t */
	throttle_info_rel((struct _throttle_io_info_t *)throttle_info_handle);
}
/*
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}
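/*
 * Editor's sketch (not part of the original source): the intended usage
 * pattern for a file system, per the comments above: create/mount_ref at
 * mount time, mount_rel/release at unmount time.  Names prefixed with
 * "examplefs_" are hypothetical.
 */
#if 0	/* illustrative only */
struct examplefs_mount {
	void	*tm_throttle_info;	/* from throttle_info_create() */
};

static int
examplefs_mount(mount_t mp, struct examplefs_mount *tmp)
{
	tmp->tm_throttle_info = throttle_info_create();
	if (tmp->tm_throttle_info)
		throttle_info_mount_ref(mp, tmp->tm_throttle_info);
	return 0;
}

static int
examplefs_unmount(mount_t mp, struct examplefs_mount *tmp)
{
	throttle_info_mount_rel(mp);
	if (tmp->tm_throttle_info) {
		/* matches the create done at mount time */
		throttle_info_release(tmp->tm_throttle_info);
		tmp->tm_throttle_info = NULL;
	}
	return 0;
}
#endif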
void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->last_IO_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->last_IO_timestamp);
}
#if CONFIG_EMBEDDED

int throttle_get_io_policy(struct uthread **ut)
{
	int policy = IOPOL_DEFAULT;
	proc_t p = current_proc();

	*ut = get_bsdthread_info(current_thread());

	if (p != NULL)
		policy = p->p_iopol_disk;

	if (*ut != NULL) {
		// the I/O policy of the thread overrides that of the process
		// unless the I/O policy of the thread is default
		if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
			policy = (*ut)->uu_iopol_disk;
	}
	return policy;
}

#else

int throttle_get_io_policy(__unused struct uthread **ut)
{
	*ut = get_bsdthread_info(current_thread());

	return (proc_get_task_selfdiskacc());
}

#endif
static int
throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void *throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	int elapsed_msecs;
	int policy;
	struct uthread *ut;

	policy = throttle_get_io_policy(&ut);

	if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
		return (0);

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->last_normal_IO_timestamp);
	elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

	if (lowpri_window_msecs == -1)	// use the max waiting time
		lowpri_window_msecs = lowpri_max_waiting_msecs;

	return elapsed_msecs < lowpri_window_msecs;
}
/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
{
	void *info;

	/* Should we just return zero if no mount point */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
}
int
throttle_lowpri_io(int sleep_amount)
{
	int sleep_cnt = 0;
	int numthreads_throttling;
	int max_try_num;
	struct uthread *ut;
	struct _throttle_io_info_t *info;
	int max_waiting_msecs;
	SInt32 oldValue;

	ut = get_bsdthread_info(current_thread());

	if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
		goto done;

	info = ut->uu_throttle_info;

	if (sleep_amount != 0) {
#if CONFIG_EMBEDDED
		max_waiting_msecs = lowpri_max_waiting_msecs;
#else
		if (ut->uu_throttle_isssd == TRUE)
			max_waiting_msecs = lowpri_max_waiting_msecs / 100;
		else
			max_waiting_msecs = lowpri_max_waiting_msecs;
#endif
		if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
			max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;

		numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
		max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);

		for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
			if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
				if (sleep_cnt == 0) {
					KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
							      ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
				}
				IOSleep(LOWPRI_SLEEP_INTERVAL);
				DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
			} else {
				break;
			}
		}
		if (sleep_cnt) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
					      ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
		}
	}
	oldValue = OSDecrementAtomic(&info->numthreads_throttling);

	if (oldValue <= 0) {
		panic("%s: numthreads negative", __func__);
	}
done:
	ut->uu_lowpri_window = 0;
	if (ut->uu_throttle_info)
		throttle_info_rel(ut->uu_throttle_info);
	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;

	return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
}
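/*
 * Worked example (editor's illustration, not part of the original source):
 * with the non-embedded defaults above, LOWPRI_SLEEP_INTERVAL is 2 ms and
 * lowpri_max_waiting_msecs is 200.  A single throttling thread on a non-SSD
 * gets max_try_num = 200 / 2 * 1 = 100 loop iterations, i.e. at most
 * 100 * 2 = 200 ms of accumulated IOSleep().  On an SSD, max_waiting_msecs
 * becomes 200 / 100 = 2 ms, so max_try_num = 2 / 2 * 1 = 1 and the thread
 * sleeps at most one 2 ms interval before being released.
 */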
/*
 * set a kernel thread's IO policy.  policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
#if !CONFIG_EMBEDDED
	proc_apply_thread_selfdiskacc(policy);
#else /* !CONFIG_EMBEDDED */
	struct uthread *ut;

	ut = get_bsdthread_info(current_thread());
	ut->uu_iopol_disk = policy;
#endif /* !CONFIG_EMBEDDED */
}
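/*
 * Editor's sketch (not part of the original source): a kernel thread doing
 * low-importance background I/O could mark itself before issuing that I/O
 * and restore the default policy afterwards.  "example_background_work" is
 * hypothetical.
 */
#if 0	/* illustrative only */
static void
example_background_work(void)
{
	throttle_set_thread_io_policy(IOPOL_THROTTLE);
	/* ... issue low priority reads/writes here ... */
	throttle_set_thread_io_policy(IOPOL_NORMAL);
}
#endif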
void throttle_info_reset_window(struct uthread *ut)
{
	struct _throttle_io_info_t *info;

	info = ut->uu_throttle_info;

	OSDecrementAtomic(&info->numthreads_throttling);
	throttle_info_rel(info);
	ut->uu_throttle_info = NULL;
	ut->uu_lowpri_window = 0;
}
void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
{
	SInt32 oldValue;

	ut->uu_throttle_info = info;
	throttle_info_ref(info);
	DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );

	oldValue = OSIncrementAtomic(&info->numthreads_throttling);
	if (oldValue < 0) {
		panic("%s: numthreads negative", __func__);
	}
	ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
	ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
	ut->uu_throttle_isssd = isssd;
	ut->uu_throttle_bc = BC_throttle;
}
void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct uthread *ut;
	int policy;
	int is_throttleable_io = 0;
	int is_passive_io = 0;

	if (!lowpri_IO_initial_window_msecs || (info == NULL))
		return;
	policy = throttle_get_io_policy(&ut);

	switch (policy) {
	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		break;
	case IOPOL_THROTTLE:
		is_throttleable_io = 1;
		break;
	case IOPOL_PASSIVE:
		is_passive_io = 1;
		break;
	default:
		printf("unknown I/O policy %d", policy);
		break;
	}

	if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
		is_passive_io = 1;

	if (!is_throttleable_io) {
		if (!is_passive_io) {
			microuptime(&info->last_normal_IO_timestamp);
		}
	} else if (ut) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (ut->uu_lowpri_window == 0)
			throttle_info_set_initial_window(ut, info, isssd, FALSE);
		else {
			/* The thread sends I/Os to different devices within the same system call */
			if (ut->uu_throttle_info != info) {
				struct _throttle_io_info_t *old_info = ut->uu_throttle_info;

				// keep track of the numthreads in the right device
				OSDecrementAtomic(&old_info->numthreads_throttling);
				OSIncrementAtomic(&info->numthreads_throttling);

				DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
				DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
				/* This thread no longer needs a reference on that throttle info */
				throttle_info_rel(ut->uu_throttle_info);
				ut->uu_throttle_info = info;
				/* Need to take a reference on this throttle info */
				throttle_info_ref(ut->uu_throttle_info);
			}
			int numthreads = MAX(1, info->numthreads_throttling);
			ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
			if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
				ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;

			if (isssd == FALSE) {
				/*
				 * we're here because we've actually issued I/Os to different devices...
				 * if at least one of them was a non SSD, then throttle the thread
				 * using the policy for non SSDs
				 */
				ut->uu_throttle_isssd = FALSE;
			}
		}
	}
}
/*
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
	throttle_info_update_internal(throttle_info, flags, FALSE);
}
/*
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;
	/*
	 * for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info, flags);
}
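/*
 * Editor's sketch (not part of the original source): how a caller that only
 * knows a device's throttle mask might use the mask-based KPI above.  The
 * "example_" names are hypothetical.
 */
#if 0	/* illustrative only */
static int
example_update_by_mask(uint64_t throttle_mask, int flags)
{
	throttle_info_handle_t handle;
	int error;

	/* take a handle (and a reference) for the device selected by the mask */
	error = throttle_info_ref_by_mask(throttle_mask, &handle);
	if (error)
		return error;

	/* note the I/O for throttled-I/O book keeping, then issue it */
	throttle_info_update_by_mask(handle, flags);
	/* ... issue the actual I/O here ... */

	/* drop the reference taken by throttle_info_ref_by_mask() */
	throttle_info_rel_by_mask(handle);
	return 0;
}
#endif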
extern int ignore_is_ssd;
int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	mount_t mp;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;

	policy = throttle_get_io_policy(&ut);

	if (policy == IOPOL_THROTTLE) {
		bp->b_flags |= B_THROTTLED_IO;
		bp->b_flags &= ~B_PASSIVE;
	} else if (policy == IOPOL_PASSIVE)
		bp->b_flags |= B_PASSIVE;

	bflags = bp->b_flags;

	if (kdebug_enable) {
		int code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		if (bflags & B_THROTTLED_IO)
			code |= DKIO_THROTTLE;
		else if (bflags & B_PASSIVE)
			code |= DKIO_PASSIVE;

		KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
				      bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->last_IO_timestamp);
		if (mp) {
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}

	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
	typedef	int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t *)bdevsw[major(bdev)].d_strategy)(bp);

	if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
	}
	return (0);
}
/*
 * This is a noop, simply returning what one has been given.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}
/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 1 (this is the very
		 * last close)
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 1) &&
			    (vp == sessp->s_ttyvp)) {

				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
				}
				session_unlock(sessp);
			}
			session_rele(sessp);
		}

		devsw_lock(dev, S_IFCHR);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}
		/*
		 * close on last reference or on vnode revoke call
		 */
		if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
			devsw_unlock(dev, S_IFCHR);
			return (0);
		}

		error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}

		if (vcount(vp) > 0) {
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}

		error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return (EBADF);
	}

	return error;
}
/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{
	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}
/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{
	return (EBADF);
}
/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */
	return (0);
}
/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */
	return (0);
}
static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_attach	= filt_specattach,
	.f_detach	= filt_specdetach,
	.f_event	= filt_spec,
	.f_peek		= filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
	default:
		panic("filt_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}
static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) > nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}
static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink.  This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_status &= ~KN_STAYQUEUED;
}
static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}
static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1; /* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here?  Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;