/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <sys/kdebug.h>
/* XXX following prototypes should be in a header file somewhere */
extern dev_t chrtoblk(dev_t dev);
extern int iskmemdev(dev_t dev);
extern int bpfkqfilter(dev_t dev, struct knote *kn);
extern int ptsd_kqfilter(dev_t dev, struct knote *kn);

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char devopn[]  = "devopn";
char devio[]   = "devio";
char devwait[] = "devwait";
char devin[]   = "devin";
char devout[]  = "devout";
char devioc[]  = "devioc";
char devcls[]  = "devcls";
#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
    { &vnop_default_desc, (VOPFUNC)vn_default_error },
    { &vnop_lookup_desc, (VOPFUNC)spec_lookup },        /* lookup */
    { &vnop_create_desc, (VOPFUNC)err_create },         /* create */
    { &vnop_mknod_desc, (VOPFUNC)err_mknod },           /* mknod */
    { &vnop_open_desc, (VOPFUNC)spec_open },            /* open */
    { &vnop_close_desc, (VOPFUNC)spec_close },          /* close */
    { &vnop_access_desc, (VOPFUNC)spec_access },        /* access */
    { &vnop_getattr_desc, (VOPFUNC)spec_getattr },      /* getattr */
    { &vnop_setattr_desc, (VOPFUNC)spec_setattr },      /* setattr */
    { &vnop_read_desc, (VOPFUNC)spec_read },            /* read */
    { &vnop_write_desc, (VOPFUNC)spec_write },          /* write */
    { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },          /* ioctl */
    { &vnop_select_desc, (VOPFUNC)spec_select },        /* select */
    { &vnop_revoke_desc, (VOPFUNC)nop_revoke },         /* revoke */
    { &vnop_mmap_desc, (VOPFUNC)err_mmap },             /* mmap */
    { &vnop_fsync_desc, (VOPFUNC)spec_fsync },          /* fsync */
    { &vnop_remove_desc, (VOPFUNC)err_remove },         /* remove */
    { &vnop_link_desc, (VOPFUNC)err_link },             /* link */
    { &vnop_rename_desc, (VOPFUNC)err_rename },         /* rename */
    { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },           /* mkdir */
    { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },           /* rmdir */
    { &vnop_symlink_desc, (VOPFUNC)err_symlink },       /* symlink */
    { &vnop_readdir_desc, (VOPFUNC)err_readdir },       /* readdir */
    { &vnop_readlink_desc, (VOPFUNC)err_readlink },     /* readlink */
    { &vnop_inactive_desc, (VOPFUNC)nop_inactive },     /* inactive */
    { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },       /* reclaim */
    { &vnop_strategy_desc, (VOPFUNC)spec_strategy },    /* strategy */
    { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },    /* pathconf */
    { &vnop_advlock_desc, (VOPFUNC)err_advlock },       /* advlock */
    { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },        /* bwrite */
    { &vnop_pagein_desc, (VOPFUNC)err_pagein },         /* Pagein */
    { &vnop_pageout_desc, (VOPFUNC)err_pageout },       /* Pageout */
    { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },     /* Copyfile */
    { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },    /* blktooff */
    { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },    /* offtoblk */
    { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },    /* blockmap */
    { (struct vnodeop_desc *)NULL, (int (*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
    { &spec_vnodeop_p, spec_vnodeop_entries };
static void set_blocksize(vnode_t, dev_t);

struct _throttle_io_info_t {
    struct timeval last_normal_IO_timestamp;
    struct timeval last_IO_timestamp;
    SInt32 numthreads_throttling;
    SInt32 refcnt;      /* manipulated by throttle_info_ref/throttle_info_rel below */
    SInt32 alloc;       /* set when allocated by throttle_info_create */
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];

static void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd);
/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{
    *ap->a_vpp = NULL;
    return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
    int (*size)(dev_t);
    int rsize;

    if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
        rsize = (*size)(dev);
        if (rsize <= 0)         /* did size fail? */
            vp->v_specsize = DEV_BSIZE;
        else
            vp->v_specsize = rsize;
    } else
        vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{
    if (vp->v_type == VBLK) {
        dev_t dev = (dev_t)vp->v_rdev;
        int maj = major(dev);

        if ((u_int)maj >= (u_int)nblkdev)
            return;

        set_blocksize(vp, dev);
    }
}
/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
    struct proc *p = vfs_context_proc(ap->a_context);
    kauth_cred_t cred = vfs_context_ucred(ap->a_context);
    struct vnode *vp = ap->a_vp;
    dev_t bdev, dev = (dev_t)vp->v_rdev;
    int maj = major(dev);
    int error;

    /*
     * Don't allow open if fs is mounted -nodev.
     */
    if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
        return (ENXIO);

    switch (vp->v_type) {

    case VCHR:
        if ((u_int)maj >= (u_int)nchrdev)
            return (ENXIO);
        if (cred != FSCRED && (ap->a_mode & FWRITE)) {
            /*
             * When running in very secure mode, do not allow
             * opens for writing of any disk character devices.
             */
            if (securelevel >= 2 && isdisk(dev, VCHR))
                return (EPERM);
            /*
             * When running in secure mode, do not allow opens
             * for writing of /dev/mem, /dev/kmem, or character
             * devices whose corresponding block devices are
             * currently mounted.
             */
            if (securelevel >= 1) {
                if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
                    return (error);
            }
        }

        if (cdevsw[maj].d_type == D_TTY) {
            vp->v_flag |= VISTTY;
        }

        devsw_lock(dev, S_IFCHR);
        error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

        if (error == 0) {
            vp->v_specinfo->si_opencount++;
        }

        devsw_unlock(dev, S_IFCHR);

        if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
            int isssd = 0;
            uint64_t throttle_mask = 0;
            uint32_t devbsdunit = 0;

            if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
                if (VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
                    /*
                     * as a reasonable approximation, only use the lowest bit of the mask
                     * to generate a disk unit number
                     */
                    devbsdunit = num_trailing_0(throttle_mask);

                    vp->v_un.vu_specinfo->si_isssd = isssd;
                    vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
                    vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
                    vp->v_un.vu_specinfo->si_throttleable = 1;
                    vp->v_un.vu_specinfo->si_initted = 1;
                }
            }
            if (vp->v_un.vu_specinfo->si_initted == 0) {
                vp->v_un.vu_specinfo->si_initted = 1;
            }
        }
        return (error);

    case VBLK:
        if ((u_int)maj >= (u_int)nblkdev)
            return (ENXIO);
        /*
         * When running in very secure mode, do not allow
         * opens for writing of any disk block devices.
         */
        if (securelevel >= 2 && cred != FSCRED &&
            (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
            return (EPERM);
        /*
         * Do not allow opens of block devices that are
         * currently mounted.
         */
        if ( (error = vfs_mountedon(vp)) )
            return (error);

        devsw_lock(dev, S_IFBLK);
        error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
        if (!error) {
            vp->v_specinfo->si_opencount++;
        }
        devsw_unlock(dev, S_IFBLK);

        if (!error) {
            u_int64_t blkcnt;
            u_int32_t blksize;
            int setsize = 0;
            u_int32_t size512 = 512;

            if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
                /* Switch to 512 byte sectors (temporarily) */
                if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
                    /* Get the number of 512 byte physical blocks. */
                    if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
                        setsize = 1;
                    }
                }
                /* If it doesn't set back, we can't recover */
                if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
                    error = ENXIO;
            }

            set_blocksize(vp, dev);

            /*
             * Cache the size in bytes of the block device for later
             * use by spec_write().
             */
            if (setsize)
                vp->v_specdevsize = blkcnt * (u_int64_t)size512;
            else
                vp->v_specdevsize = (u_int64_t)0;   /* Default: Can't get */
        }
        return (error);

    default:
        panic("spec_open type");
    }
    return (0);
}
int
spec_read(struct vnop_read_args *ap)
{
    struct vnode *vp = ap->a_vp;
    struct uio *uio = ap->a_uio;
    struct buf *bp;
    daddr64_t bn, nextbn;
    long bsize, bscale;
    int devBlockSize = 0;
    int n, on;
    int error = 0;

#if DIAGNOSTIC
    if (uio->uio_rw != UIO_READ)
        panic("spec_read mode");
    if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
        panic("spec_read proc");
#endif
    if (uio_resid(uio) == 0)
        return (0);

    switch (vp->v_type) {

    case VCHR:
        if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
            struct _throttle_io_info_t *throttle_info;

            throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

            throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);
        }
        error = (*cdevsw[major(vp->v_rdev)].d_read)
            (vp->v_rdev, uio, ap->a_ioflag);

        return (error);

    case VBLK:
        if (uio->uio_offset < 0)
            return (EINVAL);

        devBlockSize = vp->v_specsize;

        if (devBlockSize > PAGE_SIZE)
            return (EINVAL);

        bscale = PAGE_SIZE / devBlockSize;
        bsize = bscale * devBlockSize;

        do {
            on = uio->uio_offset % bsize;

            bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

            if (vp->v_speclastr + bscale == bn) {
                nextbn = bn + bscale;
                error = buf_breadn(vp, bn, (int)bsize, &nextbn,
                    (int *)&bsize, 1, NOCRED, &bp);
            } else
                error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

            vp->v_speclastr = bn;

            n = bsize - buf_resid(bp);
            if ((on > n) || error) {
                if (!error)
                    error = EINVAL;
                buf_brelse(bp);
                return (error);
            }
            n = min((unsigned)(n - on), uio_resid(uio));

            error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
            buf_brelse(bp);
        } while (error == 0 && uio_resid(uio) > 0 && n != 0);
        return (error);

    default:
        panic("spec_read type");
    }
    /* NOTREACHED */
    return (0);
}
int
spec_write(struct vnop_write_args *ap)
{
    struct vnode *vp = ap->a_vp;
    struct uio *uio = ap->a_uio;
    struct buf *bp;
    daddr64_t bn;
    int bsize, blkmask, bscale;
    int io_sync;
    int devBlockSize = 0;
    int n, on;
    int error = 0;

#if DIAGNOSTIC
    if (uio->uio_rw != UIO_WRITE)
        panic("spec_write mode");
    if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
        panic("spec_write proc");
#endif

    switch (vp->v_type) {

    case VCHR:
        if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
            struct _throttle_io_info_t *throttle_info;

            throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

            throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);

            microuptime(&throttle_info->last_IO_timestamp);
        }
        error = (*cdevsw[major(vp->v_rdev)].d_write)
            (vp->v_rdev, uio, ap->a_ioflag);

        return (error);

    case VBLK:
        if (uio_resid(uio) == 0)
            return (0);
        if (uio->uio_offset < 0)
            return (EINVAL);

        io_sync = (ap->a_ioflag & IO_SYNC);

        devBlockSize = vp->v_specsize;
        if (devBlockSize > PAGE_SIZE)
            return (EINVAL);

        bscale = PAGE_SIZE / devBlockSize;
        blkmask = bscale - 1;
        bsize = bscale * devBlockSize;

        do {
            bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
            on = uio->uio_offset % bsize;

            n = min((unsigned)(bsize - on), uio_resid(uio));

            /*
             * Use buf_getblk() as an optimization IFF:
             *
             * 1) We are reading exactly a block on a block
             *    aligned boundary
             * 2) We know the size of the device from spec_open
             * 3) The read doesn't span the end of the device
             *
             * Otherwise, we fall back on buf_bread().
             */
            if (n == bsize &&
                vp->v_specdevsize != (u_int64_t)0 &&
                (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
                /* reduce the size of the read to what is there */
                n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
            }

            if (n == bsize)
                bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
            else
                error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

            /* Translate downstream error for upstream, if needed */
            if (!error)
                error = (int)buf_error(bp);
            if (error) {
                buf_brelse(bp);
                return (error);
            }
            n = min(n, bsize - buf_resid(bp));

            error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
            if (error) {
                buf_brelse(bp);
                return (error);
            }

            if (io_sync)
                error = buf_bwrite(bp);
            else {
                if ((n + on) == bsize)
                    error = buf_bawrite(bp);
                else
                    error = buf_bdwrite(bp);
            }
        } while (error == 0 && uio_resid(uio) > 0 && n != 0);

        return (error);

    default:
        panic("spec_write type");
    }
    /* NOTREACHED */
    return (0);
}
/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
    proc_t p = vfs_context_proc(ap->a_context);
    dev_t dev = ap->a_vp->v_rdev;
    int retval = 0;

    KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
        (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

    switch (ap->a_vp->v_type) {

    case VCHR:
        retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
            ap->a_fflag, p);
        break;

    case VBLK:
        retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
            ap->a_fflag, p);
        break;

    default:
        panic("spec_ioctl");
        /* NOTREACHED */
    }
    KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
        (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

    return (retval);
}
int
spec_select(struct vnop_select_args *ap)
{
    proc_t p = vfs_context_proc(ap->a_context);
    dev_t dev;

    switch (ap->a_vp->v_type) {

    default:
        return (1);     /* XXX */

    case VCHR:
        dev = ap->a_vp->v_rdev;
        return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
    }
}
static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
    dev_t dev;
    int err = EINVAL;

    /*
     * For a few special kinds of devices, we can attach knotes.
     * Each filter function must check whether the dev type matches it.
     */
    dev = vnode_specrdev(vp);

    if (vnode_istty(vp)) {
        /* We can hook into TTYs... */
        err = filt_specattach(kn);
    } else {
        /* Try a bpf device, as defined in bsd/net/bpf.c */
        err = bpfkqfilter(dev, kn);
    }

    return err;
}
/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
    if (vp->v_type == VCHR)
        return (0);
    /*
     * Flush all dirty buffers associated with a block device.
     */
    buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

    return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
    return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

// a low-priority process may wait for at most LOWPRI_MAX_DELAY milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS 100
#define LOWPRI_WINDOW_MSECS_INC     50
#define LOWPRI_MAX_WINDOW_MSECS     200
#define LOWPRI_MAX_WAITING_MSECS    200

#if CONFIG_EMBEDDED
#define LOWPRI_SLEEP_INTERVAL 5
#else
#define LOWPRI_SLEEP_INTERVAL 2
#endif

int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int lowpri_IO_window_msecs_inc     = LOWPRI_WINDOW_MSECS_INC;
int lowpri_max_window_msecs        = LOWPRI_MAX_WINDOW_MSECS;
int lowpri_max_waiting_msecs       = LOWPRI_MAX_WAITING_MSECS;

#if DEBUG
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)      \
    do {                                                            \
        if ((debug_info)->alloc)                                    \
            printf("%s: "format, __FUNCTION__, ## args);            \
    } while (0)
#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
    /*
     * since in most cases the number of trailing 0s is very small,
     * we simply count sequentially from the lowest bit
     */
    if (n == 0)
        return sizeof(n) * 8;

    int count = 0;
    while (!ISSET(n, 1)) {
        n >>= 1;
        count++;
    }
    return count;
}
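/*
 * Illustrative sketch (not part of the original file): how the throttle mask
 * reported by DKIOCGETTHROTTLEMASK maps to a slot in _throttle_io_info[], the
 * same way spec_open() does above.  The mask value and the example_* name are
 * hypothetical.
 */
#if 0
static void
example_mask_to_throttle_slot(void)
{
    uint64_t throttle_mask = 0x8;                           /* lowest set bit is bit 3 */
    uint32_t devbsdunit = num_trailing_0(throttle_mask);    /* yields 3 */
    struct _throttle_io_info_t *info = &_throttle_io_info[devbsdunit];

    (void)info;   /* spec_read()/spec_write() pass this to throttle_info_update_internal() */
}
#endif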
/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_rel(struct _throttle_io_info_t *info)
{
    SInt32 oldValue = OSDecrementAtomic(&info->refcnt);

    DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
        info, (int)(oldValue - 1), info);

    /* The reference count just went negative, very bad */
    if (oldValue == 0)
        panic("throttle info ref cnt went negative!");

    /*
     * Once reference count is zero, no one else should be able to take a
     * reference
     */
    if ((info->refcnt == 0) && (info->alloc)) {
        DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info);
        FREE(info, M_TEMP);
    }
    return oldValue;
}
/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
    SInt32 oldValue = OSIncrementAtomic(&info->refcnt);

    DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
        info, (int)(oldValue - 1), info);
    /* Allocated items should never have a reference of zero */
    if (info->alloc && (oldValue == 0))
        panic("Taking a reference without calling create throttle info!\n");

    return oldValue;
}
/*
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
    struct _throttle_io_info_t *info;

    MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
    /* Should never happen but just in case */
    if (info == NULL)
        return NULL;
    /* Mark that this one was allocated and needs to be freed */
    DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
    info->alloc = TRUE;
    /* Take a reference */
    OSIncrementAtomic(&info->refcnt);
    return info;
}
/*
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
    DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
        (struct _throttle_io_info_t *)throttle_info,
        (struct _throttle_io_info_t *)throttle_info);
    if (throttle_info) /* Just to be careful */
        throttle_info_rel(throttle_info);
}
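/*
 * Illustrative sketch (not part of the original file): the create/release
 * discipline described above.  A file system that calls throttle_info_create()
 * owns one reference and must balance it with throttle_info_release().
 * The example_* name is hypothetical.
 */
#if 0
static void
example_throttle_info_lifetime(void)
{
    void *ti = throttle_info_create();

    if (ti == NULL)
        return;
    /* ... tag I/Os issued against this device ... */
    throttle_info_update(ti, 0);
    /* drop the reference taken by throttle_info_create() */
    throttle_info_release(ti);
}
#endif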
/*
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
    if ((throttle_info == NULL) || (mp == NULL))
        return;
    throttle_info_ref(throttle_info);
    /* If we already have a reference, release it before adding the new one */
    if (mp->mnt_throttle_info)
        throttle_info_rel(mp->mnt_throttle_info);
    mp->mnt_throttle_info = throttle_info;
}
/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask.  The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask,
    throttle_info_handle_t *throttle_info_handle)
{
    int dev_index;
    struct _throttle_io_info_t *info;

    if (throttle_info_handle == NULL)
        return EINVAL;

    dev_index = num_trailing_0(throttle_mask);
    info = &_throttle_io_info[dev_index];
    throttle_info_ref(info);
    *(struct _throttle_io_info_t **)throttle_info_handle = info;
    return 0;
}
/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
    /* for now the handle is just a pointer to _throttle_io_info_t */
    throttle_info_rel((struct _throttle_io_info_t *)throttle_info_handle);
}
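/*
 * Illustrative sketch (not part of the original file): the handle-based
 * private KPI pair above.  A caller that only knows the throttle mask takes a
 * reference on the matching info slot and must drop it with
 * throttle_info_rel_by_mask().  The example_* name is hypothetical.
 */
#if 0
static void
example_ref_by_mask(uint64_t throttle_mask)
{
    throttle_info_handle_t handle;

    if (throttle_info_ref_by_mask(throttle_mask, &handle) == 0) {
        throttle_info_update_by_mask(handle, 0);   /* bookkeeping for an I/O */
        throttle_info_rel_by_mask(handle);
    }
}
#endif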
/*
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
    if (mp->mnt_throttle_info)
        throttle_info_rel(mp->mnt_throttle_info);
    mp->mnt_throttle_info = NULL;
}
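/*
 * Illustrative sketch (not part of the original file): how a file system's
 * mount and unmount paths would pair throttle_info_mount_ref() with
 * throttle_info_mount_rel(), per the comments above.  The example_fs_* names
 * and the ti argument are hypothetical.
 */
#if 0
static void
example_fs_mount(mount_t mp, void *ti)     /* ti obtained from throttle_info_create() */
{
    throttle_info_mount_ref(mp, ti);       /* mount now holds its own reference */
}

static void
example_fs_unmount(mount_t mp)
{
    throttle_info_mount_rel(mp);           /* drops the mount's reference */
}
#endif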
void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
    struct _throttle_io_info_t *info;

    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    *tv = info->last_IO_timestamp;
}

void
update_last_io_time(mount_t mp)
{
    struct _throttle_io_info_t *info;

    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    microuptime(&info->last_IO_timestamp);
}
#if CONFIG_EMBEDDED
int throttle_get_io_policy(struct uthread **ut)
{
    int policy = IOPOL_DEFAULT;
    proc_t p = current_proc();

    *ut = get_bsdthread_info(current_thread());

    if (p != NULL)
        policy = p->p_iopol_disk;

    if (*ut != NULL) {
        // the I/O policy of the thread overrides that of the process
        // unless the I/O policy of the thread is default
        if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
            policy = (*ut)->uu_iopol_disk;
    }
    return policy;
}
#else
int throttle_get_io_policy(__unused struct uthread **ut)
{
    *ut = get_bsdthread_info(current_thread());

    return (proc_get_task_selfdiskacc());
}
#endif
static int
throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void *throttle_info)
{
    struct _throttle_io_info_t *info = throttle_info;
    struct timeval elapsed;
    int elapsed_msecs;
    int policy;
    struct uthread *ut;

    policy = throttle_get_io_policy(&ut);

    if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
        return (0);

    microuptime(&elapsed);
    timevalsub(&elapsed, &info->last_normal_IO_timestamp);
    elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

    if (lowpri_window_msecs == -1)  // use the max waiting time
        lowpri_window_msecs = lowpri_max_waiting_msecs;

    return elapsed_msecs < lowpri_window_msecs;
}
/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
{
    void *info;

    /* Should we just return zero if no mount point */
    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
}
int
throttle_lowpri_io(int sleep_amount)
{
    int sleep_cnt = 0;
    int numthreads_throttling;
    int max_try_num;
    struct uthread *ut;
    struct _throttle_io_info_t *info;
    int max_waiting_msecs;
    SInt32 oldValue;

    ut = get_bsdthread_info(current_thread());

    if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
        goto done;

    info = ut->uu_throttle_info;

    if (sleep_amount != 0) {
#if CONFIG_EMBEDDED
        max_waiting_msecs = lowpri_max_waiting_msecs;
#else
        if (ut->uu_throttle_isssd == TRUE)
            max_waiting_msecs = lowpri_max_waiting_msecs / 100;
        else
            max_waiting_msecs = lowpri_max_waiting_msecs;
#endif
        if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
            max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;

        numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
        max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);

        for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
            if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
                if (sleep_cnt == 0) {
                    KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
                        ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
                }
                IOSleep(LOWPRI_SLEEP_INTERVAL);
                DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info);
            } else {
                break;
            }
        }
        if (sleep_cnt) {
            KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
                ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
        }
    }

    oldValue = OSDecrementAtomic(&info->numthreads_throttling);

    if (oldValue <= 0) {
        panic("%s: numthreads negative", __func__);
    }
done:
    ut->uu_lowpri_window = 0;
    if (ut->uu_throttle_info)
        throttle_info_rel(ut->uu_throttle_info);
    ut->uu_throttle_info = NULL;
    ut->uu_throttle_bc = FALSE;

    return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
}
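/*
 * Illustrative sketch (not part of the original file): the two-phase pattern
 * implemented above.  The I/O path only marks the uthread (via
 * throttle_info_update_internal()); the delay itself is taken by calling
 * throttle_lowpri_io() just before returning from the system call that
 * triggered the I/O.  example_syscall_return_path() is hypothetical.
 */
#if 0
static void
example_syscall_return_path(void)
{
    int slept_ms;

    /* ... I/O was issued earlier in this system call ... */
    slept_ms = throttle_lowpri_io(1);   /* sleeps only if a throttle window is open */
    (void)slept_ms;
}
#endif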
/*
 * set a kernel thread's IO policy.  policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
#if !CONFIG_EMBEDDED
    proc_apply_thread_selfdiskacc(policy);
#else /* !CONFIG_EMBEDDED */
    struct uthread *ut;

    ut = get_bsdthread_info(current_thread());
    ut->uu_iopol_disk = policy;
#endif /* !CONFIG_EMBEDDED */
}
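/*
 * Illustrative sketch (not part of the original file): a kernel thread opting
 * itself into throttled I/O with the KPI above.  example_background_worker()
 * is hypothetical.
 */
#if 0
static void
example_background_worker(void)
{
    throttle_set_thread_io_policy(IOPOL_THROTTLE);  /* low-priority I/O from here on */
    /* ... issue I/O that may now be throttled ... */
    throttle_set_thread_io_policy(IOPOL_NORMAL);
}
#endif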
static void throttle_info_reset_window(struct uthread *ut)
{
    struct _throttle_io_info_t *info;

    info = ut->uu_throttle_info;

    OSDecrementAtomic(&info->numthreads_throttling);
    throttle_info_rel(info);
    ut->uu_throttle_info = NULL;
    ut->uu_lowpri_window = 0;
}
static void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
{
    SInt32 oldValue;

    ut->uu_throttle_info = info;
    throttle_info_ref(info);
    DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info);

    oldValue = OSIncrementAtomic(&info->numthreads_throttling);
    if (oldValue < 0) {
        panic("%s: numthreads negative", __func__);
    }
    ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
    ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
    ut->uu_throttle_isssd = isssd;
    ut->uu_throttle_bc = BC_throttle;
}
static void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
{
    struct _throttle_io_info_t *info = throttle_info;
    struct uthread *ut;
    int policy;
    int is_throttleable_io = 0;
    int is_passive_io = 0;

    if (!lowpri_IO_initial_window_msecs || (info == NULL))
        return;
    policy = throttle_get_io_policy(&ut);

    switch (policy) {
    case IOPOL_DEFAULT:
    case IOPOL_NORMAL:
        break;
    case IOPOL_THROTTLE:
        is_throttleable_io = 1;
        break;
    case IOPOL_PASSIVE:
        is_passive_io = 1;
        break;
    default:
        printf("unknown I/O policy %d", policy);
        break;
    }

    if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
        is_passive_io = 1;

    if (!is_throttleable_io) {
        if (!is_passive_io) {
            microuptime(&info->last_normal_IO_timestamp);
        }
    } else if (ut) {
        /*
         * I'd really like to do the IOSleep here, but
         * we may be holding all kinds of filesystem related locks
         * and the pages for this I/O marked 'busy'...
         * we don't want to cause a normal task to block on
         * one of these locks while we're throttling a task marked
         * for low priority I/O... we'll mark the uthread and
         * do the delay just before we return from the system
         * call that triggered this I/O or from vnode_pagein
         */
        if (ut->uu_lowpri_window == 0)
            throttle_info_set_initial_window(ut, info, isssd, FALSE);
        else {
            /* The thread sends I/Os to different devices within the same system call */
            if (ut->uu_throttle_info != info) {
                struct _throttle_io_info_t *old_info = ut->uu_throttle_info;

                // keep track of the numthreads in the right device
                OSDecrementAtomic(&old_info->numthreads_throttling);
                OSIncrementAtomic(&info->numthreads_throttling);

                DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info);
                DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info);
                /* This thread no longer needs a reference on that throttle info */
                throttle_info_rel(ut->uu_throttle_info);
                ut->uu_throttle_info = info;
                /* Need to take a reference on this throttle info */
                throttle_info_ref(ut->uu_throttle_info);
            }
            int numthreads = MAX(1, info->numthreads_throttling);
            ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
            if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
                ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;

            if (isssd == FALSE) {
                /*
                 * we're here because we've actually issued I/Os to different devices...
                 * if at least one of them was a non-SSD, then throttle the thread
                 * using the policy for non-SSDs
                 */
                ut->uu_throttle_isssd = FALSE;
            }
        }
    }
}
/*
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
    throttle_info_update_internal(throttle_info, flags, FALSE);
}

/*
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
    void *throttle_info = throttle_info_handle;

    /* for now we only use the lowest bit of the throttle mask, so the
     * handle is the same as the throttle_info.  Later if we store a
     * set of throttle infos in the handle, we will want to loop through
     * them and call throttle_info_update in a loop
     */
    throttle_info_update(throttle_info, flags);
}
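/*
 * Illustrative sketch (not part of the original file): the bookkeeping call a
 * file system makes right before handing an I/O to the driver, per the
 * comments above.  example_fs_issue_io() and its arguments are hypothetical.
 */
#if 0
static void
example_fs_issue_io(mount_t mp, buf_t bp)
{
    if (mp->mnt_throttle_info != NULL)
        throttle_info_update(mp->mnt_throttle_info, buf_flags(bp));
    /* ... hand bp to the driver's strategy routine ... */
}
#endif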
extern int ignore_is_ssd;

int
spec_strategy(struct vnop_strategy_args *ap)
{
    buf_t bp;
    int bflags;
    int policy;
    dev_t bdev;
    uthread_t ut;
    mount_t mp;
    int strategy_ret;
    struct _throttle_io_info_t *throttle_info;
    boolean_t isssd = FALSE;

    bp = ap->a_bp;
    bdev = buf_device(bp);
    mp = buf_vnode(bp)->v_mount;

    policy = throttle_get_io_policy(&ut);

    if (policy == IOPOL_THROTTLE) {
        bp->b_flags |= B_THROTTLED_IO;
        bp->b_attr.ba_flags |= BA_THROTTLED_IO;
        bp->b_flags &= ~B_PASSIVE;
    } else if (policy == IOPOL_PASSIVE)
        bp->b_flags |= B_PASSIVE;

    bflags = bp->b_flags;

    if (kdebug_enable) {
        int code = 0;

        if (bflags & B_READ)
            code |= DKIO_READ;
        if (bflags & B_ASYNC)
            code |= DKIO_ASYNC;

        if (bflags & B_META)
            code |= DKIO_META;
        else if (bflags & B_PAGEIO)
            code |= DKIO_PAGING;

        if (bflags & B_THROTTLED_IO)
            code |= DKIO_THROTTLE;
        else if (bflags & B_PASSIVE)
            code |= DKIO_PASSIVE;

        KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
            bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
    }
    if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
        mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
        hard_throttle_on_root = 1;

    if (mp != NULL) {
        if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
            isssd = TRUE;
        throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
    } else
        throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

    throttle_info_update_internal(throttle_info, bflags, isssd);

    if ((bflags & B_READ) == 0) {
        microuptime(&throttle_info->last_IO_timestamp);
        if (mp) {
            INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
        }
    } else if (mp) {
        INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
    }
    /*
     * The BootCache may give us special information about
     * the IO, so it returns special values that we check
     * for here.
     *
     * IO_SATISFIED_BY_CACHE
     * The read has been satisfied by the boot cache. Don't
     * throttle the thread unnecessarily.
     *
     * IO_SHOULD_BE_THROTTLED
     * The boot cache is playing back a playlist and this IO
     * cut through. Throttle it so we're not cutting through
     * the boot cache too often.
     *
     * Note that typical strategy routines are defined with
     * a void return so we'll get garbage here. In the
     * unlikely case the garbage matches our special return
     * value, it's not a big deal since we're only adjusting
     * the throttling delay.
     */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
    typedef int strategy_fcn_ret_t(struct buf *bp);

    strategy_ret = (*(strategy_fcn_ret_t *)bdevsw[major(bdev)].d_strategy)(bp);

    if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
        /*
         * If this was a throttled IO satisfied by the boot cache,
         * don't delay the thread.
         */
        throttle_info_reset_window(ut);

    } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
        /*
         * If the boot cache indicates this IO should be throttled,
         * delay the thread.
         */
        throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
    }
    return (0);
}
/*
 * This is a noop, simply returning what one has been given.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
    return (ENOTSUP);
}
/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
    struct vnode *vp = ap->a_vp;
    dev_t dev = vp->v_rdev;
    int error = 0;
    int flags = ap->a_fflag;
    struct proc *p = vfs_context_proc(ap->a_context);
    struct session *sessp;

    switch (vp->v_type) {

    case VCHR:
        /*
         * Hack: a tty device that is a controlling terminal
         * has a reference from the session structure.
         * We cannot easily tell that a character device is
         * a controlling terminal, unless it is the closing
         * process' controlling terminal.  In that case,
         * if the reference count is 1 (this is the very
         * last close), clear the session's reference to it.
         */
        sessp = proc_session(p);
        if (sessp != SESSION_NULL) {
            if ((vcount(vp) == 1) &&
                (vp == sessp->s_ttyvp)) {

                session_lock(sessp);
                if (vp == sessp->s_ttyvp) {
                    sessp->s_ttyvp = NULL;
                    sessp->s_ttyvid = 0;
                    sessp->s_ttyp = TTY_NULL;
                    sessp->s_ttypgrpid = NO_PID;
                }
                session_unlock(sessp);
            }
            session_rele(sessp);
        }

        devsw_lock(dev, S_IFCHR);

        vp->v_specinfo->si_opencount--;

        if (vp->v_specinfo->si_opencount < 0) {
            panic("Negative open count?");
        }
        /*
         * close on last reference or on vnode revoke call
         */
        if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
            devsw_unlock(dev, S_IFCHR);
            return (0);
        }

        error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

        devsw_unlock(dev, S_IFCHR);
        break;

    case VBLK:
        /*
         * If there is more than one outstanding open, don't
         * send the close to the device.
         */
        devsw_lock(dev, S_IFBLK);
        if (vcount(vp) > 1) {
            vp->v_specinfo->si_opencount--;
            devsw_unlock(dev, S_IFBLK);
            return (0);
        }
        devsw_unlock(dev, S_IFBLK);

        /*
         * On last close of a block device (that isn't mounted)
         * we must invalidate any in core blocks, so that
         * we can, for instance, change floppy disks.
         */
        if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
            return (error);

        error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
        if (error)
            return (error);

        devsw_lock(dev, S_IFBLK);

        vp->v_specinfo->si_opencount--;

        if (vp->v_specinfo->si_opencount < 0) {
            panic("Negative open count?");
        }

        if (vcount(vp) > 0) {
            devsw_unlock(dev, S_IFBLK);
            return (0);
        }

        error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

        devsw_unlock(dev, S_IFBLK);
        break;

    default:
        panic("spec_close: not special");
        return (EBADF);
    }

    return error;
}
/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{
    switch (ap->a_name) {
    case _PC_LINK_MAX:
        *ap->a_retval = LINK_MAX;
        return (0);
    case _PC_MAX_CANON:
        *ap->a_retval = MAX_CANON;
        return (0);
    case _PC_MAX_INPUT:
        *ap->a_retval = MAX_INPUT;
        return (0);
    case _PC_PIPE_BUF:
        *ap->a_retval = PIPE_BUF;
        return (0);
    case _PC_CHOWN_RESTRICTED:
        *ap->a_retval = 200112;     /* _POSIX_CHOWN_RESTRICTED */
        return (0);
    case _PC_VDISABLE:
        *ap->a_retval = _POSIX_VDISABLE;
        return (0);
    default:
        return (EINVAL);
    }
    /* NOTREACHED */
}
/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{
    return (EBADF);
}
/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
    struct vnode *vp = ap->a_vp;

    switch (vp->v_type) {
    case VCHR:
        *ap->a_offset = (off_t)-1; /* failure */
        return (ENOTSUP);

    case VBLK:
        printf("spec_blktooff: not implemented for VBLK\n");
        *ap->a_offset = (off_t)-1; /* failure */
        return (ENOTSUP);

    default:
        panic("spec_blktooff type");
    }
    /* NOTREACHED */
    return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
    struct vnode *vp = ap->a_vp;

    switch (vp->v_type) {
    case VCHR:
        *ap->a_lblkno = (daddr64_t)-1; /* failure */
        return (ENOTSUP);

    case VBLK:
        printf("spec_offtoblk: not implemented for VBLK\n");
        *ap->a_lblkno = (daddr64_t)-1; /* failure */
        return (ENOTSUP);

    default:
        panic("spec_offtoblk type");
    }
    /* NOTREACHED */
    return (0);
}
static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
    .f_isfd = 1,
    .f_attach = filt_specattach,
    .f_detach = filt_specdetach,
    .f_event = filt_spec,
    .f_peek = filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
    switch (filter) {
    case EVFILT_READ:
        return FREAD;
    case EVFILT_WRITE:
        return FWRITE;
    default:
        panic("filt_to_seltype(): invalid filter %d\n", filter);
        return 0;
    }
}

static int
filt_specattach(struct knote *kn)
{
    vnode_t vp;
    dev_t dev;

    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

    assert(vnode_ischr(vp));

    dev = vnode_specrdev(vp);

    if (major(dev) > nchrdev) {
        return ENXIO;
    }

    if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
        return EINVAL;
    }

    /* Resulting wql is safe to unlink even if it has never been linked */
    kn->kn_hook = wait_queue_link_allocate();
    if (kn->kn_hook == NULL) {
        return EAGAIN;
    }

    kn->kn_fop = &spec_filtops;
    kn->kn_hookid = vnode_vid(vp);

    knote_markstayqueued(kn);

    return 0;
}

static void
filt_specdetach(struct knote *kn)
{
    kern_return_t ret;

    /*
     * Given wait queue link and wait queue set, unlink.  This is subtle.
     * If the device has been revoked from under us, selclearthread() will
     * have removed our link from the kqueue's wait queue set, which
     * wait_queue_set_unlink_one() will detect and handle.
     */
    ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
    if (ret != KERN_SUCCESS) {
        panic("filt_specdetach(): failed to unlink wait queue link.");
    }

    (void)wait_queue_link_free(kn->kn_hook);
    kn->kn_hook = NULL;
    kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
    vnode_t vp;
    uthread_t uth;
    wait_queue_set_t old_wqs;
    vfs_context_t ctx;
    int selres;
    int error;
    int use_offset;
    dev_t dev;
    uint64_t flags;

    assert(kn->kn_hook != NULL);

    if (hint != 0) {
        panic("filt_spec(): nonzero hint?");
    }

    uth = get_bsdthread_info(current_thread());
    ctx = vfs_context_current();
    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

    error = vnode_getwithvid(vp, kn->kn_hookid);
    if (error != 0) {
        kn->kn_flags |= (EV_EOF | EV_ONESHOT);
        return 1;
    }

    dev = vnode_specrdev(vp);
    flags = cdevsw_flags[major(dev)];
    use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
    assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

    /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
    old_wqs = uth->uu_wqset;
    uth->uu_wqset = kn->kn_kq->kq_wqs;
    selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
    uth->uu_wqset = old_wqs;

    if (use_offset) {
        if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
            kn->kn_data = 0;
        } else {
            kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
        }
    } else {
        kn->kn_data = selres;
    }

    vnode_put(vp);

    return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
    vnode_t vp;
    uthread_t uth;
    wait_queue_set_t old_wqs;
    vfs_context_t ctx;
    int error, selres;

    uth = get_bsdthread_info(current_thread());
    ctx = vfs_context_current();
    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

    error = vnode_getwithvid(vp, kn->kn_hookid);
    if (error != 0) {
        return 1; /* Just like VNOP_SELECT() on recycled vnode */
    }

    /*
     * Why pass the link here? Because we may not have registered in the past...
     */
    old_wqs = uth->uu_wqset;
    uth->uu_wqset = kn->kn_kq->kq_wqs;
    selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
    uth->uu_wqset = old_wqs;