/*
 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *    The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *    @(#)spec_vnops.c    8.14 (Berkeley) 5/21/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <sys/kdebug.h>
/* XXX following prototypes should be in a header file somewhere */
extern dev_t chrtoblk(dev_t dev);
extern int iskmemdev(dev_t dev);
extern int bpfkqfilter(dev_t dev, struct knote *kn);
extern int ptsd_kqfilter(dev_t dev, struct knote *kn);

extern int ignore_is_ssd;
struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char devopn[] = "devopn";
char devio[] = "devio";
char devwait[] = "devwait";
char devin[] = "devin";
char devout[] = "devout";
char devioc[] = "devioc";
char devcls[] = "devcls";
#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
    { &vnop_default_desc, (VOPFUNC)vn_default_error },
    { &vnop_lookup_desc, (VOPFUNC)spec_lookup },        /* lookup */
    { &vnop_create_desc, (VOPFUNC)err_create },         /* create */
    { &vnop_mknod_desc, (VOPFUNC)err_mknod },           /* mknod */
    { &vnop_open_desc, (VOPFUNC)spec_open },            /* open */
    { &vnop_close_desc, (VOPFUNC)spec_close },          /* close */
    { &vnop_access_desc, (VOPFUNC)spec_access },        /* access */
    { &vnop_getattr_desc, (VOPFUNC)spec_getattr },      /* getattr */
    { &vnop_setattr_desc, (VOPFUNC)spec_setattr },      /* setattr */
    { &vnop_read_desc, (VOPFUNC)spec_read },            /* read */
    { &vnop_write_desc, (VOPFUNC)spec_write },          /* write */
    { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },          /* ioctl */
    { &vnop_select_desc, (VOPFUNC)spec_select },        /* select */
    { &vnop_revoke_desc, (VOPFUNC)nop_revoke },         /* revoke */
    { &vnop_mmap_desc, (VOPFUNC)err_mmap },             /* mmap */
    { &vnop_fsync_desc, (VOPFUNC)spec_fsync },          /* fsync */
    { &vnop_remove_desc, (VOPFUNC)err_remove },         /* remove */
    { &vnop_link_desc, (VOPFUNC)err_link },             /* link */
    { &vnop_rename_desc, (VOPFUNC)err_rename },         /* rename */
    { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },           /* mkdir */
    { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },           /* rmdir */
    { &vnop_symlink_desc, (VOPFUNC)err_symlink },       /* symlink */
    { &vnop_readdir_desc, (VOPFUNC)err_readdir },       /* readdir */
    { &vnop_readlink_desc, (VOPFUNC)err_readlink },     /* readlink */
    { &vnop_inactive_desc, (VOPFUNC)nop_inactive },     /* inactive */
    { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },       /* reclaim */
    { &vnop_strategy_desc, (VOPFUNC)spec_strategy },    /* strategy */
    { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },    /* pathconf */
    { &vnop_advlock_desc, (VOPFUNC)err_advlock },       /* advlock */
    { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },        /* bwrite */
    { &vnop_pagein_desc, (VOPFUNC)err_pagein },         /* Pagein */
    { &vnop_pageout_desc, (VOPFUNC)err_pageout },       /* Pageout */
    { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },     /* Copyfile */
    { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },    /* blktooff */
    { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },    /* offtoblk */
    { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },    /* blockmap */
    { (struct vnodeop_desc *)NULL, (int (*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
    { &spec_vnodeop_p, spec_vnodeop_entries };
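
/*
 * Note: a vnode operation issued against a special vnode (e.g. VNOP_READ())
 * dispatches through spec_vnodeop_p and the entry table above to the
 * corresponding spec_* routine (spec_read in that case), while operations
 * that have no meaning for a device node (vnop_create, vnop_mkdir, ...)
 * resolve to the generic err_* stubs.
 */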
static void set_blocksize(vnode_t, dev_t);

#define THROTTLE_LEVEL_NONE       -1
#define THROTTLE_LEVEL_TIER0       0

#define THROTTLE_LEVEL_THROTTLED   1
#define THROTTLE_LEVEL_TIER1       1
#define THROTTLE_LEVEL_TIER2       2

#define THROTTLE_LEVEL_START       0
#define THROTTLE_LEVEL_END         2

struct _throttle_io_info_t {
    struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
    struct timeval throttle_last_write_timestamp;
    struct timeval throttle_start_IO_period_timestamp;

    TAILQ_HEAD( , uthread) throttle_uthlist;    /* List of throttled uthreads */

    lck_mtx_t     throttle_lock;
    thread_call_t throttle_timer_call;
    int32_t       throttle_timer_running;
    int32_t       throttle_io_count;
    int32_t       throttle_io_count_begin;
    int32_t       throttle_io_period;
    uint32_t      throttle_io_period_num;
    int32_t       throttle_refcnt;
    int32_t       throttle_alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd);
static int throttle_get_thread_throttle_level(uthread_t ut, int policy);

__private_extern__ int32_t throttle_legacy_process_count = 0;

/*
 * Trivial lookup routine that always fails.
 */
spec_lookup(struct vnop_lookup_args *ap)
set_blocksize(struct vnode *vp, dev_t dev)
{
    if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
        rsize = (*size)(dev);
        if (rsize <= 0)        /* did size fail? */
            vp->v_specsize = DEV_BSIZE;
        else
            vp->v_specsize = rsize;
    } else
        vp->v_specsize = DEV_BSIZE;
}

set_fsblocksize(struct vnode *vp)
{
    if (vp->v_type == VBLK) {
        dev_t dev = (dev_t)vp->v_rdev;
        int maj = major(dev);

        if ((u_int)maj >= (u_int)nblkdev)

        set_blocksize(vp, dev);
/*
 * Open a special file.
 */
spec_open(struct vnop_open_args *ap)
{
    struct proc *p = vfs_context_proc(ap->a_context);
    kauth_cred_t cred = vfs_context_ucred(ap->a_context);
    struct vnode *vp = ap->a_vp;
    dev_t bdev, dev = (dev_t)vp->v_rdev;
    int maj = major(dev);

    /*
     * Don't allow open if fs is mounted -nodev.
     */
    if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))

    switch (vp->v_type) {

        if ((u_int)maj >= (u_int)nchrdev)

        if (cred != FSCRED && (ap->a_mode & FWRITE)) {
            /*
             * When running in very secure mode, do not allow
             * opens for writing of any disk character devices.
             */
            if (securelevel >= 2 && isdisk(dev, VCHR))

            /*
             * When running in secure mode, do not allow opens
             * for writing of /dev/mem, /dev/kmem, or character
             * devices whose corresponding block devices are
             * currently mounted.
             */
            if (securelevel >= 1) {
                if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))

        devsw_lock(dev, S_IFCHR);
        error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

            vp->v_specinfo->si_opencount++;

        devsw_unlock(dev, S_IFCHR);

        if (error == 0 && (D_TYPEMASK & cdevsw[maj].d_type) == D_DISK && !vp->v_un.vu_specinfo->si_initted) {

            uint64_t throttle_mask = 0;
            uint32_t devbsdunit = 0;

            if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {

                if (throttle_mask != 0 &&
                    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
                    /*
                     * as a reasonable approximation, only use the lowest bit of the mask
                     * to generate a disk unit number
                     */
                    devbsdunit = num_trailing_0(throttle_mask);

                vp->v_un.vu_specinfo->si_isssd = isssd;
                vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
                vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
                vp->v_un.vu_specinfo->si_throttleable = 1;
                vp->v_un.vu_specinfo->si_initted = 1;

            if (vp->v_un.vu_specinfo->si_initted == 0) {

                vp->v_un.vu_specinfo->si_initted = 1;

        if ((u_int)maj >= (u_int)nblkdev)

        /*
         * When running in very secure mode, do not allow
         * opens for writing of any disk block devices.
         */
        if (securelevel >= 2 && cred != FSCRED &&
            (ap->a_mode & FWRITE) && isdisk(dev, VBLK))

        /*
         * Do not allow opens of block devices that are
         * currently mounted.
         */
        if ( (error = vfs_mountedon(vp)) )

        devsw_lock(dev, S_IFBLK);
        error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);

            vp->v_specinfo->si_opencount++;

        devsw_unlock(dev, S_IFBLK);

            u_int32_t size512 = 512;

            if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
                /* Switch to 512 byte sectors (temporarily) */

                if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
                    /* Get the number of 512 byte physical blocks. */
                    if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {

                /* If it doesn't set back, we can't recover */
                if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))

            set_blocksize(vp, dev);

            /*
             * Cache the size in bytes of the block device for later
             * use by spec_write().
             */
            vp->v_specdevsize = blkcnt * (u_int64_t)size512;

            vp->v_specdevsize = (u_int64_t)0;    /* Default: Can't get */

        panic("spec_open type");
spec_read(struct vnop_read_args *ap)
{
    struct vnode *vp = ap->a_vp;
    struct uio *uio = ap->a_uio;
    daddr64_t bn, nextbn;

    if (uio->uio_rw != UIO_READ)
        panic("spec_read mode");
    if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
        panic("spec_read proc");

    if (uio_resid(uio) == 0)

    switch (vp->v_type) {

        if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
            struct _throttle_io_info_t *throttle_info;

            throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

            throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);

        error = (*cdevsw[major(vp->v_rdev)].d_read)
            (vp->v_rdev, uio, ap->a_ioflag);

        if (uio->uio_offset < 0)

        devBlockSize = vp->v_specsize;

        if (devBlockSize > PAGE_SIZE)

        bscale = PAGE_SIZE / devBlockSize;
        bsize = bscale * devBlockSize;

        on = uio->uio_offset % bsize;

        bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

        if (vp->v_speclastr + bscale == bn) {
            nextbn = bn + bscale;
            error = buf_breadn(vp, bn, (int)bsize, &nextbn,
                (int *)&bsize, 1, NOCRED, &bp);
        } else
            error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

        vp->v_speclastr = bn;

        n = bsize - buf_resid(bp);
        if ((on > n) || error) {

        n = min((unsigned)(n - on), uio_resid(uio));

        error = uiomove((char *)buf_dataptr(bp) + on, n, uio);

    } while (error == 0 && uio_resid(uio) > 0 && n != 0);

        panic("spec_read type");
spec_write(struct vnop_write_args *ap)
{
    struct vnode *vp = ap->a_vp;
    struct uio *uio = ap->a_uio;
    int bsize, blkmask, bscale;

    if (uio->uio_rw != UIO_WRITE)
        panic("spec_write mode");
    if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
        panic("spec_write proc");

    switch (vp->v_type) {

        if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
            struct _throttle_io_info_t *throttle_info;

            throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

            throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);

            microuptime(&throttle_info->throttle_last_write_timestamp);

        error = (*cdevsw[major(vp->v_rdev)].d_write)
            (vp->v_rdev, uio, ap->a_ioflag);

        if (uio_resid(uio) == 0)

        if (uio->uio_offset < 0)

        io_sync = (ap->a_ioflag & IO_SYNC);

        devBlockSize = vp->v_specsize;
        if (devBlockSize > PAGE_SIZE)

        bscale = PAGE_SIZE / devBlockSize;
        blkmask = bscale - 1;
        bsize = bscale * devBlockSize;

        bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
        on = uio->uio_offset % bsize;

        n = min((unsigned)(bsize - on), uio_resid(uio));

        /*
         * Use buf_getblk() as an optimization IFF:
         *
         * 1) We are writing exactly a block on a block aligned boundary
         * 2) We know the size of the device from spec_open
         * 3) The read doesn't span the end of the device
         *
         * Otherwise, we fall back on buf_bread().
         */
            vp->v_specdevsize != (u_int64_t)0 &&
            (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
            /* reduce the size of the read to what is there */
            n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;

            bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);

            error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

        /* Translate downstream error for upstream, if needed */
            error = (int)buf_error(bp);

        n = min(n, bsize - buf_resid(bp));

        error = uiomove((char *)buf_dataptr(bp) + on, n, uio);

            error = buf_bwrite(bp);

            if ((n + on) == bsize)
                error = buf_bawrite(bp);
            else
                error = buf_bdwrite(bp);

    } while (error == 0 && uio_resid(uio) > 0 && n != 0);

        panic("spec_write type");
/*
 * Device ioctl operation.
 */
spec_ioctl(struct vnop_ioctl_args *ap)
{
    proc_t p = vfs_context_proc(ap->a_context);
    dev_t dev = ap->a_vp->v_rdev;

    KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
        (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

    switch (ap->a_vp->v_type) {

        retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
            ap->a_fflag, p);

        if (ap->a_command == DKIOCUNMAP) {

            unmap = (dk_unmap_t *)ap->a_data;
            extent = unmap->extents;

            for (i = 0; i < unmap->extentsCount; i++, extent++) {
                KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);

        retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);

    KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
        (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
spec_select(struct vnop_select_args *ap)
{
    proc_t p = vfs_context_proc(ap->a_context);

    switch (ap->a_vp->v_type) {

        return (1);        /* XXX */

        dev = ap->a_vp->v_rdev;
        return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
static int filt_specattach(struct knote *kn);

spec_kqfilter(vnode_t vp, struct knote *kn)
{
    /*
     * For a few special kinds of devices, we can attach knotes.
     * Each filter function must check whether the dev type matches it.
     */
    dev = vnode_specrdev(vp);

    if (vnode_istty(vp)) {
        /* We can hook into TTYs... */
        err = filt_specattach(kn);
    } else {
        /* Try a bpf device, as defined in bsd/net/bpf.c */
        err = bpfkqfilter(dev, kn);
    }
/*
 * Synch buffers associated with a block device
 */
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
    if (vp->v_type == VCHR)

    /*
     * Flush all dirty buffers associated with a block device.
     */
    buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

spec_fsync(struct vnop_fsync_args *ap)
{
    return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

void throttle_init(void);

#define LOWPRI_THROTTLE_WINDOW_MSECS          500
#define LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS   200
#define LOWPRI_IO_PERIOD_MSECS                200
#define LOWPRI_IO_PERIOD_SSD_MSECS             20
#define LOWPRI_TIMER_PERIOD_MSECS              10

int lowpri_throttle_window_msecs = LOWPRI_THROTTLE_WINDOW_MSECS;
int lowpri_legacy_throttle_window_msecs = LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS;
int lowpri_io_period_msecs = LOWPRI_IO_PERIOD_MSECS;
int lowpri_io_period_ssd_msecs = LOWPRI_IO_PERIOD_SSD_MSECS;
int lowpri_timer_period_msecs = LOWPRI_TIMER_PERIOD_MSECS;

/*
 * If a process requiring legacy iothrottle behavior is running on the
 * system, use legacy limits for throttle window and max IO size.
 */
#define THROTTLE_WINDOW (lowpri_throttle_window_msecs)

#define THROTTLE_WINDOW (throttle_legacy_process_count == 0 ? lowpri_throttle_window_msecs : lowpri_legacy_throttle_window_msecs)
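
/*
 * Worked example with the default tunables above: when no legacy-throttle
 * process is running (throttle_legacy_process_count == 0), THROTTLE_WINDOW
 * evaluates to lowpri_throttle_window_msecs (500 ms); if at least one such
 * process is running, it drops to lowpri_legacy_throttle_window_msecs
 * (200 ms). A throttled I/O seen within that window keeps lower-priority
 * threads subject to throttling.
 */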
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)    \
    if ((debug_info)->alloc)                                       \
        printf("%s: "format, __FUNCTION__, ## args);

#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_legacy_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_ssd_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_timer_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_timer_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_process_count, CTLFLAG_RD | CTLFLAG_LOCKED, &throttle_legacy_process_count, 0, "");
static lck_grp_t *throttle_mtx_grp;
static lck_attr_t *throttle_mtx_attr;
static lck_grp_attr_t *throttle_mtx_grp_attr;

/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
num_trailing_0(uint64_t n)
{
    /*
     * since in most cases the number of trailing 0s is very small,
     * we simply count sequentially from the lowest bit
     */
    if (n == 0)
        return sizeof(n) * 8;

    while (!ISSET(n, 1)) {
/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
throttle_info_rel(struct _throttle_io_info_t *info)
{
    SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);

    DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
        info, (int)(oldValue - 1), info);

    /* The reference count just went negative, very bad */
        panic("throttle info ref cnt went negative!");

    /*
     * Once the reference count is zero, no one else should be able to take a
     * reference.
     */
    if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
        DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);

        lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);

/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
throttle_info_ref(struct _throttle_io_info_t *info)
{
    SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);

    DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
        info, (int)(oldValue - 1), info);

    /* Allocated items should never have a reference of zero */
    if (info->throttle_alloc && (oldValue == 0))
        panic("Taking a reference without calling create throttle info!\n");
/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 */
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count)
{
    struct timeval elapsed;
    uint64_t elapsed_msecs;

    if (update_io_count == TRUE) {
        info->throttle_io_count_begin = info->throttle_io_count;
        info->throttle_io_period_num++;

        microuptime(&info->throttle_start_IO_period_timestamp);

    for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {

        microuptime(&elapsed);
        timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
        elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

        if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW) {
            /*
             * we had an I/O occur in this level within
             * our throttle window, so we need to
             * make sure the timer continues to run
             */

    if (throttle_level >= THROTTLE_LEVEL_END) {
        /*
         * we're outside all of the throttle windows...
         * don't start a new timer
         */
        info->throttle_timer_running = 0;

        return (THROTTLE_LEVEL_END);

    if (info->throttle_timer_running == 0) {
        /*
         * take a reference for the timer
         */
        throttle_info_ref(info);

        info->throttle_timer_running = 1;

    clock_interval_to_deadline(lowpri_timer_period_msecs, 1000000, &deadline);

    thread_call_enter_delayed(info->throttle_timer_call, deadline);

    return (throttle_level);
throttle_timer(struct _throttle_io_info_t *info)
{
    uthread_t ut, utlist;
    struct timeval elapsed;
    uint64_t elapsed_msecs;

    boolean_t update_io_count = FALSE;
    boolean_t need_wakeup = FALSE;
    boolean_t need_release = FALSE;

    lck_mtx_lock(&info->throttle_lock);

    microuptime(&elapsed);
    timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp);
    elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

    if (elapsed_msecs >= (uint64_t)info->throttle_io_period) {
        /*
         * we're closing out the current IO period...
         * if we have a waiting thread, wake it up
         * after we have reset the I/O window info
         */
        update_io_count = TRUE;

    if ((throttle_level = throttle_timer_start(info, update_io_count)) == THROTTLE_LEVEL_END) {
        /*
         * we are now outside of the throttle window
         * for all throttle levels...
         *
         * the timer is not restarted in this case, so
         * we need to get rid of the reference we took when
         * we started up the timer... we can't do this
         * until we are entirely done playing with 'info'
         */

    TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist, uu_throttlelist, utlist) {
        /*
         * if we are now outside of the throttle window release
         * all of the currently blocked threads, otherwise
         * look for threads that have had their IO policy changed
         * by someone else and are no longer throttleable, or are
         * not at the current throttle level and unblock them
         */
        if (throttle_level == THROTTLE_LEVEL_END || throttle_get_thread_throttle_level(ut, -1) <= throttle_level) {

            TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
            ut->uu_on_throttlelist = 0;

            wakeup(&ut->uu_on_throttlelist);

    if (need_wakeup && !TAILQ_EMPTY(&info->throttle_uthlist)) {
        /*
         * we've entered a new I/O period and we're still
         * in the throttle window, so wakeup the next guy in line
         */
        ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist);
        TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
        ut->uu_on_throttlelist = 0;

        wakeup(&ut->uu_on_throttlelist);

    lck_mtx_unlock(&info->throttle_lock);

    if (need_release == TRUE)
        throttle_info_rel(info);
void
throttle_init(void)
{
    struct _throttle_io_info_t *info;

    /*
     * allocate lock group attribute and group
     */
    throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
    throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);

    /*
     * allocate the lock attribute
     */
    throttle_mtx_attr = lck_attr_alloc_init();

    for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
        info = &_throttle_io_info[i];

        lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
        info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

        TAILQ_INIT(&info->throttle_uthlist);
/*
 * wakeup and remove the specified thread from the throttle queue
 * if it's no longer in a throttleable state...
 * takes a valid uthread (which may or may not be on the
 * throttle queue) as input
 */
unthrottle_thread(uthread_t ut)
{
    struct _throttle_io_info_t *info;

    if ((info = ut->uu_throttle_info) == NULL)

    lck_mtx_lock(&info->throttle_lock);

    if (ut->uu_on_throttlelist && throttle_get_thread_throttle_level(ut, -1) <= THROTTLE_LEVEL_THROTTLED) {
        TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
        ut->uu_on_throttlelist = 0;

        wakeup(&ut->uu_on_throttlelist);

    lck_mtx_unlock(&info->throttle_lock);
/*
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
throttle_info_create(void)
{
    struct _throttle_io_info_t *info;

    MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
    /* Should never happen but just in case */

    /* Mark that this one was allocated and needs to be freed */
    DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
    info->throttle_alloc = TRUE;

    lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
    info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

    TAILQ_INIT(&info->throttle_uthlist);

    /* Take a reference */
    OSIncrementAtomic(&info->throttle_refcnt);

/*
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create.
 */
throttle_info_release(void *throttle_info)
{
    DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
        (struct _throttle_io_info_t *)throttle_info,
        (struct _throttle_io_info_t *)throttle_info);
    if (throttle_info)        /* Just to be careful */
        throttle_info_rel(throttle_info);
/*
 * File Systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File Systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
    if ((throttle_info == NULL) || (mp == NULL))

    throttle_info_ref(throttle_info);

    /*
     * We already have a reference; release it before adding the new one
     */
    if (mp->mnt_throttle_info)
        throttle_info_rel(mp->mnt_throttle_info);
    mp->mnt_throttle_info = throttle_info;

/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask. The
 * handle must be released by throttle_info_rel_by_mask
 */
throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
{
    struct _throttle_io_info_t *info;

    if (throttle_info_handle == NULL)

    dev_index = num_trailing_0(throttle_mask);
    info = &_throttle_io_info[dev_index];
    throttle_info_ref(info);
    *(struct _throttle_io_info_t **)throttle_info_handle = info;

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
    /*
     * for now the handle is just a pointer to _throttle_io_info_t
     */
    throttle_info_rel((struct _throttle_io_info_t *)throttle_info_handle);

/*
 * File Systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
throttle_info_mount_rel(mount_t mp)
{
    if (mp->mnt_throttle_info)
        throttle_info_rel(mp->mnt_throttle_info);
    mp->mnt_throttle_info = NULL;
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
    struct _throttle_io_info_t *info;

    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    *tv = info->throttle_last_write_timestamp;

update_last_io_time(mount_t mp)
{
    struct _throttle_io_info_t *info;

    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    microuptime(&info->throttle_last_write_timestamp);
throttle_get_io_policy(uthread_t *ut)
{
    *ut = get_bsdthread_info(current_thread());

    return (proc_get_task_selfdiskacc());

throttle_get_thread_throttle_level(uthread_t ut, int policy)
{
    int thread_throttle_level = THROTTLE_LEVEL_NONE;

        ut = get_bsdthread_info(current_thread());

        policy = proc_get_diskacc(ut->uu_thread);

        thread_throttle_level = THROTTLE_LEVEL_TIER0;

        if (ut->uu_throttle_bc == TRUE)
            thread_throttle_level = THROTTLE_LEVEL_TIER2;

    case IOPOL_THROTTLE:
        thread_throttle_level = THROTTLE_LEVEL_TIER2;

        thread_throttle_level = THROTTLE_LEVEL_TIER1;

        printf("unknown I/O policy %d", policy);

    return (thread_throttle_level);
throttle_io_will_be_throttled_internal(void * throttle_info)
{
    struct _throttle_io_info_t *info = throttle_info;
    struct timeval elapsed;
    uint64_t elapsed_msecs;
    int thread_throttle_level;

    if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL, -1)) < THROTTLE_LEVEL_THROTTLED)

    for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

        microuptime(&elapsed);
        timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
        elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

        if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)

    if (throttle_level >= thread_throttle_level) {
        /*
         * we're beyond all of the throttle windows
         * that affect the throttle level of this thread,
         * so go ahead and treat as normal I/O
         */

    if (info->throttle_io_count != info->throttle_io_count_begin) {
        /*
         * we've already issued at least one throttleable I/O
         * in the current I/O window, so avoid issuing another one
         */

    /*
     * we're in the throttle window, so
     * cut the I/O size back
     */

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
{
    /*
     * Should we just return zero if no mount point
     */
    if (mp == NULL)
        info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
    else if (mp->mnt_throttle_info == NULL)
        info = &_throttle_io_info[mp->mnt_devbsdunit];
    else
        info = mp->mnt_throttle_info;

    return throttle_io_will_be_throttled_internal(info);
throttle_lowpri_io(int sleep_amount)
{
    struct _throttle_io_info_t *info;
    int throttle_type = 0;
    uint32_t throttle_io_period_num = 0;
    boolean_t insert_tail = TRUE;

    ut = get_bsdthread_info(current_thread());

    if (ut->uu_lowpri_window == 0)

    info = ut->uu_throttle_info;

    if ((sleep_amount == 0) || (info == NULL))

    if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)

    throttle_io_period_num = info->throttle_io_period_num;

    while ( (throttle_type = throttle_io_will_be_throttled_internal(info)) ) {

        if (throttle_type == 1) {
            if (sleep_amount == 0)

            if (info->throttle_io_period_num < throttle_io_period_num)

            if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)

        lck_mtx_lock(&info->throttle_lock);

        if (info->throttle_timer_running == 0) {
            /*
             * try to start the timer since it's
             * currently not running. on failure, no
             * timer reference to drop since it wasn't started
             */
            if (throttle_timer_start(info, TRUE) == THROTTLE_LEVEL_END)

        if (sleep_cnt == 0) {
            KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
                ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);

        if (ut->uu_on_throttlelist == 0) {
            if (insert_tail == TRUE)
                TAILQ_INSERT_TAIL(&info->throttle_uthlist, ut, uu_throttlelist);
            else
                TAILQ_INSERT_HEAD(&info->throttle_uthlist, ut, uu_throttlelist);

            ut->uu_on_throttlelist = 1;

        msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);

        if (sleep_amount == 0)
            insert_tail = FALSE;
        else if (info->throttle_io_period_num < throttle_io_period_num ||
            (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
            insert_tail = FALSE;

    if (ut->uu_on_throttlelist) {

        lck_mtx_lock(&info->throttle_lock);

        if (ut->uu_on_throttlelist) {
            TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);

            ut->uu_on_throttlelist = 0;

        lck_mtx_unlock(&info->throttle_lock);

    KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
        ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);

    throttle_info_rel(info);

    ut->uu_throttle_info = NULL;
    ut->uu_throttle_bc = FALSE;
    ut->uu_lowpri_window = 0;
/*
 * set a kernel thread's IO policy. policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
    proc_apply_thread_selfdiskacc(policy);
}
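
/*
 * Example: a kernel thread doing background maintenance I/O could lower its
 * own tier with
 *
 *    throttle_set_thread_io_policy(IOPOL_THROTTLE);
 *
 * (illustrative call only; the policy constants are documented in
 * setiopolicy_np(2)).
 */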
void throttle_info_reset_window(uthread_t ut)
{
    struct _throttle_io_info_t *info;

    if ( (info = ut->uu_throttle_info) ) {
        throttle_info_rel(info);

        ut->uu_throttle_info = NULL;
        ut->uu_lowpri_window = 0;
        ut->uu_throttle_bc = FALSE;
    }
}

void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle)
{
    if (ut->uu_throttle_info == NULL) {

        ut->uu_throttle_info = info;
        throttle_info_ref(info);
        DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info);

    ut->uu_lowpri_window = THROTTLE_WINDOW;
    ut->uu_throttle_bc = BC_throttle;
void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd)
{
    int thread_throttle_level;

    if (THROTTLE_WINDOW == 0)

        ut = get_bsdthread_info(current_thread());

    thread_throttle_level = throttle_get_thread_throttle_level(ut, policy);

    if (thread_throttle_level == THROTTLE_LEVEL_TIER0 && ISSET(flags, B_PASSIVE))
        thread_throttle_level = THROTTLE_LEVEL_NONE;

    if (thread_throttle_level != THROTTLE_LEVEL_NONE)
        microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);

    if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
        /*
         * I'd really like to do the IOSleep here, but
         * we may be holding all kinds of filesystem related locks
         * and the pages for this I/O marked 'busy'...
         * we don't want to cause a normal task to block on
         * one of these locks while we're throttling a task marked
         * for low priority I/O... we'll mark the uthread and
         * do the delay just before we return from the system
         * call that triggered this I/O or from vnode_pagein
         */
        if (info->throttle_io_period == 0) {

            if (isssd == TRUE)
                info->throttle_io_period = lowpri_io_period_ssd_msecs;
            else
                info->throttle_io_period = lowpri_io_period_msecs;

            if (info->throttle_io_period < lowpri_timer_period_msecs)
                info->throttle_io_period = lowpri_timer_period_msecs;
        }
        OSAddAtomic(1, &info->throttle_io_count);

        throttle_info_set_initial_window(ut, info, FALSE);
void throttle_info_update_by_mount(mount_t mp)
{
    struct _throttle_io_info_t *info;
    boolean_t isssd = FALSE;

    ut = get_bsdthread_info(current_thread());

    if (ut->uu_lowpri_window)

    if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
        isssd = TRUE;
    info = &_throttle_io_info[mp->mnt_devbsdunit];

    info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

    if (info->throttle_io_period == 0) {

        if (isssd == TRUE)
            info->throttle_io_period = lowpri_io_period_ssd_msecs;
        else
            info->throttle_io_period = lowpri_io_period_msecs;

        if (info->throttle_io_period < lowpri_timer_period_msecs)
            info->throttle_io_period = lowpri_timer_period_msecs;
    }
    throttle_info_set_initial_window(ut, info, FALSE);

/*
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
        throttle_info_update_internal(throttle_info, NULL, -1, flags, FALSE);

/*
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
    void *throttle_info = throttle_info_handle;

    /*
     * for now we only use the lowest bit of the throttle mask, so the
     * handle is the same as the throttle_info. Later if we store a
     * set of throttle infos in the handle, we will want to loop through
     * them and call throttle_info_update in a loop
     */
    throttle_info_update(throttle_info, flags);
int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
{
    struct _throttle_io_info_t *info = throttle_info;
    struct timeval elapsed;
    uint64_t elapsed_msecs;
    int thread_throttle_level;

    case IOPOL_THROTTLE:
        thread_throttle_level = THROTTLE_LEVEL_TIER2;

        thread_throttle_level = THROTTLE_LEVEL_TIER1;

        thread_throttle_level = THROTTLE_LEVEL_TIER0;

    for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

        microuptime(&elapsed);
        timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
        elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

        if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)

    if (throttle_level >= thread_throttle_level) {
        /*
         * we're beyond all of the throttle windows
         * so go ahead and treat as normal I/O
         */

    /*
     * we're in the throttle window
     */

throttle_legacy_process_incr(void)
{
    OSIncrementAtomic(&throttle_legacy_process_count);
}

throttle_legacy_process_decr(void)
{
    OSDecrementAtomic(&throttle_legacy_process_count);
}
spec_strategy(struct vnop_strategy_args *ap)
{
    struct _throttle_io_info_t *throttle_info;
    boolean_t isssd = FALSE;
#if !CONFIG_EMBEDDED
    proc_t curproc = current_proc();
#endif /* !CONFIG_EMBEDDED */

    bdev = buf_device(bp);
    mp = buf_vnode(bp)->v_mount;

    policy = throttle_get_io_policy(&ut);

    if (bp->b_flags & B_META)
        bp->b_attr.ba_flags |= BA_META;

    if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY) {
        bp->b_flags |= B_THROTTLED_IO;
        bp->b_attr.ba_flags |= BA_THROTTLED_IO;
        bp->b_flags &= ~B_PASSIVE;
    } else if (policy == IOPOL_PASSIVE)
        bp->b_flags |= B_PASSIVE;

#if !CONFIG_EMBEDDED
    if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
        bp->b_attr.ba_flags |= BA_DELAYIDLESLEEP;
#endif /* !CONFIG_EMBEDDED */

    bflags = bp->b_flags;

    if (kdebug_enable) {

        if (bflags & B_READ)

        if (bflags & B_ASYNC)

        if (bflags & B_META)

        else if (bflags & B_PAGEIO)
            code |= DKIO_PAGING;

        if (bflags & B_THROTTLED_IO)
            code |= DKIO_THROTTLE;
        else if (bflags & B_PASSIVE)
            code |= DKIO_PASSIVE;

        if (bp->b_attr.ba_flags & BA_NOCACHE)
            code |= DKIO_NOCACHE;

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
            bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
    }
    if (((bflags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
        mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
        hard_throttle_on_root = 1;

    if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
        isssd = TRUE;
    throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];

    throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

    throttle_info_update_internal(throttle_info, ut, policy, bflags, isssd);

    if ((bflags & B_READ) == 0) {
        microuptime(&throttle_info->throttle_last_write_timestamp);

        INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);

        INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);

    /*
     * The BootCache may give us special information about
     * the IO, so it returns special values that we check
     * for here.
     *
     * IO_SATISFIED_BY_CACHE
     * The read has been satisfied by the boot cache. Don't
     * throttle the thread unnecessarily.
     *
     * IO_SHOULD_BE_THROTTLED
     * The boot cache is playing back a playlist and this IO
     * cut through. Throttle it so we're not cutting through
     * the boot cache too often.
     *
     * Note that typical strategy routines are defined with
     * a void return so we'll get garbage here. In the
     * unlikely case the garbage matches our special return
     * value, it's not a big deal since we're only adjusting
     * the throttling delay.
     */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
    typedef int strategy_fcn_ret_t(struct buf *bp);

    strategy_ret = (*(strategy_fcn_ret_t *)bdevsw[major(bdev)].d_strategy)(bp);

    if (IO_SATISFIED_BY_CACHE == strategy_ret) {
        /*
         * If this was a throttled IO satisfied by the boot cache,
         * don't delay the thread.
         */
        throttle_info_reset_window(ut);

    } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
        /*
         * If the boot cache indicates this IO should be throttled,
         * delay the thread.
         */
        throttle_info_set_initial_window(ut, throttle_info, TRUE);
    }
/*
 * This is a noop, simply returning what one has been given.
 */
spec_blockmap(__unused struct vnop_blockmap_args *ap)

/*
 * Device close routine
 */
spec_close(struct vnop_close_args *ap)
{
    struct vnode *vp = ap->a_vp;
    dev_t dev = vp->v_rdev;
    int flags = ap->a_fflag;
    struct proc *p = vfs_context_proc(ap->a_context);
    struct session *sessp;

    switch (vp->v_type) {

        /*
         * Hack: a tty device that is a controlling terminal
         * has a reference from the session structure.
         * We cannot easily tell that a character device is
         * a controlling terminal, unless it is the closing
         * process' controlling terminal. In that case,
         * if the reference count is 1 (this is the very
         * last close)
         */
        sessp = proc_session(p);
        if (sessp != SESSION_NULL) {
            if (vp == sessp->s_ttyvp && vcount(vp) == 1) {

                session_lock(sessp);
                if (vp == sessp->s_ttyvp) {
                    tp = SESSION_TP(sessp);
                    sessp->s_ttyvp = NULL;
                    sessp->s_ttyvid = 0;
                    sessp->s_ttyp = TTY_NULL;
                    sessp->s_ttypgrpid = NO_PID;
                }
                session_unlock(sessp);

            session_rele(sessp);
        }
        devsw_lock(dev, S_IFCHR);

        if (--vp->v_specinfo->si_opencount < 0)
            panic("negative open count (c, %u, %u)", major(dev), minor(dev));

        /*
         * close always, or close on last reference, or close on revoke
         */
        if ((D_TRACKCLOSE & cdevsw[major(dev)].d_type) != 0 ||
            vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
            error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

        devsw_unlock(dev, S_IFCHR);

        /*
         * If there is more than one outstanding open, don't
         * send the close to the device.
         */
        devsw_lock(dev, S_IFBLK);
        if (vcount(vp) > 1) {
            vp->v_specinfo->si_opencount--;
            devsw_unlock(dev, S_IFBLK);
        }
        devsw_unlock(dev, S_IFBLK);

        /*
         * On last close of a block device (that isn't mounted)
         * we must invalidate any in core blocks, so that
         * we can, for instance, change floppy disks.
         */
        if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))

        error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);

        devsw_lock(dev, S_IFBLK);

        if (--vp->v_specinfo->si_opencount < 0)
            panic("negative open count (b, %u, %u)", major(dev), minor(dev));

        if (vcount(vp) == 0)
            error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

        devsw_unlock(dev, S_IFBLK);

        panic("spec_close: not special");
/*
 * Return POSIX pathconf information applicable to special devices.
 */
spec_pathconf(struct vnop_pathconf_args *ap)
{
    switch (ap->a_name) {

        *ap->a_retval = LINK_MAX;

        *ap->a_retval = MAX_CANON;

        *ap->a_retval = MAX_INPUT;

        *ap->a_retval = PIPE_BUF;

    case _PC_CHOWN_RESTRICTED:
        *ap->a_retval = 200112;        /* _POSIX_CHOWN_RESTRICTED */

        *ap->a_retval = _POSIX_VDISABLE;

/*
 * Special device failed operation
 */
spec_ebadf(__unused void *dummy)
/* Blktooff derives file offset from logical block number */
spec_blktooff(struct vnop_blktooff_args *ap)
{
    struct vnode *vp = ap->a_vp;

    switch (vp->v_type) {

        *ap->a_offset = (off_t)-1;        /* failure */

        printf("spec_blktooff: not implemented for VBLK\n");
        *ap->a_offset = (off_t)-1;        /* failure */

        panic("spec_blktooff type");

/* Offtoblk derives logical block number from file offset */
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
    struct vnode *vp = ap->a_vp;

    switch (vp->v_type) {

        *ap->a_lblkno = (daddr64_t)-1;        /* failure */

        printf("spec_offtoblk: not implemented for VBLK\n");
        *ap->a_lblkno = (daddr64_t)-1;        /* failure */

        panic("spec_offtoblk type");
static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
    .f_attach = filt_specattach,
    .f_detach = filt_specdetach,
    .f_event = filt_spec,
    .f_peek = filt_specpeek
};

filter_to_seltype(int16_t filter)
{
        panic("filt_to_seltype(): invalid filter %d\n", filter);

filt_specattach(struct knote *kn)
{
    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;    /* Already have iocount, and vnode is alive */

    assert(vnode_ischr(vp));

    dev = vnode_specrdev(vp);

    if (major(dev) > nchrdev) {

    if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {

    /* Resulting wql is safe to unlink even if it has never been linked */
    kn->kn_hook = wait_queue_link_allocate();
    if (kn->kn_hook == NULL) {

    kn->kn_fop = &spec_filtops;
    kn->kn_hookid = vnode_vid(vp);

    knote_markstayqueued(kn);
filt_specdetach(struct knote *kn)
{
    /*
     * Given wait queue link and wait queue set, unlink. This is subtle.
     * If the device has been revoked from under us, selclearthread() will
     * have removed our link from the kqueue's wait queue set, which
     * wait_queue_set_unlink_one() will detect and handle.
     */
    ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
    if (ret != KERN_SUCCESS) {
        panic("filt_specdetach(): failed to unlink wait queue link.");
    }
    (void)wait_queue_link_free(kn->kn_hook);

    kn->kn_status &= ~KN_STAYQUEUED;

filt_spec(struct knote *kn, long hint)
{
    wait_queue_set_t old_wqs;

    assert(kn->kn_hook != NULL);

        panic("filt_spec(): nonzero hint?");

    uth = get_bsdthread_info(current_thread());
    ctx = vfs_context_current();
    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

    error = vnode_getwithvid(vp, kn->kn_hookid);
        kn->kn_flags |= (EV_EOF | EV_ONESHOT);

    dev = vnode_specrdev(vp);
    flags = cdevsw_flags[major(dev)];
    use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
    assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

    /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
    old_wqs = uth->uu_wqset;
    uth->uu_wqset = kn->kn_kq->kq_wqs;
    selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
    uth->uu_wqset = old_wqs;

        if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {

            kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;

        kn->kn_data = selres;

    return (kn->kn_data != 0);

filt_specpeek(struct knote *kn)
{
    wait_queue_set_t old_wqs;

    uth = get_bsdthread_info(current_thread());
    ctx = vfs_context_current();
    vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

    error = vnode_getwithvid(vp, kn->kn_hookid);
        return 1;        /* Just like VNOP_SELECT() on recycled vnode */

    /*
     * Why pass the link here? Because we may not have registered in the past...
     */
    old_wqs = uth->uu_wqset;
    uth->uu_wqset = kn->kn_kq->kq_wqs;
    selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
    uth->uu_wqset = old_wqs;