/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/namei.h>
#include <sys/vnode_internal.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <sys/kdebug.h>

/* XXX following four prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc *)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };

static void set_blocksize(vnode_t, dev_t);

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{
	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	} else
		vp->v_specsize = DEV_BSIZE;
}
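
/*
 * Illustrative example of the fallback above: a block driver whose d_psize
 * entry reports 2048 leaves v_specsize at 2048, while a driver with no
 * d_psize entry, or one that returns a value <= 0, falls back to
 * DEV_BSIZE (512 bytes).
 */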

void
set_fsblocksize(struct vnode *vp)
{
	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		set_blocksize(vp, dev);
	}
}

/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		if (cdevsw[maj].d_type == D_TTY) {
			vnode_lock(vp);
			vp->v_flag |= VISTTY;
			vnode_unlock(vp);
		}
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */
				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context))
						setsize = 1;
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = EPROTO;
			}
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */
		}
		return (error);

	default:
		panic("spec_open type");
	}
	return (0);
}

int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");

	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
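
			/*
			 * Worked example (illustrative): with devBlockSize = 512
			 * and a 4096-byte PAGE_SIZE, bscale is 8 and bsize 4096.
			 * A read at uio_offset 6144 then has on = 6144 % 4096 = 2048
			 * and bn = (6144 / 512) & ~7 = 8, i.e. the I/O is rounded
			 * down to the page-aligned run of device blocks that
			 * contains the offset.
			 */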

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
						   (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vp->v_speclastr = bn;

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);

		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */
	return (0);
}

int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;

		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are writing exactly a block on a block
			 *	boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The write doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
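			/*
			 * For example, a write that covers a whole bsize block
			 * (on == 0, n == bsize) can use buf_getblk() and simply
			 * overwrite the block, whereas a partial-block write must
			 * first buf_bread() the existing block so the bytes
			 * outside the written range are preserved.
			 */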
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the write to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);

		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */
	return (0);
}

/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	case VBLK:
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						       ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
			      (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into the slave side of a tty */
		err = ptsd_kqfilter(dev, kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return (err);
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}

/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

// a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS	100
#define LOWPRI_WINDOW_MSECS_INC		50
#define LOWPRI_MAX_WINDOW_MSECS		200
#define LOWPRI_MAX_WAITING_MSECS	200
#define LOWPRI_SLEEP_INTERVAL		5

struct _throttle_io_info_t {
	struct timeval	last_normal_IO_timestamp;
	struct timeval	last_IO_timestamp;
	SInt32		numthreads_throttling;
	SInt32		refcnt;
	int		alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;

#if DEBUG
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
	do { \
		if ((debug_info)->alloc) \
			printf("%s: "format, __FUNCTION__, ## args); \
	} while(0)
#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif
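
/*
 * The four knobs below are exported under the debug sysctl tree, so the
 * throttling window parameters can be inspected and tuned at run time,
 * e.g. "sysctl debug.lowpri_IO_initial_window_msecs" (values are in
 * milliseconds).
 */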
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");

/*
 * Release the reference and, if the item was allocated and this is the last
 * reference, free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue - 1), info);

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once the reference count is zero, no one else should be able to
	 * take a reference.
	 */
	if ((info->refcnt == 0) && (info->alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info);
		FREE(info, M_TEMP);
	}
	return oldValue;
}

/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue - 1), info);
	/* Allocated items should never have a reference of zero */
	if (info->alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}

/*
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
	info->alloc = TRUE;
	/* Take a reference */
	OSIncrementAtomic(&info->refcnt);
	return info;
}

/*
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
		(struct _throttle_io_info_t *)throttle_info,
		(struct _throttle_io_info_t *)throttle_info);
	if (throttle_info) /* Just to be careful */
		throttle_info_rel(throttle_info);
}

/*
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);
	/* We already have a reference; release it before adding the new one */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}

/*
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}
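
/*
 * Rough usage sketch for the KPI routines above (illustrative only; any
 * file system using them would pair the calls along these lines):
 *
 *	// in the mount path
 *	void *ti = throttle_info_create();
 *	if (ti != NULL)
 *		throttle_info_mount_ref(mp, ti);
 *
 *	// on each I/O issued on behalf of the mount
 *	throttle_info_update(ti, bflags);
 *
 *	// in the unmount path
 *	throttle_info_mount_rel(mp);
 *	throttle_info_release(ti);	// drops the create reference
 */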

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->last_IO_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->last_IO_timestamp);
}

static int
throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void *throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	int elapsed_msecs;

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->last_normal_IO_timestamp);
	elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

	if (lowpri_window_msecs == -1)	// use the max waiting time
		lowpri_window_msecs = lowpri_max_waiting_msecs;

	return elapsed_msecs < lowpri_window_msecs;
}

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
{
	void *info;

	/* Should we just return zero if no mount point? */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
}

void throttle_lowpri_io(boolean_t ok_to_sleep)
{
	int i;
	int max_try_num;
	struct uthread *ut;
	SInt32 oldValue;
	struct _throttle_io_info_t *info;

	ut = get_bsdthread_info(current_thread());

	if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
		goto done;

	info = ut->uu_throttle_info;
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
		     ut->uu_lowpri_window, ok_to_sleep, 0, 0, 0);

	if (ok_to_sleep == TRUE) {
		max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, info->numthreads_throttling);

		for (i = 0; i < max_try_num; i++) {
			if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
				IOSleep(LOWPRI_SLEEP_INTERVAL);
				DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info);
			} else
				break;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		     ut->uu_lowpri_window, i*5, 0, 0, 0);

	oldValue = OSDecrementAtomic(&info->numthreads_throttling);
	if (oldValue < 0)
		panic("%s: numthreads negative", __func__);
done:
	ut->uu_lowpri_window = 0;
	if (ut->uu_throttle_info)
		throttle_info_rel(ut->uu_throttle_info);
	ut->uu_throttle_info = NULL;
}
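
/*
 * Back-of-the-envelope example for the sleep loop in throttle_lowpri_io()
 * above: with the default lowpri_max_waiting_msecs (200) and
 * LOWPRI_SLEEP_INTERVAL (5), a single throttled thread retries at most
 * 200 / 5 = 40 times, sleeping 5 ms per iteration, so it is delayed for
 * roughly 200 ms at most; the cap scales up with the number of threads
 * currently being throttled against the same device.
 */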

int throttle_get_io_policy(struct uthread **ut)
{
	int policy = IOPOL_DEFAULT;
	proc_t p = current_proc();

	*ut = get_bsdthread_info(current_thread());

	if (p != NULL)
		policy = p->p_iopol_disk;

	if (*ut != NULL) {
		// the I/O policy of the thread overrides that of the process
		// unless the I/O policy of the thread is default
		if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
			policy = (*ut)->uu_iopol_disk;
	}
	return policy;
}
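
/*
 * Example of the precedence above: a thread whose own disk policy is
 * IOPOL_THROTTLE is treated as throttled even if its process is left at
 * IOPOL_DEFAULT, while a thread still at IOPOL_DEFAULT simply inherits
 * whatever policy the process set (e.g. via setiopolicy_np()).
 */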

void throttle_info_update(void *throttle_info, int flags)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct uthread *ut;
	int policy;
	int is_throttleable_io = 0;
	int is_passive_io = 0;
	SInt32 oldValue;

	if (!lowpri_IO_initial_window_msecs || (info == NULL))
		return;
	policy = throttle_get_io_policy(&ut);

	switch (policy) {
	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		break;
	case IOPOL_THROTTLE:
		is_throttleable_io = 1;
		break;
	case IOPOL_PASSIVE:
		is_passive_io = 1;
		break;
	default:
		printf("unknown I/O policy %d", policy);
		break;
	}

	if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
		is_passive_io = 1;

	if (!is_throttleable_io) {
		if (!is_passive_io)
			microuptime(&info->last_normal_IO_timestamp);
	} else if (ut) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (ut->uu_lowpri_window == 0) {
			ut->uu_throttle_info = info;
			throttle_info_ref(ut->uu_throttle_info);
			DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info);

			oldValue = OSIncrementAtomic(&info->numthreads_throttling);
			if (oldValue < 0)
				panic("%s: numthreads negative", __func__);

			ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
			ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
		} else {
			/* The thread sends I/Os to different devices within the same system call */
			if (ut->uu_throttle_info != info) {
				struct _throttle_io_info_t *old_info = ut->uu_throttle_info;

				// keep track of the numthreads in the right device
				OSDecrementAtomic(&old_info->numthreads_throttling);
				OSIncrementAtomic(&info->numthreads_throttling);

				DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info);
				DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info);
				/* This thread no longer needs a reference on that throttle info */
				throttle_info_rel(ut->uu_throttle_info);
				ut->uu_throttle_info = info;
				/* Need to take a reference on this throttle info */
				throttle_info_ref(ut->uu_throttle_info);
			}
			int numthreads = MAX(1, info->numthreads_throttling);
			ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
			if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
				ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
		}
	}
}

int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	int	devbsdunit;
	mount_t	mp;

	bp = ap->a_bp;
	bdev = buf_device(bp);
	bflags = buf_flags(bp);
	mp = buf_vnode(bp)->v_mount;

	if (kdebug_enable) {
		int code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
				      bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL)
		devbsdunit = mp->mnt_devbsdunit;
	else
		devbsdunit = LOWPRI_MAX_NUM_DEV - 1;

	throttle_info_update(&_throttle_io_info[devbsdunit], bflags);
	if ((policy = throttle_get_io_policy(&ut)) == IOPOL_THROTTLE) {
		bp->b_flags |= B_THROTTLED_IO;
	}

	if ((bflags & B_READ) == 0) {
		microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
		if (mp)
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}

	(*bdevsw[major(bdev)].d_strategy)(bp);

	return (0);
}

/*
 * This is a noop, simply returning what one has been given.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}

/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int (*devclose)(dev_t, int, int, struct proc *);
	int mode, error;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 1 (this is the very
		 * last close).
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 1) &&
			    (vp == sessp->s_ttyvp)) {
				session_lock(sessp);
				sessp->s_ttyvp = NULL;
				sessp->s_ttyvid = 0;
				sessp->s_ttyp = TTY_NULL;
				sessp->s_ttypgrpid = NO_PID;
				session_unlock(sessp);
			}
			session_rele(sessp);
		}

		devclose = cdevsw[major(dev)].d_close;
		mode = S_IFCHR;
		/*
		 * close on last reference or on vnode revoke call
		 */
		if ((flags & IO_REVOKE) != 0)
			break;
		if (vcount(vp) > 1)
			return (0);
		break;

	case VBLK:
		/*
		 * Since every use (buffer, vnode, swap, blockmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to zero, we are on last close.
		 */
		if (vcount(vp) > 1)
			return (0);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devclose = bdevsw[major(dev)].d_close;
		mode = S_IFBLK;
		break;

	default:
		panic("spec_close: not special");
	}

	return ((*devclose)(dev, flags, mode, p));
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{
	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{
	return (EBADF);
}

/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */
	return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */
	return (0);
}