/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>		/* cdevsw/bdevsw */
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/namei.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>		/* S_IFCHR, S_IFBLK */
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>		/* FWRITE */
#include <sys/user.h>		/* struct uthread */
#include <sys/malloc.h>
#include <sys/disk.h>		/* DKIOC* ioctls */
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>

#include <sys/kdebug.h>
/* XXX following three prototypes should be in a header file somewhere */
extern int	isdisk(dev_t dev, int type);
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
struct vnode *speclisth[SPECHSZ];
/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";
#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };
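/*
 * Descriptive note: this opv_desc is handed to the VFS init code (see
 * vfs_opv_init() in bsd/vfs/vfs_init.c), which allocates and fills in the
 * spec_vnodeop_p operation vector at boot; special-file vnodes created by
 * filesystems such as devfs then dispatch their vnode operations through
 * that vector.
 */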
static void set_blocksize(vnode_t, dev_t);
/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{
	*ap->a_vpp = NULL;
	return (ENOTDIR);
}
static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	} else
		vp->v_specsize = DEV_BSIZE;
}
void
set_fsblocksize(struct vnode *vp)
{
	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}
}
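/*
 * Usage note (descriptive only): a filesystem that changes the logical
 * block size of its backing block device, e.g. via DKIOCSETBLOCKSIZE at
 * mount time, is expected to call set_fsblocksize() on the device vnode
 * afterwards so that v_specsize stays in sync with the sector size used
 * by spec_read() and spec_write() below.
 */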
/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;
	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		if (cdevsw[maj].d_type == D_TTY) {
			vnode_lock(vp);
			vp->v_flag |= VISTTY;
			vnode_unlock(vp);
		}
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */
				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = EIO;
			}
			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);
		}
		return (error);

	default:
		panic("spec_open type");
	}
	return (0);
}
/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
						   (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			// LP64todo - fix this!
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
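/*
 * Worked example of the clustering math above (descriptive only): with
 * PAGE_SIZE = 4096 and devBlockSize = 512, bscale = 8 and bsize = 4096,
 * so a read at uio_offset = 5120 gives on = 1024 and
 * bn = (5120 / 512) & ~7 = 8; the transfer is always staged through
 * page-aligned, page-sized buffers.  v_speclastr remembers the previous
 * cluster so strictly sequential readers trigger buf_breadn() read-ahead
 * of the next cluster.
 */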
/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int io_size;
	int devBlockSize = 0;
	int n, on;
	int error = 0;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);
		// LP64todo - fix this!
		io_size = uio_resid(uio);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;

		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			// LP64todo - fix this!
			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are reading exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The read doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the read to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
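/*
 * Design note (descriptive only): on the synchronous path (IO_SYNC)
 * buf_bwrite() forces the block to the device before returning;
 * otherwise a write that exactly fills the block is started
 * asynchronously with buf_bawrite(), while a partial block is left
 * dirty in the cache via buf_bdwrite() in the hope that the rest of
 * the block will be written soon and the I/Os can be coalesced.
 */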
/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;

	switch (ap->a_vp->v_type) {

	case VCHR:
		return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						      ap->a_fflag, p));

	case VBLK:
		if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) {
			if (bdevsw[major(dev)].d_type == D_TAPE)
				return (0);
			else
				return (1);
		}
		return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
						      ap->a_fflag, p));

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	return (0);
}
int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}
/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync");

	return (0);
}
int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

// the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond
#define LOWPRI_INITIAL_WINDOW_MSECS	100
#define LOWPRI_WINDOW_MSECS_INC		50
#define LOWPRI_MAX_WINDOW_MSECS		200
#define LOWPRI_MAX_WAITING_MSECS	200
#define LOWPRI_SLEEP_INTERVAL		5
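/*
 * Window arithmetic sketch (descriptive only): a newly throttled thread
 * starts with a window of LOWPRI_INITIAL_WINDOW_MSECS (100 ms) plus
 * LOWPRI_WINDOW_MSECS_INC (50 ms) for each thread already throttling on
 * the same device; subsequent throttled I/Os grow the window by the same
 * increment, capped at LOWPRI_MAX_WINDOW_MSECS (200 ms) per throttling
 * thread.  See spec_strategy() below.
 */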
struct _throttle_io_info_t {
	struct timeval	last_normal_IO_timestamp;
	struct timeval	last_IO_timestamp;
	SInt32		numthreads_throttling;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int	lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int	lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
int	lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
int	lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	size_t devbsdunit;

	devbsdunit = mp->mnt_devbsdunit;

	if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
		*tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
	} else {
		memset(tv, 0, sizeof(*tv));
	}
}
void
update_last_io_time(mount_t mp)
{
	size_t devbsdunit;

	devbsdunit = mp->mnt_devbsdunit;

	if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
		microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
	}
}
int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
{
	struct timeval elapsed;
	int elapsed_msecs;

	microuptime(&elapsed);
	timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
	elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

	if (lowpri_window_msecs == -1)	// use the max waiting time
		lowpri_window_msecs = lowpri_max_waiting_msecs;

	return elapsed_msecs < lowpri_window_msecs;
}
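/*
 * Example (descriptive only): if a normal-priority I/O last hit this
 * device 60 ms ago and the calling thread's window is 150 ms, then
 * elapsed_msecs (60) < lowpri_window_msecs (150) and the low-priority
 * I/O is still considered throttled; once 150 ms pass with no normal
 * I/O, the thread is allowed to proceed.
 */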
void throttle_lowpri_io(boolean_t ok_to_sleep)
{
	int i;
	int max_try_num;
	struct uthread *ut;
	SInt32 oldValue;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window == 0)
		return;

	max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
		     ut->uu_lowpri_window, 0, 0, 0, 0);

	if (ok_to_sleep == TRUE) {
		for (i = 0; i < max_try_num; i++) {
			if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) {
				IOSleep(LOWPRI_SLEEP_INTERVAL);
			} else {
				break;
			}
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		     ut->uu_lowpri_window, i*5, 0, 0, 0);
	oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
	ut->uu_lowpri_window = 0;

	if (oldValue <= 0) {
		panic("%s: numthreads negative", __func__);
	}
}
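/*
 * Bounding sketch (descriptive only): with the defaults above,
 * max_try_num = (200 / 5) * MAX(1, numthreads_throttling) iterations of
 * IOSleep(5), so a single throttled thread waits at most roughly 200 ms
 * here before being allowed through regardless of ongoing normal
 * priority I/O, and the bound scales with the number of throttling
 * threads on the device.
 */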
int throttle_get_io_policy(struct uthread **ut)
{
	int policy = IOPOL_DEFAULT;
	proc_t p = current_proc();

	*ut = get_bsdthread_info(current_thread());

	if (p != NULL)
		policy = p->p_iopol_disk;

	if (*ut != NULL) {
		// the I/O policy of the thread overrides that of the process
		// unless the I/O policy of the thread is default
		if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
			policy = (*ut)->uu_iopol_disk;
	}
	return policy;
}
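/*
 * Usage note (descriptive only): the process-wide value p->p_iopol_disk
 * and the per-thread uu_iopol_disk are set from user space through the
 * iopolicysys() syscall (the setiopolicy_np(3) wrapper); a thread-level
 * IOPOL_THROTTLE or IOPOL_PASSIVE beats whatever the process requested,
 * while IOPOL_DEFAULT at thread level defers to the process setting.
 */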
int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	dev_t	bdev;
	int	code = 0;

	bp = ap->a_bp;
	bdev = buf_device(bp);
	bflags = buf_flags(bp);

	if (bflags & B_READ)
		code |= DKIO_READ;
	if (bflags & B_ASYNC)
		code |= DKIO_ASYNC;

	if (bflags & B_META)
		code |= DKIO_META;
	else if (bflags & B_PAGEIO)
		code |= DKIO_PAGING;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
			      (unsigned int)bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);

	if (((bflags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (lowpri_IO_initial_window_msecs) {
		struct uthread	*ut;
		int policy = IOPOL_DEFAULT;
		int is_throttleable_io = 0;
		int is_passive_io = 0;
		size_t devbsdunit;
		SInt32 oldValue;

		policy = throttle_get_io_policy(&ut);

		switch (policy) {
		case IOPOL_DEFAULT:
		case IOPOL_NORMAL:
			break;
		case IOPOL_THROTTLE:
			is_throttleable_io = 1;
			break;
		case IOPOL_PASSIVE:
			is_passive_io = 1;
			break;
		default:
			printf("unknown I/O policy %d", policy);
			break;
		}

		if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
			is_passive_io = 1;

		if (buf_vnode(bp)->v_mount != NULL)
			devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
		else
			devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
		if (!is_throttleable_io) {
			if (!is_passive_io) {
				microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
			}
		} else {
			/*
			 * I'd really like to do the IOSleep here, but
			 * we may be holding all kinds of filesystem related locks
			 * and the pages for this I/O marked 'busy'...
			 * we don't want to cause a normal task to block on
			 * one of these locks while we're throttling a task marked
			 * for low priority I/O... we'll mark the uthread and
			 * do the delay just before we return from the system
			 * call that triggered this I/O or from vnode_pagein
			 */
			if (ut->uu_lowpri_window == 0) {
				ut->uu_devbsdunit = devbsdunit;
				oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
				if (oldValue < 0) {
					panic("%s: numthreads negative", __func__);
				}
				ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
				ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
			} else {
				if (ut->uu_devbsdunit != devbsdunit) {	// the thread sends I/Os to different devices within the same system call
					// keep track of the numthreads in the right device
					OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
					OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
					ut->uu_devbsdunit = devbsdunit;
				}
				int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
				ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
				if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
					ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
			}
		}
	}
	if ((bflags & B_READ) == 0) {
		size_t devbsdunit;

		if (buf_vnode(bp)->v_mount != NULL)
			devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
		else
			devbsdunit = LOWPRI_MAX_NUM_DEV - 1;

		microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
	}
	(*bdevsw[major(bdev)].d_strategy)(bp);

	return (0);
}
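/*
 * Flow summary (descriptive only): spec_strategy() never sleeps on
 * behalf of a throttled thread; it only charges the thread a delay
 * window (uu_lowpri_window) and updates the per-device timestamps and
 * thread counts.  The actual sleep happens in throttle_lowpri_io(),
 * called as the thread leaves the system call (or vnode_pagein) that
 * issued the I/O, where no filesystem locks are held.
 */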
/*
 * This is a noop, simply returning what one has been given.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}
/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int (*devclose)(dev_t, int, int, struct proc *);
	int mode, error;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 2 (this last descriptor
		 * plus the session), release the reference from the session.
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 2) &&
			    (vp == sessp->s_ttyvp)) {
				session_lock(sessp);
				sessp->s_ttyvp = NULL;
				sessp->s_ttyvid = 0;
				sessp->s_ttyp = NULL;
				sessp->s_ttypgrpid = NO_PID;
				session_unlock(sessp);
				vnode_rele(vp);
			}
			session_rele(sessp);
		}

		devclose = cdevsw[major(dev)].d_close;
		mode = S_IFCHR;
		/*
		 * close on last reference or on vnode revoke call
		 */
		if ((flags & IO_REVOKE) != 0)
			break;
		if (vcount(vp) > 1)
			return (0);
		break;

	case VBLK:
#ifdef DEVFS_IMPLEMENTS_LOCKING
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);
		/*
		 * Since every use (buffer, vnode, swap, blockmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		if (vcount(vp) > 1)
			return (0);
#else /* DEVFS_IMPLEMENTS_LOCKING */
		/*
		 * Since every use (buffer, vnode, swap, blockmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		if (vcount(vp) > 1)
			return (0);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);
#endif /* DEVFS_IMPLEMENTS_LOCKING */
		devclose = bdevsw[major(dev)].d_close;
		mode = S_IFBLK;
		break;

	default:
		panic("spec_close: not special");
		return (EBADF);
	}

	return ((*devclose)(dev, flags, mode, p));
}
/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{
	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}
/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{
	return (EBADF);
}
/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */

	return (0);
}
/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */

	return (0);
}