/*
 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>

/* XXX the following prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);


/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{

	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)        /* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	}
	else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{

	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}

}


/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}

		if (cdevsw[maj].d_type == D_TTY) {
			vnode_lock(vp);
			vp->v_flag |= VISTTY;
			vnode_unlock(vp);
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
		    u_int64_t blkcnt;
		    u_int32_t blksize;
		    int setsize = 0;
		    u_int32_t size512 = 512;


		    if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
			/* Switch to 512 byte sectors (temporarily) */

			if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
			    /* Get the number of 512 byte physical blocks. */
			    if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
				setsize = 1;
			    }
			}
			/* If it doesn't set back, we can't recover */
			if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
			    error = ENXIO;
		    }


		    vnode_lock(vp);
		    set_blocksize(vp, dev);

		    /*
		     * Cache the size in bytes of the block device for later
		     * use by spec_write().
		     */
		    if (setsize)
			vp->v_specdevsize = blkcnt * (u_int64_t)size512;
		    else
			vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

		    vnode_unlock(vp);

		}
		return(error);
	default:
		panic("spec_open type");
	}
	return (0);
}

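/*
 * Illustrative sketch (an assumption, not part of xnu): the same DKIOC queries
 * spec_open() issues above can be made from userspace with ioctl(2) on a device
 * node, using the public <sys/disk.h> interface.  The device path below is a
 * placeholder, and unlike spec_open() this sketch does not temporarily switch
 * the device to 512-byte sectors; it simply multiplies the native block size
 * by the block count to get the media size.
 */
#if 0	/* illustrative userspace example, not compiled as part of this file */
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/dev/rdisk1";	/* placeholder device node */
	uint32_t blksize = 0;
	uint64_t blkcnt = 0;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == -1 ||
	    ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == -1) {
		perror("ioctl");
		close(fd);
		return 1;
	}
	printf("%s: %u-byte blocks, %llu blocks, %llu bytes\n",
	    path, blksize, (unsigned long long)blkcnt,
	    (unsigned long long)(blkcnt * blksize));
	close(fd);
	return 0;
}
#endif
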
/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

	        bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
					       (int *)&bsize, 1, NOCRED, &bp);
			} else
			        error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
			        if (!error)
				        error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}

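/*
 * Illustrative sketch of the offset arithmetic used in the VBLK loop above:
 * bscale is the number of device blocks per buffer, bn is the buffer-aligned
 * device block number, and on is the byte offset within that buffer.  The
 * sizes and the example offset below are assumptions chosen for illustration.
 */
#if 0	/* illustrative example, not compiled as part of this file */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t page_size = 4096;		/* assume 4K pages */
	int64_t devBlockSize = 512;		/* assume 512-byte device blocks */
	int64_t bscale = page_size / devBlockSize;	/* 8 blocks per buffer */
	int64_t bsize = bscale * devBlockSize;		/* 4096-byte buffers */
	int64_t uio_offset = 10000;		/* example byte offset */

	/* same expressions as spec_read(): round down to a bscale-aligned block */
	int64_t bn = (uio_offset / devBlockSize) & ~(bscale - 1);
	int64_t on = uio_offset % bsize;

	printf("offset %lld -> block %lld, offset within buffer %lld\n",
	    (long long)uio_offset, (long long)bn, (long long)on);
	/* offset 10000 -> block 16 (byte 8192), offset within buffer 1808 */
	return 0;
}
#endif
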
/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return(EINVAL);

	        bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;


		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

55e303ae 474 /*
91447636 475 * Use buf_getblk() as an optimization IFF:
55e303ae
A
476 *
477 * 1) We are reading exactly a block on a block
478 * aligned boundary
479 * 2) We know the size of the device from spec_open
480 * 3) The read doesn't span the end of the device
481 *
91447636 482 * Otherwise, we fall back on buf_bread().
55e303ae
A
483 */
484 if (n == bsize &&
485 vp->v_specdevsize != (u_int64_t)0 &&
486 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
487 /* reduce the size of the read to what is there */
488 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
489 }

			if (n == bsize)
			        bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
			        error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
			        error = buf_bwrite(bp);
			else {
			        if ((n + on) == bsize)
				        error = buf_bawrite(bp);
				else
				        error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}

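/*
 * Illustrative sketch of the decision made in the VBLK loop above: a write
 * that covers a whole buffer can use buf_getblk() and skip reading the old
 * contents, while a partial write must buf_bread() first (read-modify-write).
 * The helper and the values below are assumptions for the example only.
 */
#if 0	/* illustrative example, not compiled as part of this file */
#include <stdint.h>
#include <stdio.h>

/*
 * Returns 1 if a write of 'resid' bytes at 'offset' overwrites the entire
 * buffer it lands in (mirroring the n == bsize test in spec_write() above),
 * so the old contents never need to be read in.
 */
static int
overwrites_whole_buffer(int64_t offset, int64_t resid, int64_t bsize)
{
	int64_t on = offset % bsize;		/* offset within the buffer */
	int64_t n = bsize - on;			/* bytes that fit in this buffer */

	if (n > resid)
		n = resid;
	return (n == bsize);
}

int
main(void)
{
	int64_t bsize = 4096;

	printf("%d\n", overwrites_whole_buffer(8192, 4096, bsize));	/* 1: aligned, full buffer */
	printf("%d\n", overwrites_whole_buffer(8192, 1024, bsize));	/* 0: partial, needs read first */
	printf("%d\n", overwrites_whole_buffer(9000, 8192, bsize));	/* 0: misaligned start */
	return 0;
}
#endif
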
/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
		(unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, p);
		break;

	case VBLK:
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
		(unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return err;
}

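/*
 * Illustrative userspace sketch (an assumption, not part of xnu): a kevent(2)
 * registration against a tty or bpf device node is what ends up calling
 * spec_kqfilter() above.  This example watches the controlling terminal for
 * readability.
 */
#if 0	/* illustrative userspace example, not compiled as part of this file */
#include <sys/event.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/tty", O_RDONLY);	/* a character device handled by spec_kqfilter() */
	int kq = kqueue();
	struct kevent ev;

	if (fd < 0 || kq < 0) {
		perror("open/kqueue");
		return 1;
	}
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1) {	/* registration path reaches spec_kqfilter() */
		perror("kevent register");
		return 1;
	}
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)	/* blocks until the tty is readable */
		printf("fd %d readable, %ld bytes pending\n", (int)ev.ident, (long)ev.data);

	close(kq);
	close(fd);
	return 0;
}
#endif
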
/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}

/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;
void IOSleep(int);

// a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
#define LOWPRI_INITIAL_WINDOW_MSECS 100
#define LOWPRI_WINDOW_MSECS_INC	50
#define LOWPRI_MAX_WINDOW_MSECS 200
#define LOWPRI_MAX_WAITING_MSECS 200

#if CONFIG_EMBEDDED
#define LOWPRI_SLEEP_INTERVAL 5
#else
#define LOWPRI_SLEEP_INTERVAL 2
#endif

struct _throttle_io_info_t {
	struct timeval	last_normal_IO_timestamp;
	struct timeval	last_IO_timestamp;
	SInt32 numthreads_throttling;
	SInt32 refcnt;
	SInt32 alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
int	lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
int	lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
int	lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
int	lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
	do {							\
		if ((debug_info)->alloc)			\
			printf("%s: "format, __FUNCTION__, ## args);	\
	} while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");

/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit
	 */
	if (n == 0)
		return sizeof(n) * 8;
	int count = 0;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}

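/*
 * Illustrative sketch: throttle_info_ref_by_mask() below uses this helper to
 * turn the lowest set bit of a throttle mask into an index into
 * _throttle_io_info[].  A standalone copy of the same loop, with an assumed
 * mask value, behaves as follows.
 */
#if 0	/* illustrative example, not compiled as part of this file */
#include <stdint.h>
#include <stdio.h>

static int
trailing_zeros(uint64_t n)
{
	int count = 0;

	if (n == 0)
		return sizeof(n) * 8;
	while ((n & 1) == 0) {
		n >>= 1;
		++count;
	}
	return count;
}

int
main(void)
{
	uint64_t throttle_mask = 1ULL << 5;	/* assume BSD disk unit 5 */

	/* lowest set bit -> index into _throttle_io_info[] */
	printf("dev_index = %d\n", trailing_zeros(throttle_mask));	/* prints 5 */
	printf("dev_index = %d\n", trailing_zeros(0));			/* no bit set: 64 */
	return 0;
}
#endif
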
/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once reference count is zero, no one else should be able to take a
	 * reference
	 */
	if ((info->refcnt == 0) && (info->alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
		FREE(info, M_TEMP);
	}
	return oldValue;
}

/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );
	/* Allocated items should never have a reference of zero */
	if (info->alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}

/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
	info->alloc = TRUE;
	/* Take a reference */
	OSIncrementAtomic(&info->refcnt);
	return info;
}

/*
 * KPI routine
 *
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
		(struct _throttle_io_info_t *)throttle_info,
		(struct _throttle_io_info_t *)throttle_info);
	if (throttle_info) /* Just to be careful */
		throttle_info_rel(throttle_info);
}

/*
 * KPI routine
 *
 * File Systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File Systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);
	/* If we already have a reference, release it before adding the new one */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}

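/*
 * Illustrative sketch (hypothetical file system code, not part of xnu): how a
 * file system that allocates its own throttle info would pair these KPI calls.
 * One throttle_info_create()/throttle_info_mount_ref() at mount time is
 * matched by throttle_info_mount_rel()/throttle_info_release() at unmount
 * time.  The struct and function names below are made up for the example.
 */
#if 0	/* illustrative example, not compiled as part of this file */
struct examplefs_mount {			/* hypothetical per-mount state */
	void	*ex_throttle_info;
};

static int
examplefs_mount(mount_t mp, struct examplefs_mount *exmp)
{
	/* one create + one mount_ref at mount time... */
	exmp->ex_throttle_info = throttle_info_create();
	if (exmp->ex_throttle_info == NULL)
		return ENOMEM;
	throttle_info_mount_ref(mp, exmp->ex_throttle_info);
	return 0;
}

static void
examplefs_unmount(mount_t mp, struct examplefs_mount *exmp)
{
	/* ...matched by mount_rel + release at unmount time */
	throttle_info_mount_rel(mp);
	throttle_info_release(exmp->ex_throttle_info);
	exmp->ex_throttle_info = NULL;
}
#endif
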
/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask.  The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask,
			  throttle_info_handle_t *throttle_info_handle)
{
	int dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t**)throttle_info_handle = info;
	return 0;
}

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/* for now the handle is just a pointer to _throttle_io_info_t */
	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
}

/*
 * KPI routine
 *
 * File Systems that call throttle_info_mount_ref must call this routine in
 * their umount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->last_IO_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->last_IO_timestamp);
}


#if CONFIG_EMBEDDED

int throttle_get_io_policy(struct uthread **ut)
{
	int policy = IOPOL_DEFAULT;
	proc_t p = current_proc();

	*ut = get_bsdthread_info(current_thread());

	if (p != NULL)
		policy = p->p_iopol_disk;

	if (*ut != NULL) {
		// the I/O policy of the thread overrides that of the process
		// unless the I/O policy of the thread is default
		if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
			policy = (*ut)->uu_iopol_disk;
	}
	return policy;
}
#else

int throttle_get_io_policy(__unused struct uthread **ut)
{
	*ut = get_bsdthread_info(current_thread());

	return (proc_get_task_selfdiskacc());
}
#endif


static int
throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	int elapsed_msecs;
	int policy;
	struct uthread	*ut;

	policy = throttle_get_io_policy(&ut);

	if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
		return (0);

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->last_normal_IO_timestamp);
	elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

	if (lowpri_window_msecs == -1) // use the max waiting time
		lowpri_window_msecs = lowpri_max_waiting_msecs;

	return elapsed_msecs < lowpri_window_msecs;
}

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
{
	void *info;

	/* Should we just return zero if no mount point */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;
	return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
}

uint32_t
throttle_lowpri_io(int sleep_amount)
{
	int sleep_cnt = 0;
	int numthreads_throttling;
	int max_try_num;
	struct uthread *ut;
	struct _throttle_io_info_t *info;
	int max_waiting_msecs;

	ut = get_bsdthread_info(current_thread());

	if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
		goto done;

	info = ut->uu_throttle_info;

	if (sleep_amount != 0) {
#if CONFIG_EMBEDDED
		max_waiting_msecs = lowpri_max_waiting_msecs;
#else
		if (ut->uu_throttle_isssd == TRUE)
			max_waiting_msecs = lowpri_max_waiting_msecs / 100;
		else
			max_waiting_msecs = lowpri_max_waiting_msecs;
#endif
		if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
			max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;

		numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
		max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);

		for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
			if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
				if (sleep_cnt == 0) {
					KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
							      ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
				}
				IOSleep(LOWPRI_SLEEP_INTERVAL);
				DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
			} else {
				break;
			}
		}
		if (sleep_cnt) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
					      ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
		}
	}
	SInt32 oldValue;
	oldValue = OSDecrementAtomic(&info->numthreads_throttling);

	if (oldValue <= 0) {
		panic("%s: numthreads negative", __func__);
	}
done:
	ut->uu_lowpri_window = 0;
	if (ut->uu_throttle_info)
		throttle_info_rel(ut->uu_throttle_info);
	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;

	return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
}

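/*
 * Illustrative sketch of the window arithmetic behind uu_lowpri_window, using
 * the constants defined above and an assumed count of already-throttled
 * threads; see throttle_info_set_initial_window() and
 * throttle_info_update_internal() below for the real code.
 */
#if 0	/* illustrative example, not compiled as part of this file */
#include <stdio.h>

int
main(void)
{
	int initial = 100;	/* LOWPRI_INITIAL_WINDOW_MSECS */
	int inc = 50;		/* LOWPRI_WINDOW_MSECS_INC */
	int max = 200;		/* LOWPRI_MAX_WINDOW_MSECS */
	int already_throttling = 3;	/* assumed value of the per-device counter */

	/* initial window, as in throttle_info_set_initial_window() */
	int window = initial + inc * already_throttling;	/* 100 + 150 = 250 ms */

	/* later growth and cap, as in throttle_info_update_internal() */
	int numthreads = already_throttling + 1;		/* this thread now counted too */
	window += inc * numthreads;				/* 250 + 200 = 450 ms */
	if (window > max * numthreads)
		window = max * numthreads;			/* cap would be 800 ms here */

	printf("lowpri window = %d msecs\n", window);		/* prints 450 */
	return 0;
}
#endif
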
/*
 * KPI routine
 *
 * set a kernel thread's IO policy.  policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
#if !CONFIG_EMBEDDED
	proc_apply_thread_selfdiskacc(policy);
#else /* !CONFIG_EMBEDDED */
	struct uthread *ut;
	ut = get_bsdthread_info(current_thread());
	ut->uu_iopol_disk = policy;
#endif /* !CONFIG_EMBEDDED */
}

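/*
 * Illustrative userspace sketch (an assumption, not part of xnu): the policies
 * named above are the ones a process or thread selects through the
 * setiopolicy_np(3) interface declared in <sys/resource.h>.
 */
#if 0	/* illustrative userspace example, not compiled as part of this file */
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	/* ask that this thread's disk I/O be treated as low priority (throttled) */
	if (setiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_THREAD, IOPOL_THROTTLE) != 0) {
		perror("setiopolicy_np");
		return 1;
	}
	printf("current thread disk I/O policy: %d\n",
	    getiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_THREAD));
	return 0;
}
#endif
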

static
void throttle_info_reset_window(struct uthread *ut)
{
	struct _throttle_io_info_t *info;

	info = ut->uu_throttle_info;

	OSDecrementAtomic(&info->numthreads_throttling);
	throttle_info_rel(info);
	ut->uu_throttle_info = NULL;
	ut->uu_lowpri_window = 0;
}

static
void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
{
	SInt32 oldValue;

	ut->uu_throttle_info = info;
	throttle_info_ref(info);
	DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );

	oldValue = OSIncrementAtomic(&info->numthreads_throttling);
	if (oldValue < 0) {
		panic("%s: numthreads negative", __func__);
	}
	ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
	ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
	ut->uu_throttle_isssd = isssd;
	ut->uu_throttle_bc = BC_throttle;
}


static
void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct uthread	*ut;
	int policy;
	int is_throttleable_io = 0;
	int is_passive_io = 0;

	if (!lowpri_IO_initial_window_msecs || (info == NULL))
		return;
	policy = throttle_get_io_policy(&ut);

	switch (policy) {
	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		break;
	case IOPOL_THROTTLE:
		is_throttleable_io = 1;
		break;
	case IOPOL_PASSIVE:
		is_passive_io = 1;
		break;
	default:
		printf("unknown I/O policy %d", policy);
		break;
	}

	if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
		is_passive_io |= 1;

	if (!is_throttleable_io) {
		if (!is_passive_io){
			microuptime(&info->last_normal_IO_timestamp);
		}
	} else if (ut) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (ut->uu_lowpri_window == 0)
			throttle_info_set_initial_window(ut, info, isssd, FALSE);
		else {
			/* The thread sends I/Os to different devices within the same system call */
			if (ut->uu_throttle_info != info) {
				struct _throttle_io_info_t *old_info = ut->uu_throttle_info;

				// keep track of the numthreads in the right device
				OSDecrementAtomic(&old_info->numthreads_throttling);
				OSIncrementAtomic(&info->numthreads_throttling);

				DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
				DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
				/* This thread no longer needs a reference on that throttle info */
				throttle_info_rel(ut->uu_throttle_info);
				ut->uu_throttle_info = info;
				/* Need to take a reference on this throttle info */
				throttle_info_ref(ut->uu_throttle_info);
			}
			int numthreads = MAX(1, info->numthreads_throttling);
			ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
			if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
				ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;

			if (isssd == FALSE) {
				/*
				 * we're here because we've actually issued I/Os to different devices...
				 * if at least one of them was a non-SSD, then throttle the thread
				 * using the policy for non-SSDs
				 */
				ut->uu_throttle_isssd = FALSE;
			}
		}
	}
}

/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
	throttle_info_update_internal(throttle_info, flags, FALSE);
}

/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;
	/* for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info, flags);
}

extern int ignore_is_ssd;

int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	mount_t mp;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;

	policy = throttle_get_io_policy(&ut);

	if (policy == IOPOL_THROTTLE) {
		bp->b_flags |= B_THROTTLED_IO;
		bp->b_flags &= ~B_PASSIVE;
	} else if (policy == IOPOL_PASSIVE)
		bp->b_flags |= B_PASSIVE;

	bflags = bp->b_flags;

	if (kdebug_enable) {
		int    code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		if (bflags & B_THROTTLED_IO)
			code |= DKIO_THROTTLE;
		else if (bflags & B_PASSIVE)
			code |= DKIO_PASSIVE;

		KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
				      bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->last_IO_timestamp);
		if (mp) {
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
	typedef	int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

	if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
	}
	return (0);
}


/*
 * Block mapping is not applicable to special files; just return ENOTSUP.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}


/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	int do_rele = 0;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 1 (this is the very
		 * last close), we clear the session's tty fields
		 * and drop its reference on the vnode.
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 1) &&
			    (vp == sessp->s_ttyvp)) {

				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
					do_rele = 1;
				}
				session_unlock(sessp);

				if (do_rele) {
					vnode_rele(vp);
				}
			}
			session_rele(sessp);
		}

		devsw_lock(dev, S_IFCHR);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}
		/*
		 * close on last reference or on vnode revoke call
		 */
		if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
			devsw_unlock(dev, S_IFCHR);
			return (0);
		}

		error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		vp->v_specinfo->si_opencount--;

		if (vp->v_specinfo->si_opencount < 0) {
			panic("Negative open count?");
		}

		if (vcount(vp) > 0) {
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}

		error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return(EBADF);
	}

	return error;
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

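/*
 * Illustrative userspace sketch (an assumption, not part of xnu): the values
 * spec_pathconf() reports above are what pathconf(2) returns for a device
 * node.  "/dev/tty" below is just a convenient special file to query.
 */
#if 0	/* illustrative userspace example, not compiled as part of this file */
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	const char *path = "/dev/tty";	/* any special file served by spec_pathconf() */

	/* -1 with errno unchanged means "no limit"; see pathconf(2) */
	printf("_PC_PIPE_BUF  = %ld\n", pathconf(path, _PC_PIPE_BUF));
	printf("_PC_MAX_INPUT = %ld\n", pathconf(path, _PC_MAX_INPUT));
	printf("_PC_LINK_MAX  = %ld\n", pathconf(path, _PC_LINK_MAX));
	return 0;
}
#endif
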
/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{

	return (EBADF);
}

/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1; /* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1; /* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */

	return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1; /* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1; /* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */

	return (0);
}

static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_isfd		= 1,
	.f_attach	= filt_specattach,
	.f_detach	= filt_specdetach,
	.f_event	= filt_spec,
	.f_peek		= filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
		break;
	default:
		panic("filt_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}

static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) > nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}

static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink.  This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_hook = NULL;
	kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1; /* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here? Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	vnode_put(vp);
	return selres;
}
