1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <vfs/vfs_support.h>
86 #include <kern/assert.h>
87 #include <kern/task.h>
88
89 #include <sys/kdebug.h>
90
91 /* XXX following four prototypes should be in a header file somewhere */
92 extern dev_t chrtoblk(dev_t dev);
93 extern int iskmemdev(dev_t dev);
94 extern int bpfkqfilter(dev_t dev, struct knote *kn);
95 extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
96
97 struct vnode *speclisth[SPECHSZ];
98
99 /* symbolic sleep message strings for devices */
100 char devopn[] = "devopn";
101 char devio[] = "devio";
102 char devwait[] = "devwait";
103 char devin[] = "devin";
104 char devout[] = "devout";
105 char devioc[] = "devioc";
106 char devcls[] = "devcls";
107
108 #define VOPFUNC int (*)(void *)
109
110 int (**spec_vnodeop_p)(void *);
111 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
112 { &vnop_default_desc, (VOPFUNC)vn_default_error },
113 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
114 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */
115 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
116 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
117 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
118 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
119 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
120 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
121 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
122 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
123 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
124 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
125 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
126 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
127 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
128 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
129 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */
130 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
131 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
132 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
133 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
134 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
135 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
136 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
137 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
138 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
139 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
140 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
141 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
142 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
143 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
144 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
145 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
146 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
147 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
148 { (struct vnodeop_desc*)NULL, (int(*)())NULL }
149 };
150 struct vnodeopv_desc spec_vnodeop_opv_desc =
151 { &spec_vnodeop_p, spec_vnodeop_entries };
152
153
154 static void set_blocksize(vnode_t, dev_t);
155
156
157 struct _throttle_io_info_t {
158 struct timeval last_normal_IO_timestamp;
159 struct timeval last_IO_timestamp;
160 SInt32 numthreads_throttling;
161 SInt32 refcnt;
162 SInt32 alloc;
163 };
164
165 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
166
167 static void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd);
168
169
170
171 /*
172 * Trivial lookup routine that always fails.
173 */
174 int
175 spec_lookup(struct vnop_lookup_args *ap)
176 {
177
178 *ap->a_vpp = NULL;
179 return (ENOTDIR);
180 }
181
182 static void
183 set_blocksize(struct vnode *vp, dev_t dev)
184 {
185 int (*size)(dev_t);
186 int rsize;
187
188 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
189 rsize = (*size)(dev);
190 if (rsize <= 0) /* did size fail? */
191 vp->v_specsize = DEV_BSIZE;
192 else
193 vp->v_specsize = rsize;
194 }
195 else
196 vp->v_specsize = DEV_BSIZE;
197 }
198
199 void
200 set_fsblocksize(struct vnode *vp)
201 {
202
203 if (vp->v_type == VBLK) {
204 dev_t dev = (dev_t)vp->v_rdev;
205 int maj = major(dev);
206
207 if ((u_int)maj >= (u_int)nblkdev)
208 return;
209
210 vnode_lock(vp);
211 set_blocksize(vp, dev);
212 vnode_unlock(vp);
213 }
214
215 }
216
217
218 /*
219 * Open a special file.
220 */
221 int
222 spec_open(struct vnop_open_args *ap)
223 {
224 struct proc *p = vfs_context_proc(ap->a_context);
225 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
226 struct vnode *vp = ap->a_vp;
227 dev_t bdev, dev = (dev_t)vp->v_rdev;
228 int maj = major(dev);
229 int error;
230
231 /*
232 * Don't allow open if fs is mounted -nodev.
233 */
234 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
235 return (ENXIO);
236
237 switch (vp->v_type) {
238
239 case VCHR:
240 if ((u_int)maj >= (u_int)nchrdev)
241 return (ENXIO);
242 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
243 /*
244 * When running in very secure mode, do not allow
245 * opens for writing of any disk character devices.
246 */
247 if (securelevel >= 2 && isdisk(dev, VCHR))
248 return (EPERM);
249 /*
250 * When running in secure mode, do not allow opens
251 * for writing of /dev/mem, /dev/kmem, or character
252 * devices whose corresponding block devices are
253 * currently mounted.
254 */
255 if (securelevel >= 1) {
256 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
257 return (error);
258 if (iskmemdev(dev))
259 return (EPERM);
260 }
261 }
262 if (cdevsw[maj].d_type == D_TTY) {
263 vnode_lock(vp);
264 vp->v_flag |= VISTTY;
265 vnode_unlock(vp);
266 }
267
268 devsw_lock(dev, S_IFCHR);
269 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
270
271 if (error == 0) {
272 vp->v_specinfo->si_opencount++;
273 }
274
275 devsw_unlock(dev, S_IFCHR);
276
277 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
278 int isssd = 0;
279 uint64_t throttle_mask = 0;
280 uint32_t devbsdunit = 0;
281
282 if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
283
284 if (VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
285 /*
286 * as a reasonable approximation, only use the lowest bit of the mask
287 * to generate a disk unit number
288 */
289 devbsdunit = num_trailing_0(throttle_mask);
290
291 vnode_lock(vp);
292
293 vp->v_un.vu_specinfo->si_isssd = isssd;
294 vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
295 vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
296 vp->v_un.vu_specinfo->si_throttleable = 1;
297 vp->v_un.vu_specinfo->si_initted = 1;
298
299 vnode_unlock(vp);
300 }
301 }
302 if (vp->v_un.vu_specinfo->si_initted == 0) {
303 vnode_lock(vp);
304 vp->v_un.vu_specinfo->si_initted = 1;
305 vnode_unlock(vp);
306 }
307 }
308 return (error);
309
310 case VBLK:
311 if ((u_int)maj >= (u_int)nblkdev)
312 return (ENXIO);
313 /*
314 * When running in very secure mode, do not allow
315 * opens for writing of any disk block devices.
316 */
317 if (securelevel >= 2 && cred != FSCRED &&
318 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
319 return (EPERM);
320 /*
321 * Do not allow opens of block devices that are
322 * currently mounted.
323 */
324 if ( (error = vfs_mountedon(vp)) )
325 return (error);
326
327 devsw_lock(dev, S_IFBLK);
328 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
329 if (!error) {
330 vp->v_specinfo->si_opencount++;
331 }
332 devsw_unlock(dev, S_IFBLK);
333
334 if (!error) {
335 u_int64_t blkcnt;
336 u_int32_t blksize;
337 int setsize = 0;
338 u_int32_t size512 = 512;
339
340
341 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
342 /* Switch to 512 byte sectors (temporarily) */
343
344 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
345 /* Get the number of 512 byte physical blocks. */
346 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
347 setsize = 1;
348 }
349 }
350 /* If it doesn't set back, we can't recover */
351 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
352 error = ENXIO;
353 }
354
355
356 vnode_lock(vp);
357 set_blocksize(vp, dev);
358
359 /*
360 * Cache the size in bytes of the block device for later
361 * use by spec_write().
362 */
363 if (setsize)
364 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
365 else
366 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
367
368 vnode_unlock(vp);
369
370 }
371 return(error);
372 default:
373 panic("spec_open type");
374 }
375 return (0);
376 }
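
/*
 * Illustrative aside (not part of the kernel code path): the VBLK open
 * above derives the device size by temporarily switching the logical
 * block size to 512 bytes, reading the 512-byte block count, and caching
 * blkcnt * 512 in v_specdevsize for spec_write().  The sketch below,
 * guarded out of compilation, shows the same <sys/disk.h> ioctls issued
 * from user space; the device path "/dev/disk1" is hypothetical.
 */
#if 0	/* illustrative only -- never compiled */
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/disk1", O_RDONLY);		/* hypothetical device */
	uint32_t blksize = 0;
	uint64_t blkcnt = 0;

	if (fd < 0)
		return (1);
	if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == 0 &&
	    ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == 0)
		printf("device size: %llu bytes\n",
		    (unsigned long long)blkcnt * blksize);
	close(fd);
	return (0);
}
#endif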
377
378 /*
379 * Vnode op for read
380 */
381 int
382 spec_read(struct vnop_read_args *ap)
383 {
384 struct vnode *vp = ap->a_vp;
385 struct uio *uio = ap->a_uio;
386 struct buf *bp;
387 daddr64_t bn, nextbn;
388 long bsize, bscale;
389 int devBlockSize=0;
390 int n, on;
391 int error = 0;
392 dev_t dev;
393
394 #if DIAGNOSTIC
395 if (uio->uio_rw != UIO_READ)
396 panic("spec_read mode");
397 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
398 panic("spec_read proc");
399 #endif
400 if (uio_resid(uio) == 0)
401 return (0);
402
403 switch (vp->v_type) {
404
405 case VCHR:
406 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
407 struct _throttle_io_info_t *throttle_info;
408
409 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
410
411 throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);
412 }
413
414 error = (*cdevsw[major(vp->v_rdev)].d_read)
415 (vp->v_rdev, uio, ap->a_ioflag);
416
417 return (error);
418
419 case VBLK:
420 if (uio->uio_offset < 0)
421 return (EINVAL);
422
423 dev = vp->v_rdev;
424
425 devBlockSize = vp->v_specsize;
426
427 if (devBlockSize > PAGE_SIZE)
428 return (EINVAL);
429
430 bscale = PAGE_SIZE / devBlockSize;
431 bsize = bscale * devBlockSize;
432
433 do {
434 on = uio->uio_offset % bsize;
435
436 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
437
438 if (vp->v_speclastr + bscale == bn) {
439 nextbn = bn + bscale;
440 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
441 (int *)&bsize, 1, NOCRED, &bp);
442 } else
443 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
444
445 vnode_lock(vp);
446 vp->v_speclastr = bn;
447 vnode_unlock(vp);
448
449 n = bsize - buf_resid(bp);
450 if ((on > n) || error) {
451 if (!error)
452 error = EINVAL;
453 buf_brelse(bp);
454 return (error);
455 }
456 n = min((unsigned)(n - on), uio_resid(uio));
457
458 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
459 if (n + on == bsize)
460 buf_markaged(bp);
461 buf_brelse(bp);
462 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
463 return (error);
464
465 default:
466 panic("spec_read type");
467 }
468 /* NOTREACHED */
469
470 return (0);
471 }
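
/*
 * Worked example of the VBLK read math above, assuming a 512-byte device
 * block size and a 4096-byte PAGE_SIZE (both values illustrative):
 * bscale = 4096 / 512 = 8 and bsize = 4096.  For uio_offset = 5000,
 * on = 5000 % 4096 = 904 and bn = (5000 / 512) & ~7 = 8, so the loop
 * reads the 4 KB buffer covering device blocks 8..15 and copies out of
 * it starting 904 bytes in.  If the previous pass read the buffer at
 * block 0 (v_speclastr + bscale == bn), buf_breadn() also prefetches
 * the buffer starting at block 16.
 */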
472
473 /*
474 * Vnode op for write
475 */
476 int
477 spec_write(struct vnop_write_args *ap)
478 {
479 struct vnode *vp = ap->a_vp;
480 struct uio *uio = ap->a_uio;
481 struct buf *bp;
482 daddr64_t bn;
483 int bsize, blkmask, bscale;
484 int io_sync;
485 int devBlockSize=0;
486 int n, on;
487 int error = 0;
488 dev_t dev;
489
490 #if DIAGNOSTIC
491 if (uio->uio_rw != UIO_WRITE)
492 panic("spec_write mode");
493 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
494 panic("spec_write proc");
495 #endif
496
497 switch (vp->v_type) {
498
499 case VCHR:
500 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
501 struct _throttle_io_info_t *throttle_info;
502
503 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
504
505 throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);
506
507 microuptime(&throttle_info->last_IO_timestamp);
508 }
509
510 error = (*cdevsw[major(vp->v_rdev)].d_write)
511 (vp->v_rdev, uio, ap->a_ioflag);
512
513 return (error);
514
515 case VBLK:
516 if (uio_resid(uio) == 0)
517 return (0);
518 if (uio->uio_offset < 0)
519 return (EINVAL);
520
521 io_sync = (ap->a_ioflag & IO_SYNC);
522
523 dev = (vp->v_rdev);
524
525 devBlockSize = vp->v_specsize;
526 if (devBlockSize > PAGE_SIZE)
527 return(EINVAL);
528
529 bscale = PAGE_SIZE / devBlockSize;
530 blkmask = bscale - 1;
531 bsize = bscale * devBlockSize;
532
533
534 do {
535 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
536 on = uio->uio_offset % bsize;
537
538 n = min((unsigned)(bsize - on), uio_resid(uio));
539
540 /*
541 * Use buf_getblk() as an optimization IFF:
542 *
543 * 1) We are writing exactly a block on a block
544 * aligned boundary
545 * 2) We know the size of the device from spec_open
546 * 3) The write doesn't span the end of the device
547 *
548 * Otherwise, we fall back on buf_bread().
549 */
550 if (n == bsize &&
551 vp->v_specdevsize != (u_int64_t)0 &&
552 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
553 /* reduce the size of the write to what is there */
554 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
555 }
556
557 if (n == bsize)
558 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
559 else
560 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
561
562 /* Translate downstream error for upstream, if needed */
563 if (!error)
564 error = (int)buf_error(bp);
565 if (error) {
566 buf_brelse(bp);
567 return (error);
568 }
569 n = min(n, bsize - buf_resid(bp));
570
571 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
572 if (error) {
573 buf_brelse(bp);
574 return (error);
575 }
576 buf_markaged(bp);
577
578 if (io_sync)
579 error = buf_bwrite(bp);
580 else {
581 if ((n + on) == bsize)
582 error = buf_bawrite(bp);
583 else
584 error = buf_bdwrite(bp);
585 }
586 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
587 return (error);
588
589 default:
590 panic("spec_write type");
591 }
592 /* NOTREACHED */
593
594 return (0);
595 }
596
597 /*
598 * Device ioctl operation.
599 */
600 int
601 spec_ioctl(struct vnop_ioctl_args *ap)
602 {
603 proc_t p = vfs_context_proc(ap->a_context);
604 dev_t dev = ap->a_vp->v_rdev;
605 int retval = 0;
606
607 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
608 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
609
610 switch (ap->a_vp->v_type) {
611
612 case VCHR:
613 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
614 ap->a_fflag, p);
615 break;
616
617 case VBLK:
618 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
619 ap->a_fflag, p);
620 break;
621
622 default:
623 panic("spec_ioctl");
624 /* NOTREACHED */
625 }
626 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
627 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
628
629 return (retval);
630 }
631
632 int
633 spec_select(struct vnop_select_args *ap)
634 {
635 proc_t p = vfs_context_proc(ap->a_context);
636 dev_t dev;
637
638 switch (ap->a_vp->v_type) {
639
640 default:
641 return (1); /* XXX */
642
643 case VCHR:
644 dev = ap->a_vp->v_rdev;
645 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
646 }
647 }
648
649 static int filt_specattach(struct knote *kn);
650
651 int
652 spec_kqfilter(vnode_t vp, struct knote *kn)
653 {
654 dev_t dev;
655 int err = EINVAL;
656
657 /*
658 * For a few special kinds of devices, we can attach knotes.
659 * Each filter function must check whether the dev type matches it.
660 */
661 dev = vnode_specrdev(vp);
662
663 if (vnode_istty(vp)) {
664 /* We can hook into TTYs... */
665 err = filt_specattach(kn);
666 } else {
667 /* Try a bpf device, as defined in bsd/net/bpf.c */
668 err = bpfkqfilter(dev, kn);
669 }
670
671 return err;
672 }
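
/*
 * Illustrative aside: the knote attached by spec_kqfilter() is what a
 * user-space kevent() registration on a TTY character device ends up
 * exercising.  A minimal sketch follows, guarded out of compilation;
 * the "/dev/ttys000" path is hypothetical.
 */
#if 0	/* illustrative only -- never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/ttys000", O_RDONLY | O_NONBLOCK);	/* hypothetical tty */
	int kq = kqueue();
	struct kevent ev, out;

	if (fd < 0 || kq < 0)
		return (1);

	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)		/* register */
		return (1);
	if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)		/* wait for input */
		printf("%ld bytes readable\n", (long)out.data);

	close(kq);
	close(fd);
	return (0);
}
#endif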
673
674 /*
675 * Synch buffers associated with a block device
676 */
677 int
678 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
679 {
680 if (vp->v_type == VCHR)
681 return (0);
682 /*
683 * Flush all dirty buffers associated with a block device.
684 */
685 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
686
687 return (0);
688 }
689
690 int
691 spec_fsync(struct vnop_fsync_args *ap)
692 {
693 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
694 }
695
696 /*
697 * Just call the device strategy routine
698 */
699 extern int hard_throttle_on_root;
700 void IOSleep(int);
701
702 // a low-priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
703 #define LOWPRI_INITIAL_WINDOW_MSECS 100
704 #define LOWPRI_WINDOW_MSECS_INC 50
705 #define LOWPRI_MAX_WINDOW_MSECS 200
706 #define LOWPRI_MAX_WAITING_MSECS 200
707
708 #if CONFIG_EMBEDDED
709 #define LOWPRI_SLEEP_INTERVAL 5
710 #else
711 #define LOWPRI_SLEEP_INTERVAL 2
712 #endif
713
714 int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
715 int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
716 int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
717 int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
718
719 #if 0
720 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
721 do { \
722 if ((debug_info)->alloc) \
723 printf("%s: "format, __FUNCTION__, ## args); \
724 } while(0)
725
726 #else
727 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
728 #endif
729
730 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
731 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_WINDOW_MSECS_INC, "");
732 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_MAX_WINDOW_MSECS, "");
733 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_MAX_WAITING_MSECS, "");
734
735 /*
736 * throttled I/O helper function
737 * convert the index of the lowest set bit to a device index
738 */
739 int
740 num_trailing_0(uint64_t n)
741 {
742 /*
743 * since in most cases the number of trailing 0s is very small,
744 * we simply count sequentially from the lowest bit
745 */
746 if (n == 0)
747 return sizeof(n) * 8;
748 int count = 0;
749 while (!ISSET(n, 1)) {
750 n >>= 1;
751 ++count;
752 }
753 return count;
754 }
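
/*
 * Example: a throttle_mask of 0x10 (only bit 4 set) yields 4, which is
 * then used as the index into _throttle_io_info[].  For nonzero input
 * this is equivalent to a count-trailing-zeros operation; the loop is
 * kept simple because the masks involved are small.
 */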
755
756 /*
757 * Release the reference and if the item was allocated and this is the last
758 * reference then free it.
759 *
760 * This routine always returns the old value.
761 */
762 static int
763 throttle_info_rel(struct _throttle_io_info_t *info)
764 {
765 SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
766
767 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
768 info, (int)(oldValue -1), info );
769
770 /* The reference count just went negative, very bad */
771 if (oldValue == 0)
772 panic("throttle info ref cnt went negative!");
773
774 /*
775 * Once reference count is zero, no one else should be able to take a
776 * reference
777 */
778 if ((info->refcnt == 0) && (info->alloc)) {
779 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
780 FREE(info, M_TEMP);
781 }
782 return oldValue;
783 }
784
785 /*
786 * Just take a reference on the throttle info structure.
787 *
788 * This routine always returns the old value.
789 */
790 static SInt32
791 throttle_info_ref(struct _throttle_io_info_t *info)
792 {
793 SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
794
795 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
796 info, (int)(oldValue -1), info );
797 /* Allocated items should never have a reference of zero */
798 if (info->alloc && (oldValue == 0))
799 panic("Taking a reference without calling create throttle info!\n");
800
801 return oldValue;
802 }
803
804 /*
805 * KPI routine
806 *
807 * Create and take a reference on a throttle info structure and return a
808 * pointer for the file system to use when calling throttle_info_update.
809 * Calling file system must have a matching release for every create.
810 */
811 void *
812 throttle_info_create(void)
813 {
814 struct _throttle_io_info_t *info;
815
816 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
817 /* Should never happen but just in case */
818 if (info == NULL)
819 return NULL;
820 /* Mark that this one was allocated and needs to be freed */
821 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
822 info->alloc = TRUE;
823 /* Take a reference */
824 OSIncrementAtomic(&info->refcnt);
825 return info;
826 }
827
828 /*
829 * KPI routine
830 *
831 * Release the throttle info pointer if all the references are gone. Should be
832 * called to release the reference taken by throttle_info_create
833 */
834 void
835 throttle_info_release(void *throttle_info)
836 {
837 DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
838 (struct _throttle_io_info_t *)throttle_info,
839 (struct _throttle_io_info_t *)throttle_info);
840 if (throttle_info) /* Just to be careful */
841 throttle_info_rel(throttle_info);
842 }
843
844 /*
845 * KPI routine
846 *
847 * File Systems that create an info structure need to call this routine in
848 * their mount routine (used by cluster code). File Systems that call this in
849 * their mount routines must call throttle_info_mount_rel in their unmount
850 * routines.
851 */
852 void
853 throttle_info_mount_ref(mount_t mp, void *throttle_info)
854 {
855 if ((throttle_info == NULL) || (mp == NULL))
856 return;
857 throttle_info_ref(throttle_info);
858 /* If we already have a reference, release it before adding the new one */
859 if (mp->mnt_throttle_info)
860 throttle_info_rel(mp->mnt_throttle_info);
861 mp->mnt_throttle_info = throttle_info;
862 }
863
864 /*
865 * Private KPI routine
866 *
867 * return a handle for accessing throttle_info given a throttle_mask. The
868 * handle must be released by throttle_info_rel_by_mask
869 */
870 int
871 throttle_info_ref_by_mask(uint64_t throttle_mask,
872 throttle_info_handle_t *throttle_info_handle)
873 {
874 int dev_index;
875 struct _throttle_io_info_t *info;
876
877 if (throttle_info_handle == NULL)
878 return EINVAL;
879
880 dev_index = num_trailing_0(throttle_mask);
881 info = &_throttle_io_info[dev_index];
882 throttle_info_ref(info);
883 *(struct _throttle_io_info_t**)throttle_info_handle = info;
884 return 0;
885 }
886
887 /*
888 * Private KPI routine
889 *
890 * release the handle obtained by throttle_info_ref_by_mask
891 */
892 void
893 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
894 {
895 /* for now the handle is just a pointer to _throttle_io_info_t */
896 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
897 }
898
899 /*
900 * KPI routine
901 *
902 * File Systems that called throttle_info_mount_ref must call this routine in
903 * their unmount routine.
904 */
905 void
906 throttle_info_mount_rel(mount_t mp)
907 {
908 if (mp->mnt_throttle_info)
909 throttle_info_rel(mp->mnt_throttle_info);
910 mp->mnt_throttle_info = NULL;
911 }
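
/*
 * Illustrative aside: a sketch (function and variable names invented) of
 * how a file system would pair the throttle-info KPI calls above --
 * create + mount_ref at mount time, mount_rel + release at unmount time.
 * Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
examplefs_mount(mount_t mp, void **tip)
{
	void *ti = throttle_info_create();

	if (ti == NULL)
		return (ENOMEM);
	throttle_info_mount_ref(mp, ti);	/* mount now holds a reference */
	*tip = ti;				/* stash for unmount */
	return (0);
}

static int
examplefs_unmount(mount_t mp, void *ti)
{
	throttle_info_mount_rel(mp);		/* drop the mount's reference */
	throttle_info_release(ti);		/* drop the create reference */
	return (0);
}
#endif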
912
913 void
914 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
915 {
916 struct _throttle_io_info_t *info;
917
918 if (mp == NULL)
919 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
920 else if (mp->mnt_throttle_info == NULL)
921 info = &_throttle_io_info[mp->mnt_devbsdunit];
922 else
923 info = mp->mnt_throttle_info;
924
925 *tv = info->last_IO_timestamp;
926 }
927
928 void
929 update_last_io_time(mount_t mp)
930 {
931 struct _throttle_io_info_t *info;
932
933 if (mp == NULL)
934 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
935 else if (mp->mnt_throttle_info == NULL)
936 info = &_throttle_io_info[mp->mnt_devbsdunit];
937 else
938 info = mp->mnt_throttle_info;
939
940 microuptime(&info->last_IO_timestamp);
941 }
942
943
944 #if CONFIG_EMBEDDED
945
946 int throttle_get_io_policy(struct uthread **ut)
947 {
948 int policy = IOPOL_DEFAULT;
949 proc_t p = current_proc();
950
951 *ut = get_bsdthread_info(current_thread());
952
953 if (p != NULL)
954 policy = p->p_iopol_disk;
955
956 if (*ut != NULL) {
957 // the I/O policy of the thread overrides that of the process
958 // unless the I/O policy of the thread is default
959 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
960 policy = (*ut)->uu_iopol_disk;
961 }
962 return policy;
963 }
964 #else
965
966 int throttle_get_io_policy(__unused struct uthread **ut)
967 {
968 *ut = get_bsdthread_info(current_thread());
969
970 return (proc_get_task_selfdiskacc());
971 }
972 #endif
973
974
975 static int
976 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
977 {
978 struct _throttle_io_info_t *info = throttle_info;
979 struct timeval elapsed;
980 int elapsed_msecs;
981 int policy;
982 struct uthread *ut;
983
984 policy = throttle_get_io_policy(&ut);
985
986 if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
987 return (0);
988
989 microuptime(&elapsed);
990 timevalsub(&elapsed, &info->last_normal_IO_timestamp);
991 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
992
993 if (lowpri_window_msecs == -1) // use the max waiting time
994 lowpri_window_msecs = lowpri_max_waiting_msecs;
995
996 return elapsed_msecs < lowpri_window_msecs;
997 }
998
999 /*
1000 * If we have a mount point and it has a throttle info pointer then
1001 * use it to do the check, otherwise use the device unit number to find
1002 * the correct throttle info array element.
1003 */
1004 int
1005 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
1006 {
1007 void *info;
1008
1009 /* Should we just return zero if there is no mount point? */
1010 if (mp == NULL)
1011 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1012 else if (mp->mnt_throttle_info == NULL)
1013 info = &_throttle_io_info[mp->mnt_devbsdunit];
1014 else
1015 info = mp->mnt_throttle_info;
1016 return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
1017 }
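
/*
 * Worked example: with lowpri_window_msecs = 100 and the last
 * normal-priority I/O recorded 40 ms ago, elapsed_msecs (40) is below
 * the window and the caller will be throttled; once 100 ms pass with no
 * normal-priority I/O to that device, the check returns 0 and the
 * low-priority thread is allowed to proceed.
 */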
1018
1019 uint32_t
1020 throttle_lowpri_io(int sleep_amount)
1021 {
1022 int sleep_cnt = 0;
1023 int numthreads_throttling;
1024 int max_try_num;
1025 struct uthread *ut;
1026 struct _throttle_io_info_t *info;
1027 int max_waiting_msecs;
1028
1029 ut = get_bsdthread_info(current_thread());
1030
1031 if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
1032 goto done;
1033
1034 info = ut->uu_throttle_info;
1035
1036 if (sleep_amount != 0) {
1037 #if CONFIG_EMBEDDED
1038 max_waiting_msecs = lowpri_max_waiting_msecs;
1039 #else
1040 if (ut->uu_throttle_isssd == TRUE)
1041 max_waiting_msecs = lowpri_max_waiting_msecs / 100;
1042 else
1043 max_waiting_msecs = lowpri_max_waiting_msecs;
1044 #endif
1045 if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
1046 max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;
1047
1048 numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
1049 max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);
1050
1051 for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
1052 if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
1053 if (sleep_cnt == 0) {
1054 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
1055 ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
1056 }
1057 IOSleep(LOWPRI_SLEEP_INTERVAL);
1058 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
1059 } else {
1060 break;
1061 }
1062 }
1063 if (sleep_cnt) {
1064 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1065 ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
1066 }
1067 }
1068 SInt32 oldValue;
1069 oldValue = OSDecrementAtomic(&info->numthreads_throttling);
1070
1071 if (oldValue <= 0) {
1072 panic("%s: numthreads negative", __func__);
1073 }
1074 done:
1075 ut->uu_lowpri_window = 0;
1076 if (ut->uu_throttle_info)
1077 throttle_info_rel(ut->uu_throttle_info);
1078 ut->uu_throttle_info = NULL;
1079 ut->uu_throttle_bc = FALSE;
1080
1081 return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
1082 }
1083
1084 /*
1085 * KPI routine
1086 *
1087 * set a kernel thread's IO policy. policy can be:
1088 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
1089 *
1090 * explanations about these policies are in the man page of setiopolicy_np
1091 */
1092 void throttle_set_thread_io_policy(int policy)
1093 {
1094 #if !CONFIG_EMBEDDED
1095 proc_apply_thread_selfdiskacc(policy);
1096 #else /* !CONFIG_EMBEDDED */
1097 struct uthread *ut;
1098 ut = get_bsdthread_info(current_thread());
1099 ut->uu_iopol_disk = policy;
1100 #endif /* !CONFIG_EMBEDDED */
1101 }
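
/*
 * Illustrative aside: a sketch (function name invented) of a kernel
 * thread demoting its own I/O with the KPI above while doing background
 * work, then restoring the default policy.  Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static void
example_background_scan(void)
{
	throttle_set_thread_io_policy(IOPOL_THROTTLE);

	/* ... issue low-priority reads/writes here ... */

	throttle_set_thread_io_policy(IOPOL_NORMAL);
}
#endif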
1102
1103
1104 static
1105 void throttle_info_reset_window(struct uthread *ut)
1106 {
1107 struct _throttle_io_info_t *info;
1108
1109 info = ut->uu_throttle_info;
1110
1111 OSDecrementAtomic(&info->numthreads_throttling);
1112 throttle_info_rel(info);
1113 ut->uu_throttle_info = NULL;
1114 ut->uu_lowpri_window = 0;
1115 }
1116
1117 static
1118 void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
1119 {
1120 SInt32 oldValue;
1121
1122 ut->uu_throttle_info = info;
1123 throttle_info_ref(info);
1124 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1125
1126 oldValue = OSIncrementAtomic(&info->numthreads_throttling);
1127 if (oldValue < 0) {
1128 panic("%s: numthreads negative", __func__);
1129 }
1130 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
1131 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
1132 ut->uu_throttle_isssd = isssd;
1133 ut->uu_throttle_bc = BC_throttle;
1134 }
1135
1136
1137 static
1138 void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
1139 {
1140 struct _throttle_io_info_t *info = throttle_info;
1141 struct uthread *ut;
1142 int policy;
1143 int is_throttleable_io = 0;
1144 int is_passive_io = 0;
1145
1146 if (!lowpri_IO_initial_window_msecs || (info == NULL))
1147 return;
1148 policy = throttle_get_io_policy(&ut);
1149
1150 switch (policy) {
1151 case IOPOL_DEFAULT:
1152 case IOPOL_NORMAL:
1153 break;
1154 case IOPOL_THROTTLE:
1155 is_throttleable_io = 1;
1156 break;
1157 case IOPOL_PASSIVE:
1158 is_passive_io = 1;
1159 break;
1160 default:
1161 printf("unknown I/O policy %d", policy);
1162 break;
1163 }
1164
1165 if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
1166 is_passive_io |= 1;
1167
1168 if (!is_throttleable_io) {
1169 if (!is_passive_io){
1170 microuptime(&info->last_normal_IO_timestamp);
1171 }
1172 } else if (ut) {
1173 /*
1174 * I'd really like to do the IOSleep here, but
1175 * we may be holding all kinds of filesystem related locks
1176 * and the pages for this I/O marked 'busy'...
1177 * we don't want to cause a normal task to block on
1178 * one of these locks while we're throttling a task marked
1179 * for low priority I/O... we'll mark the uthread and
1180 * do the delay just before we return from the system
1181 * call that triggered this I/O or from vnode_pagein
1182 */
1183 if (ut->uu_lowpri_window == 0)
1184 throttle_info_set_initial_window(ut, info, isssd, FALSE);
1185 else {
1186 /* The thread sends I/Os to different devices within the same system call */
1187 if (ut->uu_throttle_info != info) {
1188 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
1189
1190 // keep track of the numthreads in the right device
1191 OSDecrementAtomic(&old_info->numthreads_throttling);
1192 OSIncrementAtomic(&info->numthreads_throttling);
1193
1194 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
1195 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
1196 /* This thread no longer needs a reference on that throttle info */
1197 throttle_info_rel(ut->uu_throttle_info);
1198 ut->uu_throttle_info = info;
1199 /* Need to take a reference on this throttle info */
1200 throttle_info_ref(ut->uu_throttle_info);
1201 }
1202 int numthreads = MAX(1, info->numthreads_throttling);
1203 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
1204 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
1205 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
1206
1207 if (isssd == FALSE) {
1208 /*
1209 * we're here because we've actually issued I/Os to different devices...
1210 * if at least one of them was a non-SSD, then throttle the thread
1211 * using the policy for non SSDs
1212 */
1213 ut->uu_throttle_isssd = FALSE;
1214 }
1215 }
1216 }
1217 }
1218
1219 /*
1220 * KPI routine
1221 *
1222 * this is usually called before every I/O, used for throttled I/O
1223 * book keeping. This routine has low overhead and does not sleep
1224 */
1225 void throttle_info_update(void *throttle_info, int flags)
1226 {
1227 throttle_info_update_internal(throttle_info, flags, FALSE);
1228 }
1229
1230 /*
1231 * KPI routine
1232 *
1233 * this is usually called before every I/O, used for throttled I/O
1234 * book keeping. This routine has low overhead and does not sleep
1235 */
1236 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
1237 {
1238 void *throttle_info = throttle_info_handle;
1239 /* for now we only use the lowest bit of the throttle mask, so the
1240 * handle is the same as the throttle_info. Later if we store a
1241 * set of throttle infos in the handle, we will want to loop through
1242 * them and call throttle_info_update in a loop
1243 */
1244 throttle_info_update(throttle_info, flags);
1245 }
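
/*
 * Illustrative aside: a sketch (names invented) of the per-I/O pattern
 * for a caller that only has a throttle mask, e.g. one obtained via
 * DKIOCGETTHROTTLEMASK: take a handle, do the bookkeeping before the
 * I/O, then drop the handle.  Any actual delay is applied later by
 * throttle_lowpri_io() as the thread leaves the system call.  Guarded
 * out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
example_issue_io(uint64_t throttle_mask, int bflags)
{
	throttle_info_handle_t handle;
	int error;

	error = throttle_info_ref_by_mask(throttle_mask, &handle);
	if (error)
		return (error);

	throttle_info_update_by_mask((void *)handle, bflags);
	/* ... issue the I/O here ... */

	throttle_info_rel_by_mask(handle);
	return (0);
}
#endif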
1246
1247 extern int ignore_is_ssd;
1248
1249 int
1250 spec_strategy(struct vnop_strategy_args *ap)
1251 {
1252 buf_t bp;
1253 int bflags;
1254 int policy;
1255 dev_t bdev;
1256 uthread_t ut;
1257 mount_t mp;
1258 int strategy_ret;
1259 struct _throttle_io_info_t *throttle_info;
1260 boolean_t isssd = FALSE;
1261
1262 bp = ap->a_bp;
1263 bdev = buf_device(bp);
1264 mp = buf_vnode(bp)->v_mount;
1265
1266 policy = throttle_get_io_policy(&ut);
1267
1268 if (policy == IOPOL_THROTTLE) {
1269 bp->b_flags |= B_THROTTLED_IO;
1270 bp->b_attr.ba_flags |= BA_THROTTLED_IO;
1271 bp->b_flags &= ~B_PASSIVE;
1272 } else if (policy == IOPOL_PASSIVE)
1273 bp->b_flags |= B_PASSIVE;
1274
1275 bflags = bp->b_flags;
1276
1277 if (kdebug_enable) {
1278 int code = 0;
1279
1280 if (bflags & B_READ)
1281 code |= DKIO_READ;
1282 if (bflags & B_ASYNC)
1283 code |= DKIO_ASYNC;
1284
1285 if (bflags & B_META)
1286 code |= DKIO_META;
1287 else if (bflags & B_PAGEIO)
1288 code |= DKIO_PAGING;
1289
1290 if (bflags & B_THROTTLED_IO)
1291 code |= DKIO_THROTTLE;
1292 else if (bflags & B_PASSIVE)
1293 code |= DKIO_PASSIVE;
1294
1295 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1296 bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
1297 }
1298 if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
1299 mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
1300 hard_throttle_on_root = 1;
1301
1302 if (mp != NULL) {
1303 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
1304 isssd = TRUE;
1305 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
1306 } else
1307 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1308
1309 throttle_info_update_internal(throttle_info, bflags, isssd);
1310
1311 if ((bflags & B_READ) == 0) {
1312 microuptime(&throttle_info->last_IO_timestamp);
1313 if (mp) {
1314 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
1315 }
1316 } else if (mp) {
1317 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
1318 }
1319 /*
1320 * The BootCache may give us special information about
1321 * the IO, so it returns special values that we check
1322 * for here.
1323 *
1324 * IO_SATISFIED_BY_CACHE
1325 * The read has been satisfied by the boot cache. Don't
1326 * throttle the thread unnecessarily.
1327 *
1328 * IO_SHOULD_BE_THROTTLED
1329 * The boot cache is playing back a playlist and this IO
1330 * cut through. Throttle it so we're not cutting through
1331 * the boot cache too often.
1332 *
1333 * Note that typical strategy routines are defined with
1334 * a void return so we'll get garbage here. In the
1335 * unlikely case the garbage matches our special return
1336 * value, it's not a big deal since we're only adjusting
1337 * the throttling delay.
1338 */
1339 #define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
1340 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
1341 typedef int strategy_fcn_ret_t(struct buf *bp);
1342
1343 strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
1344
1345 if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
1346 /*
1347 * If this was a throttled IO satisfied by the boot cache,
1348 * don't delay the thread.
1349 */
1350 throttle_info_reset_window(ut);
1351
1352 } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
1353 /*
1354 * If the boot cache indicates this IO should be throttled,
1355 * delay the thread.
1356 */
1357 throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
1358 }
1359 return (0);
1360 }
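
/*
 * Illustrative aside: a sketch (helper names invented) of a hypothetical
 * caching driver's strategy routine that participates in the BootCache
 * return convention checked above.  Ordinary strategy routines are
 * declared void and take no part in it.  Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
example_cache_strategy(struct buf *bp)
{
	if (example_cache_hit(bp)) {			/* invented helper */
		example_complete_from_cache(bp);	/* invented helper */
		return (IO_SATISFIED_BY_CACHE);
	}
	if (example_playback_active()) {		/* invented helper */
		example_pass_to_disk(bp);		/* invented helper */
		return (IO_SHOULD_BE_THROTTLED);
	}
	example_pass_to_disk(bp);			/* invented helper */
	return (0);
}
#endif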
1361
1362
1363 /*
1364 * Block map is not supported for special files; return ENOTSUP.
1365 */
1366 int
1367 spec_blockmap(__unused struct vnop_blockmap_args *ap)
1368 {
1369 return (ENOTSUP);
1370 }
1371
1372
1373 /*
1374 * Device close routine
1375 */
1376 int
1377 spec_close(struct vnop_close_args *ap)
1378 {
1379 struct vnode *vp = ap->a_vp;
1380 dev_t dev = vp->v_rdev;
1381 int error = 0;
1382 int flags = ap->a_fflag;
1383 struct proc *p = vfs_context_proc(ap->a_context);
1384 struct session *sessp;
1385 int do_rele = 0;
1386
1387 switch (vp->v_type) {
1388
1389 case VCHR:
1390 /*
1391 * Hack: a tty device that is a controlling terminal
1392 * has a reference from the session structure.
1393 * We cannot easily tell that a character device is
1394 * a controlling terminal, unless it is the closing
1395 * process' controlling terminal. In that case,
1396 * if the reference count is 1 (this is the very
1397 * last close), we clear the session's terminal fields and drop its
1398 * vnode reference.
1399 */
1399 sessp = proc_session(p);
1400 if (sessp != SESSION_NULL) {
1401 if ((vcount(vp) == 1) &&
1402 (vp == sessp->s_ttyvp)) {
1403
1404 session_lock(sessp);
1405 if (vp == sessp->s_ttyvp) {
1406 sessp->s_ttyvp = NULL;
1407 sessp->s_ttyvid = 0;
1408 sessp->s_ttyp = TTY_NULL;
1409 sessp->s_ttypgrpid = NO_PID;
1410 do_rele = 1;
1411 }
1412 session_unlock(sessp);
1413
1414 if (do_rele) {
1415 vnode_rele(vp);
1416 }
1417 }
1418 session_rele(sessp);
1419 }
1420
1421 devsw_lock(dev, S_IFCHR);
1422
1423 vp->v_specinfo->si_opencount--;
1424
1425 if (vp->v_specinfo->si_opencount < 0) {
1426 panic("Negative open count?");
1427 }
1428 /*
1429 * close on last reference or on vnode revoke call
1430 */
1431 if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
1432 devsw_unlock(dev, S_IFCHR);
1433 return (0);
1434 }
1435
1436 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
1437
1438 devsw_unlock(dev, S_IFCHR);
1439 break;
1440
1441 case VBLK:
1442 /*
1443 * If there is more than one outstanding open, don't
1444 * send the close to the device.
1445 */
1446 devsw_lock(dev, S_IFBLK);
1447 if (vcount(vp) > 1) {
1448 vp->v_specinfo->si_opencount--;
1449 devsw_unlock(dev, S_IFBLK);
1450 return (0);
1451 }
1452 devsw_unlock(dev, S_IFBLK);
1453
1454 /*
1455 * On last close of a block device (that isn't mounted)
1456 * we must invalidate any in core blocks, so that
1457 * we can, for instance, change floppy disks.
1458 */
1459 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
1460 return (error);
1461
1462 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1463 if (error)
1464 return (error);
1465
1466 devsw_lock(dev, S_IFBLK);
1467
1468 vp->v_specinfo->si_opencount--;
1469
1470 if (vp->v_specinfo->si_opencount < 0) {
1471 panic("Negative open count?");
1472 }
1473
1474 if (vcount(vp) > 0) {
1475 devsw_unlock(dev, S_IFBLK);
1476 return (0);
1477 }
1478
1479 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
1480
1481 devsw_unlock(dev, S_IFBLK);
1482 break;
1483
1484 default:
1485 panic("spec_close: not special");
1486 return(EBADF);
1487 }
1488
1489 return error;
1490 }
1491
1492 /*
1493 * Return POSIX pathconf information applicable to special devices.
1494 */
1495 int
1496 spec_pathconf(struct vnop_pathconf_args *ap)
1497 {
1498
1499 switch (ap->a_name) {
1500 case _PC_LINK_MAX:
1501 *ap->a_retval = LINK_MAX;
1502 return (0);
1503 case _PC_MAX_CANON:
1504 *ap->a_retval = MAX_CANON;
1505 return (0);
1506 case _PC_MAX_INPUT:
1507 *ap->a_retval = MAX_INPUT;
1508 return (0);
1509 case _PC_PIPE_BUF:
1510 *ap->a_retval = PIPE_BUF;
1511 return (0);
1512 case _PC_CHOWN_RESTRICTED:
1513 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
1514 return (0);
1515 case _PC_VDISABLE:
1516 *ap->a_retval = _POSIX_VDISABLE;
1517 return (0);
1518 default:
1519 return (EINVAL);
1520 }
1521 /* NOTREACHED */
1522 }
1523
1524 /*
1525 * Special device failed operation
1526 */
1527 int
1528 spec_ebadf(__unused void *dummy)
1529 {
1530
1531 return (EBADF);
1532 }
1533
1534 /* Blktooff derives file offset from logical block number */
1535 int
1536 spec_blktooff(struct vnop_blktooff_args *ap)
1537 {
1538 struct vnode *vp = ap->a_vp;
1539
1540 switch (vp->v_type) {
1541 case VCHR:
1542 *ap->a_offset = (off_t)-1; /* failure */
1543 return (ENOTSUP);
1544
1545 case VBLK:
1546 printf("spec_blktooff: not implemented for VBLK\n");
1547 *ap->a_offset = (off_t)-1; /* failure */
1548 return (ENOTSUP);
1549
1550 default:
1551 panic("spec_blktooff type");
1552 }
1553 /* NOTREACHED */
1554
1555 return (0);
1556 }
1557
1558 /* Offtoblk derives logical block number from file offset */
1559 int
1560 spec_offtoblk(struct vnop_offtoblk_args *ap)
1561 {
1562 struct vnode *vp = ap->a_vp;
1563
1564 switch (vp->v_type) {
1565 case VCHR:
1566 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1567 return (ENOTSUP);
1568
1569 case VBLK:
1570 printf("spec_offtoblk: not implemented for VBLK\n");
1571 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1572 return (ENOTSUP);
1573
1574 default:
1575 panic("spec_offtoblk type");
1576 }
1577 /* NOTREACHED */
1578
1579 return (0);
1580 }
1581
1582 static void filt_specdetach(struct knote *kn);
1583 static int filt_spec(struct knote *kn, long hint);
1584 static unsigned filt_specpeek(struct knote *kn);
1585
1586 struct filterops spec_filtops = {
1587 .f_isfd = 1,
1588 .f_attach = filt_specattach,
1589 .f_detach = filt_specdetach,
1590 .f_event = filt_spec,
1591 .f_peek = filt_specpeek
1592 };
1593
1594 static int
1595 filter_to_seltype(int16_t filter)
1596 {
1597 switch (filter) {
1598 case EVFILT_READ:
1599 return FREAD;
1600 case EVFILT_WRITE:
1601 return FWRITE;
1602 break;
1603 default:
1604 panic("filter_to_seltype(): invalid filter %d\n", filter);
1605 return 0;
1606 }
1607 }
1608
1609 static int
1610 filt_specattach(struct knote *kn)
1611 {
1612 vnode_t vp;
1613 dev_t dev;
1614
1615 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
1616
1617 assert(vnode_ischr(vp));
1618
1619 dev = vnode_specrdev(vp);
1620
1621 if (major(dev) >= nchrdev) {
1622 return ENXIO;
1623 }
1624
1625 if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
1626 return EINVAL;
1627 }
1628
1629 /* Resulting wql is safe to unlink even if it has never been linked */
1630 kn->kn_hook = wait_queue_link_allocate();
1631 if (kn->kn_hook == NULL) {
1632 return EAGAIN;
1633 }
1634
1635 kn->kn_fop = &spec_filtops;
1636 kn->kn_hookid = vnode_vid(vp);
1637
1638 knote_markstayqueued(kn);
1639
1640 return 0;
1641 }
1642
1643 static void
1644 filt_specdetach(struct knote *kn)
1645 {
1646 kern_return_t ret;
1647
1648 /*
1649 * Given wait queue link and wait queue set, unlink. This is subtle.
1650 * If the device has been revoked from under us, selclearthread() will
1651 * have removed our link from the kqueue's wait queue set, which
1652 * wait_queue_set_unlink_one() will detect and handle.
1653 */
1654 ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
1655 if (ret != KERN_SUCCESS) {
1656 panic("filt_specdetach(): failed to unlink wait queue link.");
1657 }
1658
1659 (void)wait_queue_link_free(kn->kn_hook);
1660 kn->kn_hook = NULL;
1661 kn->kn_status &= ~KN_STAYQUEUED;
1662 }
1663
1664 static int
1665 filt_spec(struct knote *kn, long hint)
1666 {
1667 vnode_t vp;
1668 uthread_t uth;
1669 wait_queue_set_t old_wqs;
1670 vfs_context_t ctx;
1671 int selres;
1672 int error;
1673 int use_offset;
1674 dev_t dev;
1675 uint64_t flags;
1676
1677 assert(kn->kn_hook != NULL);
1678
1679 if (hint != 0) {
1680 panic("filt_spec(): nonzero hint?");
1681 }
1682
1683 uth = get_bsdthread_info(current_thread());
1684 ctx = vfs_context_current();
1685 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1686
1687 error = vnode_getwithvid(vp, kn->kn_hookid);
1688 if (error != 0) {
1689 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1690 return 1;
1691 }
1692
1693 dev = vnode_specrdev(vp);
1694 flags = cdevsw_flags[major(dev)];
1695 use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
1696 assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
1697
1698 /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
1699 old_wqs = uth->uu_wqset;
1700 uth->uu_wqset = kn->kn_kq->kq_wqs;
1701 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1702 uth->uu_wqset = old_wqs;
1703
1704 if (use_offset) {
1705 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
1706 kn->kn_data = 0;
1707 } else {
1708 kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
1709 }
1710 } else {
1711 kn->kn_data = selres;
1712 }
1713
1714 vnode_put(vp);
1715
1716 return (kn->kn_data != 0);
1717 }
1718
1719 static unsigned
1720 filt_specpeek(struct knote *kn)
1721 {
1722 vnode_t vp;
1723 uthread_t uth;
1724 wait_queue_set_t old_wqs;
1725 vfs_context_t ctx;
1726 int error, selres;
1727
1728 uth = get_bsdthread_info(current_thread());
1729 ctx = vfs_context_current();
1730 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1731
1732 error = vnode_getwithvid(vp, kn->kn_hookid);
1733 if (error != 0) {
1734 return 1; /* Just like VNOP_SELECT() on recycled vnode */
1735 }
1736
1737 /*
1738 * Why pass the link here? Because we may not have registered in the past...
1739 */
1740 old_wqs = uth->uu_wqset;
1741 uth->uu_wqset = kn->kn_kq->kq_wqs;
1742 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1743 uth->uu_wqset = old_wqs;
1744
1745 vnode_put(vp);
1746 return selres;
1747 }
1748