bsd/miscfs/specfs/spec_vnops.c
1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <vfs/vfs_support.h>
86 #include <kern/assert.h>
87 #include <kern/task.h>
88
89 #include <sys/kdebug.h>
90
91 /* XXX the following prototypes should be in a header file somewhere */
92 extern dev_t chrtoblk(dev_t dev);
93 extern int iskmemdev(dev_t dev);
94 extern int bpfkqfilter(dev_t dev, struct knote *kn);
95 extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
96
97 struct vnode *speclisth[SPECHSZ];
98
99 /* symbolic sleep message strings for devices */
100 char devopn[] = "devopn";
101 char devio[] = "devio";
102 char devwait[] = "devwait";
103 char devin[] = "devin";
104 char devout[] = "devout";
105 char devioc[] = "devioc";
106 char devcls[] = "devcls";
107
108 #define VOPFUNC int (*)(void *)
109
110 int (**spec_vnodeop_p)(void *);
111 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
112 { &vnop_default_desc, (VOPFUNC)vn_default_error },
113 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
114 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */
115 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
116 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
117 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
118 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
119 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
120 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
121 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
122 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
123 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
124 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
125 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
126 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
127 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
128 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
129 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */
130 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
131 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
132 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
133 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
134 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
135 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
136 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
137 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
138 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
139 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
140 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
141 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
142 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
143 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
144 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
145 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
146 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
147 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
148 { (struct vnodeop_desc*)NULL, (int(*)())NULL }
149 };
150 struct vnodeopv_desc spec_vnodeop_opv_desc =
151 { &spec_vnodeop_p, spec_vnodeop_entries };
152
153
154 static void set_blocksize(vnode_t, dev_t);
155
156
157 /*
158 * Trivial lookup routine that always fails.
159 */
160 int
161 spec_lookup(struct vnop_lookup_args *ap)
162 {
163
164 *ap->a_vpp = NULL;
165 return (ENOTDIR);
166 }
167
168 static void
169 set_blocksize(struct vnode *vp, dev_t dev)
170 {
171 int (*size)(dev_t);
172 int rsize;
173
174 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
175 rsize = (*size)(dev);
176 if (rsize <= 0) /* did size fail? */
177 vp->v_specsize = DEV_BSIZE;
178 else
179 vp->v_specsize = rsize;
180 }
181 else
182 vp->v_specsize = DEV_BSIZE;
183 }
184
185 void
186 set_fsblocksize(struct vnode *vp)
187 {
188
189 if (vp->v_type == VBLK) {
190 dev_t dev = (dev_t)vp->v_rdev;
191 int maj = major(dev);
192
193 if ((u_int)maj >= (u_int)nblkdev)
194 return;
195
196 vnode_lock(vp);
197 set_blocksize(vp, dev);
198 vnode_unlock(vp);
199 }
200
201 }
202
203
204 /*
205 * Open a special file.
206 */
207 int
208 spec_open(struct vnop_open_args *ap)
209 {
210 struct proc *p = vfs_context_proc(ap->a_context);
211 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
212 struct vnode *vp = ap->a_vp;
213 dev_t bdev, dev = (dev_t)vp->v_rdev;
214 int maj = major(dev);
215 int error;
216
217 /*
218 * Don't allow open if fs is mounted -nodev.
219 */
220 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
221 return (ENXIO);
222
223 switch (vp->v_type) {
224
225 case VCHR:
226 if ((u_int)maj >= (u_int)nchrdev)
227 return (ENXIO);
228 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
229 /*
230 * When running in very secure mode, do not allow
231 * opens for writing of any disk character devices.
232 */
233 if (securelevel >= 2 && isdisk(dev, VCHR))
234 return (EPERM);
235 /*
236 * When running in secure mode, do not allow opens
237 * for writing of /dev/mem, /dev/kmem, or character
238 * devices whose corresponding block devices are
239 * currently mounted.
240 */
241 if (securelevel >= 1) {
242 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
243 return (error);
244 if (iskmemdev(dev))
245 return (EPERM);
246 }
247 }
248 if (cdevsw[maj].d_type == D_TTY) {
249 vnode_lock(vp);
250 vp->v_flag |= VISTTY;
251 vnode_unlock(vp);
252 }
253
254 devsw_lock(dev, S_IFCHR);
255 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
256
257 if (error == 0) {
258 vp->v_specinfo->si_opencount++;
259 }
260
261 devsw_unlock(dev, S_IFCHR);
262 return (error);
263
264 case VBLK:
265 if ((u_int)maj >= (u_int)nblkdev)
266 return (ENXIO);
267 /*
268 * When running in very secure mode, do not allow
269 * opens for writing of any disk block devices.
270 */
271 if (securelevel >= 2 && cred != FSCRED &&
272 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
273 return (EPERM);
274 /*
275 * Do not allow opens of block devices that are
276 * currently mounted.
277 */
278 if ( (error = vfs_mountedon(vp)) )
279 return (error);
280
281 devsw_lock(dev, S_IFBLK);
282 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
283 if (!error) {
284 vp->v_specinfo->si_opencount++;
285 }
286 devsw_unlock(dev, S_IFBLK);
287
288 if (!error) {
289 u_int64_t blkcnt;
290 u_int32_t blksize;
291 int setsize = 0;
292 u_int32_t size512 = 512;
293
294
295 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
296 /* Switch to 512 byte sectors (temporarily) */
297
298 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
299 /* Get the number of 512 byte physical blocks. */
300 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
301 setsize = 1;
302 }
303 }
304 /* If it doesn't set back, we can't recover */
305 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
306 error = ENXIO;
307 }
308
309
310 vnode_lock(vp);
311 set_blocksize(vp, dev);
312
313 /*
314 * Cache the size in bytes of the block device for later
315 * use by spec_write().
316 */
317 if (setsize)
318 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
319 else
320 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
321
322 vnode_unlock(vp);
323
324 }
325 return(error);
326 default:
327 panic("spec_open type");
328 }
329 return (0);
330 }
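
/*
 * Minimal user-space sketch (not part of the kernel sources) of the same
 * DKIOC* queries spec_open() performs above.  The device path /dev/disk0,
 * and the assumption that the caller may open it, are placeholders.
 */
#if 0
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint32_t blksize = 0;
        uint64_t blkcnt = 0;
        int fd = open("/dev/disk0", O_RDONLY);  /* hypothetical device node */

        if (fd < 0)
                return 1;
        /* same ioctls spec_open() issues via VNOP_IOCTL() */
        if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == 0 &&
            ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == 0)
                printf("%u-byte sectors, %llu sectors\n",
                    blksize, (unsigned long long)blkcnt);
        close(fd);
        return 0;
}
#endif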
331
332 /*
333 * Vnode op for read
334 */
335 int
336 spec_read(struct vnop_read_args *ap)
337 {
338 struct vnode *vp = ap->a_vp;
339 struct uio *uio = ap->a_uio;
340 struct buf *bp;
341 daddr64_t bn, nextbn;
342 long bsize, bscale;
343 int devBlockSize=0;
344 int n, on;
345 int error = 0;
346 dev_t dev;
347
348 #if DIAGNOSTIC
349 if (uio->uio_rw != UIO_READ)
350 panic("spec_read mode");
351 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
352 panic("spec_read proc");
353 #endif
354 if (uio_resid(uio) == 0)
355 return (0);
356
357 switch (vp->v_type) {
358
359 case VCHR:
360 error = (*cdevsw[major(vp->v_rdev)].d_read)
361 (vp->v_rdev, uio, ap->a_ioflag);
362 return (error);
363
364 case VBLK:
365 if (uio->uio_offset < 0)
366 return (EINVAL);
367
368 dev = vp->v_rdev;
369
370 devBlockSize = vp->v_specsize;
371
372 if (devBlockSize > PAGE_SIZE)
373 return (EINVAL);
374
375 bscale = PAGE_SIZE / devBlockSize;
376 bsize = bscale * devBlockSize;
377
378 do {
379 on = uio->uio_offset % bsize;
380
381 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
382
383 if (vp->v_speclastr + bscale == bn) {
384 nextbn = bn + bscale;
385 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
386 (int *)&bsize, 1, NOCRED, &bp);
387 } else
388 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
389
390 vnode_lock(vp);
391 vp->v_speclastr = bn;
392 vnode_unlock(vp);
393
394 n = bsize - buf_resid(bp);
395 if ((on > n) || error) {
396 if (!error)
397 error = EINVAL;
398 buf_brelse(bp);
399 return (error);
400 }
401 n = min((unsigned)(n - on), uio_resid(uio));
402
403 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
404 if (n + on == bsize)
405 buf_markaged(bp);
406 buf_brelse(bp);
407 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
408 return (error);
409
410 default:
411 panic("spec_read type");
412 }
413 /* NOTREACHED */
414
415 return (0);
416 }
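
/*
 * Worked example of the block arithmetic above (illustrative, assuming
 * 512-byte device blocks and a 4096-byte PAGE_SIZE):
 *
 *	bscale = 4096 / 512 = 8			device blocks per buffer
 *	bsize  = 8 * 512   = 4096		bytes transferred per iteration
 *
 * For uio_offset == 6144:
 *	on = 6144 % 4096 = 2048			byte offset into the buffer
 *	bn = (6144 / 512) & ~(8 - 1) = 8	buffer-aligned device block
 *
 * so the read is staged through the 4 KB buffer that begins at device
 * block 8, starting 2048 bytes into it.
 */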
417
418 /*
419 * Vnode op for write
420 */
421 int
422 spec_write(struct vnop_write_args *ap)
423 {
424 struct vnode *vp = ap->a_vp;
425 struct uio *uio = ap->a_uio;
426 struct buf *bp;
427 daddr64_t bn;
428 int bsize, blkmask, bscale;
429 int io_sync;
430 int devBlockSize=0;
431 int n, on;
432 int error = 0;
433 dev_t dev;
434
435 #if DIAGNOSTIC
436 if (uio->uio_rw != UIO_WRITE)
437 panic("spec_write mode");
438 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
439 panic("spec_write proc");
440 #endif
441
442 switch (vp->v_type) {
443
444 case VCHR:
445 error = (*cdevsw[major(vp->v_rdev)].d_write)
446 (vp->v_rdev, uio, ap->a_ioflag);
447 return (error);
448
449 case VBLK:
450 if (uio_resid(uio) == 0)
451 return (0);
452 if (uio->uio_offset < 0)
453 return (EINVAL);
454
455 io_sync = (ap->a_ioflag & IO_SYNC);
456
457 dev = (vp->v_rdev);
458
459 devBlockSize = vp->v_specsize;
460 if (devBlockSize > PAGE_SIZE)
461 return(EINVAL);
462
463 bscale = PAGE_SIZE / devBlockSize;
464 blkmask = bscale - 1;
465 bsize = bscale * devBlockSize;
466
467
468 do {
469 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
470 on = uio->uio_offset % bsize;
471
472 n = min((unsigned)(bsize - on), uio_resid(uio));
473
474 /*
475 * Use buf_getblk() as an optimization IFF:
476 *
477 * 1) We are writing exactly a block on a block
478 * aligned boundary
479 * 2) We know the size of the device from spec_open
480 * 3) The write doesn't span the end of the device
481 *
482 * Otherwise, we fall back on buf_bread().
483 */
484 if (n == bsize &&
485 vp->v_specdevsize != (u_int64_t)0 &&
486 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
487 /* reduce the size of the write to what is there */
488 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
489 }
490
491 if (n == bsize)
492 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
493 else
494 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
495
496 /* Translate downstream error for upstream, if needed */
497 if (!error)
498 error = (int)buf_error(bp);
499 if (error) {
500 buf_brelse(bp);
501 return (error);
502 }
503 n = min(n, bsize - buf_resid(bp));
504
505 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
506 if (error) {
507 buf_brelse(bp);
508 return (error);
509 }
510 buf_markaged(bp);
511
512 if (io_sync)
513 error = buf_bwrite(bp);
514 else {
515 if ((n + on) == bsize)
516 error = buf_bawrite(bp);
517 else
518 error = buf_bdwrite(bp);
519 }
520 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
521 return (error);
522
523 default:
524 panic("spec_write type");
525 }
526 /* NOTREACHED */
527
528 return (0);
529 }
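
/*
 * Worked example of the buf_getblk()/buf_bread() choice above
 * (illustrative, 512-byte device blocks, 4096-byte bsize): a 4096-byte
 * write at offset 8192 has on == 0 and n == bsize, so the whole buffer
 * is overwritten and buf_getblk() avoids reading the block first; a
 * 1024-byte write at offset 8704 has on == 512 and n == 1024, so the
 * underlying buffer (bn == 16) must be read with buf_bread() before the
 * partial overwrite.
 */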
530
531 /*
532 * Device ioctl operation.
533 */
534 int
535 spec_ioctl(struct vnop_ioctl_args *ap)
536 {
537 proc_t p = vfs_context_proc(ap->a_context);
538 dev_t dev = ap->a_vp->v_rdev;
539 int retval = 0;
540
541 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
542 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
543
544 switch (ap->a_vp->v_type) {
545
546 case VCHR:
547 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
548 ap->a_fflag, p);
549 break;
550
551 case VBLK:
552 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
553 ap->a_fflag, p);
554 break;
555
556 default:
557 panic("spec_ioctl");
558 /* NOTREACHED */
559 }
560 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
561 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
562
563 return (retval);
564 }
565
566 int
567 spec_select(struct vnop_select_args *ap)
568 {
569 proc_t p = vfs_context_proc(ap->a_context);
570 dev_t dev;
571
572 switch (ap->a_vp->v_type) {
573
574 default:
575 return (1); /* XXX */
576
577 case VCHR:
578 dev = ap->a_vp->v_rdev;
579 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
580 }
581 }
582
583 static int filt_specattach(struct knote *kn);
584
585 int
586 spec_kqfilter(vnode_t vp, struct knote *kn)
587 {
588 dev_t dev;
589 int err = EINVAL;
590
591 /*
592 * For a few special kinds of devices, we can attach knotes.
593 * Each filter function must check whether the dev type matches it.
594 */
595 dev = vnode_specrdev(vp);
596
597 if (vnode_istty(vp)) {
598 /* We can hook into TTYs... */
599 err = filt_specattach(kn);
600 } else {
601 /* Try a bpf device, as defined in bsd/net/bpf.c */
602 err = bpfkqfilter(dev, kn);
603 }
604
605 return err;
606 }
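
/*
 * User-space sketch (not part of the kernel sources): registering a read
 * filter on a TTY character device reaches spec_kqfilter() above and
 * attaches via filt_specattach().  The /dev/tty path is an assumption.
 */
#if 0
#include <sys/event.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
        struct kevent kev;
        int kq = kqueue();
        int fd = open("/dev/tty", O_RDONLY);

        if (kq < 0 || fd < 0)
                return 1;
        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                return 1;
        /* later: kevent(kq, NULL, 0, &kev, 1, NULL) blocks until readable */
        close(fd);
        close(kq);
        return 0;
}
#endif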
607
608 /*
609 * Synch buffers associated with a block device
610 */
611 int
612 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
613 {
614 if (vp->v_type == VCHR)
615 return (0);
616 /*
617 * Flush all dirty buffers associated with a block device.
618 */
619 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
620
621 return (0);
622 }
623
624 int
625 spec_fsync(struct vnop_fsync_args *ap)
626 {
627 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
628 }
629
630 /*
631 * Just call the device strategy routine
632 */
633 extern int hard_throttle_on_root;
634 void IOSleep(int);
635
636 // a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
637 #define LOWPRI_INITIAL_WINDOW_MSECS 100
638 #define LOWPRI_WINDOW_MSECS_INC 50
639 #define LOWPRI_MAX_WINDOW_MSECS 200
640 #define LOWPRI_MAX_WAITING_MSECS 200
641
642 #if CONFIG_EMBEDDED
643 #define LOWPRI_SLEEP_INTERVAL 5
644 #else
645 #define LOWPRI_SLEEP_INTERVAL 2
646 #endif
647
648 struct _throttle_io_info_t {
649 struct timeval last_normal_IO_timestamp;
650 struct timeval last_IO_timestamp;
651 SInt32 numthreads_throttling;
652 SInt32 refcnt;
653 SInt32 alloc;
654 };
655
656 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
657 int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
658 int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
659 int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
660 int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
661
662 #if 0
663 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
664 do { \
665 if ((debug_info)->alloc) \
666 printf("%s: "format, __FUNCTION__, ## args); \
667 } while(0)
668
669 #else
670 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
671 #endif
672
673 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
674 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
675 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
676 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
677
678 /*
679 * throttled I/O helper function
680 * convert the index of the lowest set bit to a device index
681 */
682 int
683 num_trailing_0(uint64_t n)
684 {
685 /*
686 * since in most cases the number of trailing 0s is very small,
687 * we simply count sequentially from the lowest bit
688 */
689 if (n == 0)
690 return sizeof(n) * 8;
691 int count = 0;
692 while (!ISSET(n, 1)) {
693 n >>= 1;
694 ++count;
695 }
696 return count;
697 }
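
/*
 * Example (illustrative): a throttle_mask with only bit 3 set maps to
 * device index 3, i.e. num_trailing_0(0x8) == 3, which is how
 * throttle_info_ref_by_mask() below selects its _throttle_io_info slot.
 * num_trailing_0(0) returns 64 (no bits set).
 */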
698
699 /*
700 * Release the reference; if the item was allocated and this was the last
701 * reference, free it.
702 *
703 * This routine always returns the old value.
704 */
705 static int
706 throttle_info_rel(struct _throttle_io_info_t *info)
707 {
708 SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
709
710 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
711 info, (int)(oldValue -1), info );
712
713 /* The reference count just went negative, very bad */
714 if (oldValue == 0)
715 panic("throttle info ref cnt went negative!");
716
717 /*
718 * Once reference count is zero, no one else should be able to take a
719 * reference
720 */
721 if ((info->refcnt == 0) && (info->alloc)) {
722 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
723 FREE(info, M_TEMP);
724 }
725 return oldValue;
726 }
727
728 /*
729 * Just take a reference on the throttle info structure.
730 *
731 * This routine always returns the old value.
732 */
733 static SInt32
734 throttle_info_ref(struct _throttle_io_info_t *info)
735 {
736 SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
737
738 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
739 info, (int)(oldValue -1), info );
740 /* Allocated items should never have a reference of zero */
741 if (info->alloc && (oldValue == 0))
742 panic("Taking a reference without calling create throttle info!\n");
743
744 return oldValue;
745 }
746
747 /*
748 * KPI routine
749 *
750 * Create and take a reference on a throttle info structure and return a
751 * pointer for the file system to use when calling throttle_info_update.
752 * Calling file system must have a matching release for every create.
753 */
754 void *
755 throttle_info_create(void)
756 {
757 struct _throttle_io_info_t *info;
758
759 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
760 /* Should never happen but just in case */
761 if (info == NULL)
762 return NULL;
763 /* Mark that this one was allocated and needs to be freed */
764 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
765 info->alloc = TRUE;
766 /* Take a reference */
767 OSIncrementAtomic(&info->refcnt);
768 return info;
769 }
770
771 /*
772 * KPI routine
773 *
774 * Release the throttle info pointer if all the references are gone. Should be
775 * called to release the reference taken by throttle_info_create.
776 */
777 void
778 throttle_info_release(void *throttle_info)
779 {
780 DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
781 (struct _throttle_io_info_t *)throttle_info,
782 (struct _throttle_io_info_t *)throttle_info);
783 if (throttle_info) /* Just to be careful */
784 throttle_info_rel(throttle_info);
785 }
786
787 /*
788 * KPI routine
789 *
790 * File systems that create an info structure need to call this routine in
791 * their mount routine (used by cluster code). File systems that call this in
792 * their mount routines must call throttle_info_mount_rel in their unmount
793 * routines.
794 */
795 void
796 throttle_info_mount_ref(mount_t mp, void *throttle_info)
797 {
798 if ((throttle_info == NULL) || (mp == NULL))
799 return;
800 throttle_info_ref(throttle_info);
801 /* If the mount already holds a throttle info reference, release it before installing the new one */
802 if (mp->mnt_throttle_info)
803 throttle_info_rel(mp->mnt_throttle_info);
804 mp->mnt_throttle_info = throttle_info;
805 }
806
807 /*
808 * Private KPI routine
809 *
810 * return a handle for accessing throttle_info given a throttle_mask. The
811 * handle must be released by throttle_info_rel_by_mask
812 */
813 int
814 throttle_info_ref_by_mask(uint64_t throttle_mask,
815 throttle_info_handle_t *throttle_info_handle)
816 {
817 int dev_index;
818 struct _throttle_io_info_t *info;
819
820 if (throttle_info_handle == NULL)
821 return EINVAL;
822
823 dev_index = num_trailing_0(throttle_mask);
824 info = &_throttle_io_info[dev_index];
825 throttle_info_ref(info);
826 *(struct _throttle_io_info_t**)throttle_info_handle = info;
827 return 0;
828 }
829
830 /*
831 * Private KPI routine
832 *
833 * release the handle obtained by throttle_info_ref_by_mask
834 */
835 void
836 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
837 {
838 /* for now the handle is just a pointer to _throttle_io_info_t */
839 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
840 }
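
/*
 * Sketch of the mask-based variant above (illustrative; the function name
 * and the throttle_mask value are placeholders): take a handle for the
 * device selected by the lowest set bit, note the I/O, drop the handle.
 */
#if 0
static void
example_note_io(uint64_t throttle_mask)
{
        throttle_info_handle_t handle;

        if (throttle_info_ref_by_mask(throttle_mask, &handle) == 0) {
                throttle_info_update_by_mask(handle, 0);
                throttle_info_rel_by_mask(handle);
        }
}
#endif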
841
842 /*
843 * KPI routine
844 *
845 * File systems that call throttle_info_mount_ref must call this routine in
846 * their unmount routine.
847 */
848 void
849 throttle_info_mount_rel(mount_t mp)
850 {
851 if (mp->mnt_throttle_info)
852 throttle_info_rel(mp->mnt_throttle_info);
853 mp->mnt_throttle_info = NULL;
854 }
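
/*
 * Sketch of the mount-based throttle KPI lifecycle for a hypothetical
 * file system (the examplefs_* functions and example_throttle variable
 * are placeholders, not part of this file):
 */
#if 0
static void *example_throttle;          /* hypothetical per-fs handle */

static int
examplefs_mount(mount_t mp)
{
        example_throttle = throttle_info_create();
        if (example_throttle == NULL)
                return ENOMEM;
        /* publish the info on the mount for the cluster/spec layers */
        throttle_info_mount_ref(mp, example_throttle);
        return 0;
}

static void
examplefs_issue_io(int bflags)
{
        /* low-overhead bookkeeping call before each I/O */
        throttle_info_update(example_throttle, bflags);
        /* ...queue the I/O... */
}

static int
examplefs_unmount(mount_t mp)
{
        throttle_info_mount_rel(mp);              /* drop the mount's reference */
        throttle_info_release(example_throttle);  /* matches throttle_info_create() */
        example_throttle = NULL;
        return 0;
}
#endif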
855
856 void
857 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
858 {
859 struct _throttle_io_info_t *info;
860
861 if (mp == NULL)
862 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
863 else if (mp->mnt_throttle_info == NULL)
864 info = &_throttle_io_info[mp->mnt_devbsdunit];
865 else
866 info = mp->mnt_throttle_info;
867
868 *tv = info->last_IO_timestamp;
869 }
870
871 void
872 update_last_io_time(mount_t mp)
873 {
874 struct _throttle_io_info_t *info;
875
876 if (mp == NULL)
877 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
878 else if (mp->mnt_throttle_info == NULL)
879 info = &_throttle_io_info[mp->mnt_devbsdunit];
880 else
881 info = mp->mnt_throttle_info;
882
883 microuptime(&info->last_IO_timestamp);
884 }
885
886
887 #if CONFIG_EMBEDDED
888
889 int throttle_get_io_policy(struct uthread **ut)
890 {
891 int policy = IOPOL_DEFAULT;
892 proc_t p = current_proc();
893
894 *ut = get_bsdthread_info(current_thread());
895
896 if (p != NULL)
897 policy = p->p_iopol_disk;
898
899 if (*ut != NULL) {
900 // the I/O policy of the thread overrides that of the process
901 // unless the I/O policy of the thread is default
902 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
903 policy = (*ut)->uu_iopol_disk;
904 }
905 return policy;
906 }
907 #else
908
909 int throttle_get_io_policy(__unused struct uthread **ut)
910 {
911 *ut = get_bsdthread_info(current_thread());
912
913 return (proc_get_task_selfdiskacc());
914 }
915 #endif
916
917
918 static int
919 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
920 {
921 struct _throttle_io_info_t *info = throttle_info;
922 struct timeval elapsed;
923 int elapsed_msecs;
924 int policy;
925 struct uthread *ut;
926
927 policy = throttle_get_io_policy(&ut);
928
929 if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
930 return (0);
931
932 microuptime(&elapsed);
933 timevalsub(&elapsed, &info->last_normal_IO_timestamp);
934 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
935
936 if (lowpri_window_msecs == -1) // use the max waiting time
937 lowpri_window_msecs = lowpri_max_waiting_msecs;
938
939 return elapsed_msecs < lowpri_window_msecs;
940 }
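
/*
 * Worked example of the check above (illustrative): if the last normal
 * (non-throttled) I/O on the device completed 60 msecs ago and the
 * thread's window is 200 msecs, then 60 < 200 and the I/O will be
 * throttled; once 200 msecs pass with no normal I/O, low priority I/O
 * proceeds without delay.
 */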
941
942 /*
943 * If we have a mount point and it has a throttle info pointer then
944 * use it to do the check, otherwise use the device unit number to find
945 * the correct throttle info array element.
946 */
947 int
948 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
949 {
950 void *info;
951
952 /* Should we just return zero if there is no mount point? */
953 if (mp == NULL)
954 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
955 else if (mp->mnt_throttle_info == NULL)
956 info = &_throttle_io_info[mp->mnt_devbsdunit];
957 else
958 info = mp->mnt_throttle_info;
959 return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
960 }
961
962 uint32_t
963 throttle_lowpri_io(int sleep_amount)
964 {
965 int sleep_cnt = 0;
966 int numthreads_throttling;
967 int max_try_num;
968 struct uthread *ut;
969 struct _throttle_io_info_t *info;
970 int max_waiting_msecs;
971
972 ut = get_bsdthread_info(current_thread());
973
974 if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
975 goto done;
976
977 info = ut->uu_throttle_info;
978
979 if (sleep_amount != 0) {
980 #if CONFIG_EMBEDDED
981 max_waiting_msecs = lowpri_max_waiting_msecs;
982 #else
983 if (ut->uu_throttle_isssd == TRUE)
984 max_waiting_msecs = lowpri_max_waiting_msecs / 100;
985 else
986 max_waiting_msecs = lowpri_max_waiting_msecs;
987 #endif
988 if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
989 max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;
990
991 numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
992 max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);
993
994 for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
995 if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
996 if (sleep_cnt == 0) {
997 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
998 ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
999 }
1000 IOSleep(LOWPRI_SLEEP_INTERVAL);
1001 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
1002 } else {
1003 break;
1004 }
1005 }
1006 if (sleep_cnt) {
1007 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1008 ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
1009 }
1010 }
1011 SInt32 oldValue;
1012 oldValue = OSDecrementAtomic(&info->numthreads_throttling);
1013
1014 if (oldValue <= 0) {
1015 panic("%s: numthreads negative", __func__);
1016 }
1017 done:
1018 ut->uu_lowpri_window = 0;
1019 if (ut->uu_throttle_info)
1020 throttle_info_rel(ut->uu_throttle_info);
1021 ut->uu_throttle_info = NULL;
1022 ut->uu_throttle_bc = FALSE;
1023
1024 return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
1025 }
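
/*
 * Worked example of the sleep budget above (illustrative, non-SSD,
 * non-embedded defaults): with max_waiting_msecs = 200,
 * LOWPRI_SLEEP_INTERVAL = 2, three threads already throttling and
 * sleep_amount = 1,
 *
 *	numthreads_throttling = 3 + MIN(10, MAX(1, 1)) - 1 = 3
 *	max_try_num = 200 / 2 * MAX(1, 3) = 300 iterations
 *
 * so the thread sleeps in 2 msec slices for at most ~600 msecs before
 * giving up on the window.
 */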
1026
1027 /*
1028 * KPI routine
1029 *
1030 * Set a kernel thread's I/O policy. policy can be:
1031 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
1032 *
1033 * These policies are explained in the setiopolicy_np(3) man page.
1034 */
1035 void throttle_set_thread_io_policy(int policy)
1036 {
1037 #if !CONFIG_EMBEDDED
1038 proc_apply_thread_selfdiskacc(policy);
1039 #else /* !CONFIG_EMBEDDED */
1040 struct uthread *ut;
1041 ut = get_bsdthread_info(current_thread());
1042 ut->uu_iopol_disk = policy;
1043 #endif /* !CONFIG_EMBEDDED */
1044 }
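
/*
 * Sketch of how a kernel thread doing background work might use the KPI
 * above (illustrative; the worker function and its work loop are
 * placeholders):
 */
#if 0
static void
example_background_worker(void)
{
        /* subsequent I/O issued by this thread is treated as low priority */
        throttle_set_thread_io_policy(IOPOL_THROTTLE);

        /* ...issue maintenance I/O... */

        /* restore the default policy for this thread */
        throttle_set_thread_io_policy(IOPOL_DEFAULT);
}
#endif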
1045
1046
1047 static
1048 void throttle_info_reset_window(struct uthread *ut)
1049 {
1050 struct _throttle_io_info_t *info;
1051
1052 info = ut->uu_throttle_info;
1053
1054 OSDecrementAtomic(&info->numthreads_throttling);
1055 throttle_info_rel(info);
1056 ut->uu_throttle_info = NULL;
1057 ut->uu_lowpri_window = 0;
1058 }
1059
1060 static
1061 void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
1062 {
1063 SInt32 oldValue;
1064
1065 ut->uu_throttle_info = info;
1066 throttle_info_ref(info);
1067 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1068
1069 oldValue = OSIncrementAtomic(&info->numthreads_throttling);
1070 if (oldValue < 0) {
1071 panic("%s: numthreads negative", __func__);
1072 }
1073 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
1074 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
1075 ut->uu_throttle_isssd = isssd;
1076 ut->uu_throttle_bc = BC_throttle;
1077 }
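
/*
 * Worked example of the initial window computation above (illustrative,
 * default tunables): with lowpri_IO_initial_window_msecs = 100,
 * lowpri_IO_window_msecs_inc = 50 and two threads already throttling on
 * the device (oldValue == 2), the new thread starts with
 *
 *	uu_lowpri_window = 100 + 50 * 2 = 200 msecs
 *
 * i.e. the window grows with the number of threads being throttled.
 */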
1078
1079
1080 static
1081 void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
1082 {
1083 struct _throttle_io_info_t *info = throttle_info;
1084 struct uthread *ut;
1085 int policy;
1086 int is_throttleable_io = 0;
1087 int is_passive_io = 0;
1088
1089 if (!lowpri_IO_initial_window_msecs || (info == NULL))
1090 return;
1091 policy = throttle_get_io_policy(&ut);
1092
1093 switch (policy) {
1094 case IOPOL_DEFAULT:
1095 case IOPOL_NORMAL:
1096 break;
1097 case IOPOL_THROTTLE:
1098 is_throttleable_io = 1;
1099 break;
1100 case IOPOL_PASSIVE:
1101 is_passive_io = 1;
1102 break;
1103 default:
1104 printf("unknown I/O policy %d", policy);
1105 break;
1106 }
1107
1108 if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
1109 is_passive_io |= 1;
1110
1111 if (!is_throttleable_io) {
1112 if (!is_passive_io){
1113 microuptime(&info->last_normal_IO_timestamp);
1114 }
1115 } else if (ut) {
1116 /*
1117 * I'd really like to do the IOSleep here, but
1118 * we may be holding all kinds of filesystem related locks
1119 * and the pages for this I/O marked 'busy'...
1120 * we don't want to cause a normal task to block on
1121 * one of these locks while we're throttling a task marked
1122 * for low priority I/O... we'll mark the uthread and
1123 * do the delay just before we return from the system
1124 * call that triggered this I/O or from vnode_pagein
1125 */
1126 if (ut->uu_lowpri_window == 0)
1127 throttle_info_set_initial_window(ut, info, isssd, FALSE);
1128 else {
1129 /* The thread sends I/Os to different devices within the same system call */
1130 if (ut->uu_throttle_info != info) {
1131 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
1132
1133 // keep track of the numthreads in the right device
1134 OSDecrementAtomic(&old_info->numthreads_throttling);
1135 OSIncrementAtomic(&info->numthreads_throttling);
1136
1137 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
1138 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
1139 /* This thread no longer needs a reference on that throttle info */
1140 throttle_info_rel(ut->uu_throttle_info);
1141 ut->uu_throttle_info = info;
1142 /* Need to take a reference on this throttle info */
1143 throttle_info_ref(ut->uu_throttle_info);
1144 }
1145 int numthreads = MAX(1, info->numthreads_throttling);
1146 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
1147 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
1148 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
1149
1150 if (isssd == FALSE) {
1151 /*
1152 * we're here because we've actually issued I/Os to different devices...
1153 * if at least one of them was a non-SSD, then throttle the thread
1154 * using the policy for non-SSDs
1155 */
1156 ut->uu_throttle_isssd = FALSE;
1157 }
1158 }
1159 }
1160 }
1161
1162 /*
1163 * KPI routine
1164 *
1165 * This is usually called before every I/O and is used for throttled I/O
1166 * bookkeeping. This routine has low overhead and does not sleep.
1167 */
1168 void throttle_info_update(void *throttle_info, int flags)
1169 {
1170 throttle_info_update_internal(throttle_info, flags, FALSE);
1171 }
1172
1173 /*
1174 * KPI routine
1175 *
1176 * This is usually called before every I/O and is used for throttled I/O
1177 * bookkeeping. This routine has low overhead and does not sleep.
1178 */
1179 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
1180 {
1181 void *throttle_info = throttle_info_handle;
1182 /* for now we only use the lowest bit of the throttle mask, so the
1183 * handle is the same as the throttle_info. Later if we store a
1184 * set of throttle infos in the handle, we will want to loop through
1185 * them and call throttle_info_update in a loop
1186 */
1187 throttle_info_update(throttle_info, flags);
1188 }
1189
1190 extern int ignore_is_ssd;
1191
1192 int
1193 spec_strategy(struct vnop_strategy_args *ap)
1194 {
1195 buf_t bp;
1196 int bflags;
1197 int policy;
1198 dev_t bdev;
1199 uthread_t ut;
1200 mount_t mp;
1201 int strategy_ret;
1202 struct _throttle_io_info_t *throttle_info;
1203 boolean_t isssd = FALSE;
1204
1205 bp = ap->a_bp;
1206 bdev = buf_device(bp);
1207 mp = buf_vnode(bp)->v_mount;
1208
1209 policy = throttle_get_io_policy(&ut);
1210
1211 if (policy == IOPOL_THROTTLE) {
1212 bp->b_flags |= B_THROTTLED_IO;
1213 bp->b_flags &= ~B_PASSIVE;
1214 } else if (policy == IOPOL_PASSIVE)
1215 bp->b_flags |= B_PASSIVE;
1216
1217 bflags = bp->b_flags;
1218
1219 if (kdebug_enable) {
1220 int code = 0;
1221
1222 if (bflags & B_READ)
1223 code |= DKIO_READ;
1224 if (bflags & B_ASYNC)
1225 code |= DKIO_ASYNC;
1226
1227 if (bflags & B_META)
1228 code |= DKIO_META;
1229 else if (bflags & B_PAGEIO)
1230 code |= DKIO_PAGING;
1231
1232 if (bflags & B_THROTTLED_IO)
1233 code |= DKIO_THROTTLE;
1234 else if (bflags & B_PASSIVE)
1235 code |= DKIO_PASSIVE;
1236
1237 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1238 bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
1239 }
1240 if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
1241 mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
1242 hard_throttle_on_root = 1;
1243
1244 if (mp != NULL) {
1245 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
1246 isssd = TRUE;
1247 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
1248 } else
1249 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1250
1251 throttle_info_update_internal(throttle_info, bflags, isssd);
1252
1253 if ((bflags & B_READ) == 0) {
1254 microuptime(&throttle_info->last_IO_timestamp);
1255 if (mp) {
1256 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
1257 }
1258 } else if (mp) {
1259 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
1260 }
1261 /*
1262 * The BootCache may give us special information about
1263 * the IO, so it returns special values that we check
1264 * for here.
1265 *
1266 * IO_SATISFIED_BY_CACHE
1267 * The read has been satisfied by the boot cache. Don't
1268 * throttle the thread unnecessarily.
1269 *
1270 * IO_SHOULD_BE_THROTTLED
1271 * The boot cache is playing back a playlist and this IO
1272 * cut through. Throttle it so we're not cutting through
1273 * the boot cache too often.
1274 *
1275 * Note that typical strategy routines are defined with
1276 * a void return so we'll get garbage here. In the
1277 * unlikely case the garbage matches our special return
1278 * value, it's not a big deal since we're only adjusting
1279 * the throttling delay.
1280 */
1281 #define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
1282 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
1283 typedef int strategy_fcn_ret_t(struct buf *bp);
1284
1285 strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
1286
1287 if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
1288 /*
1289 * If this was a throttled IO satisfied by the boot cache,
1290 * don't delay the thread.
1291 */
1292 throttle_info_reset_window(ut);
1293
1294 } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
1295 /*
1296 * If the boot cache indicates this IO should be throttled,
1297 * delay the thread.
1298 */
1299 throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
1300 }
1301 return (0);
1302 }
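
/*
 * Sketch of a strategy routine that cooperates with the return-value
 * convention checked above (illustrative only; a hypothetical caching
 * driver, not the real BootCache, and example_cache_lookup /
 * example_issue_to_disk are placeholder helpers):
 */
#if 0
static int
example_cached_strategy(struct buf *bp)
{
        if (example_cache_lookup(bp)) {         /* hypothetical cache hit test */
                buf_biodone(bp);                /* complete the I/O from the cache */
                return IO_SATISFIED_BY_CACHE;   /* tell spec_strategy not to delay the thread */
        }
        example_issue_to_disk(bp);              /* hypothetical pass-through */
        return IO_SHOULD_BE_THROTTLED;          /* playlist cut-through: ask for throttling */
}
#endif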
1303
1304
1305 /*
1306 * Block mapping is not supported for special files.
1307 */
1308 int
1309 spec_blockmap(__unused struct vnop_blockmap_args *ap)
1310 {
1311 return (ENOTSUP);
1312 }
1313
1314
1315 /*
1316 * Device close routine
1317 */
1318 int
1319 spec_close(struct vnop_close_args *ap)
1320 {
1321 struct vnode *vp = ap->a_vp;
1322 dev_t dev = vp->v_rdev;
1323 int error = 0;
1324 int flags = ap->a_fflag;
1325 struct proc *p = vfs_context_proc(ap->a_context);
1326 struct session *sessp;
1327 int do_rele = 0;
1328
1329 switch (vp->v_type) {
1330
1331 case VCHR:
1332 /*
1333 * Hack: a tty device that is a controlling terminal
1334 * has a reference from the session structure.
1335 * We cannot easily tell that a character device is
1336 * a controlling terminal, unless it is the closing
1337 * process' controlling terminal. In that case,
1338 * if the reference count is 1 (this is the very
1339 * last close)
1340 */
1341 sessp = proc_session(p);
1342 if (sessp != SESSION_NULL) {
1343 if ((vcount(vp) == 1) &&
1344 (vp == sessp->s_ttyvp)) {
1345
1346 session_lock(sessp);
1347 if (vp == sessp->s_ttyvp) {
1348 sessp->s_ttyvp = NULL;
1349 sessp->s_ttyvid = 0;
1350 sessp->s_ttyp = TTY_NULL;
1351 sessp->s_ttypgrpid = NO_PID;
1352 do_rele = 1;
1353 }
1354 session_unlock(sessp);
1355
1356 if (do_rele) {
1357 vnode_rele(vp);
1358 }
1359 }
1360 session_rele(sessp);
1361 }
1362
1363 devsw_lock(dev, S_IFCHR);
1364
1365 vp->v_specinfo->si_opencount--;
1366
1367 if (vp->v_specinfo->si_opencount < 0) {
1368 panic("Negative open count?");
1369 }
1370 /*
1371 * close on last reference or on vnode revoke call
1372 */
1373 if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
1374 devsw_unlock(dev, S_IFCHR);
1375 return (0);
1376 }
1377
1378 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
1379
1380 devsw_unlock(dev, S_IFCHR);
1381 break;
1382
1383 case VBLK:
1384 /*
1385 * If there is more than one outstanding open, don't
1386 * send the close to the device.
1387 */
1388 devsw_lock(dev, S_IFBLK);
1389 if (vcount(vp) > 1) {
1390 vp->v_specinfo->si_opencount--;
1391 devsw_unlock(dev, S_IFBLK);
1392 return (0);
1393 }
1394 devsw_unlock(dev, S_IFBLK);
1395
1396 /*
1397 * On last close of a block device (that isn't mounted)
1398 * we must invalidate any in core blocks, so that
1399 * we can, for instance, change floppy disks.
1400 */
1401 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
1402 return (error);
1403
1404 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1405 if (error)
1406 return (error);
1407
1408 devsw_lock(dev, S_IFBLK);
1409
1410 vp->v_specinfo->si_opencount--;
1411
1412 if (vp->v_specinfo->si_opencount < 0) {
1413 panic("Negative open count?");
1414 }
1415
1416 if (vcount(vp) > 0) {
1417 devsw_unlock(dev, S_IFBLK);
1418 return (0);
1419 }
1420
1421 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
1422
1423 devsw_unlock(dev, S_IFBLK);
1424 break;
1425
1426 default:
1427 panic("spec_close: not special");
1428 return(EBADF);
1429 }
1430
1431 return error;
1432 }
1433
1434 /*
1435 * Return POSIX pathconf information applicable to special devices.
1436 */
1437 int
1438 spec_pathconf(struct vnop_pathconf_args *ap)
1439 {
1440
1441 switch (ap->a_name) {
1442 case _PC_LINK_MAX:
1443 *ap->a_retval = LINK_MAX;
1444 return (0);
1445 case _PC_MAX_CANON:
1446 *ap->a_retval = MAX_CANON;
1447 return (0);
1448 case _PC_MAX_INPUT:
1449 *ap->a_retval = MAX_INPUT;
1450 return (0);
1451 case _PC_PIPE_BUF:
1452 *ap->a_retval = PIPE_BUF;
1453 return (0);
1454 case _PC_CHOWN_RESTRICTED:
1455 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
1456 return (0);
1457 case _PC_VDISABLE:
1458 *ap->a_retval = _POSIX_VDISABLE;
1459 return (0);
1460 default:
1461 return (EINVAL);
1462 }
1463 /* NOTREACHED */
1464 }
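
/*
 * User-space sketch (not part of the kernel sources): pathconf(2) on a
 * device node ends up in spec_pathconf() above.  The /dev/tty path is an
 * assumption.
 */
#if 0
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
        long pipe_buf = pathconf("/dev/tty", _PC_PIPE_BUF);

        if (pipe_buf != -1)
                printf("PIPE_BUF on /dev/tty: %ld\n", pipe_buf);
        return 0;
}
#endif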
1465
1466 /*
1467 * Special device failed operation
1468 */
1469 int
1470 spec_ebadf(__unused void *dummy)
1471 {
1472
1473 return (EBADF);
1474 }
1475
1476 /* Blktooff derives file offset from logical block number */
1477 int
1478 spec_blktooff(struct vnop_blktooff_args *ap)
1479 {
1480 struct vnode *vp = ap->a_vp;
1481
1482 switch (vp->v_type) {
1483 case VCHR:
1484 *ap->a_offset = (off_t)-1; /* failure */
1485 return (ENOTSUP);
1486
1487 case VBLK:
1488 printf("spec_blktooff: not implemented for VBLK\n");
1489 *ap->a_offset = (off_t)-1; /* failure */
1490 return (ENOTSUP);
1491
1492 default:
1493 panic("spec_blktooff type");
1494 }
1495 /* NOTREACHED */
1496
1497 return (0);
1498 }
1499
1500 /* Offtoblk derives logical block number from file offset */
1501 int
1502 spec_offtoblk(struct vnop_offtoblk_args *ap)
1503 {
1504 struct vnode *vp = ap->a_vp;
1505
1506 switch (vp->v_type) {
1507 case VCHR:
1508 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1509 return (ENOTSUP);
1510
1511 case VBLK:
1512 printf("spec_offtoblk: not implemented for VBLK\n");
1513 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1514 return (ENOTSUP);
1515
1516 default:
1517 panic("spec_offtoblk type");
1518 }
1519 /* NOTREACHED */
1520
1521 return (0);
1522 }
1523
1524 static void filt_specdetach(struct knote *kn);
1525 static int filt_spec(struct knote *kn, long hint);
1526 static unsigned filt_specpeek(struct knote *kn);
1527
1528 struct filterops spec_filtops = {
1529 .f_isfd = 1,
1530 .f_attach = filt_specattach,
1531 .f_detach = filt_specdetach,
1532 .f_event = filt_spec,
1533 .f_peek = filt_specpeek
1534 };
1535
1536 static int
1537 filter_to_seltype(int16_t filter)
1538 {
1539 switch (filter) {
1540 case EVFILT_READ:
1541 return FREAD;
1542 case EVFILT_WRITE:
1543 return FWRITE;
1544 break;
1545 default:
1546 panic("filt_to_seltype(): invalid filter %d\n", filter);
1547 return 0;
1548 }
1549 }
1550
1551 static int
1552 filt_specattach(struct knote *kn)
1553 {
1554 vnode_t vp;
1555 dev_t dev;
1556
1557 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
1558
1559 assert(vnode_ischr(vp));
1560
1561 dev = vnode_specrdev(vp);
1562
1563 if (major(dev) >= nchrdev) {
1564 return ENXIO;
1565 }
1566
1567 if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
1568 return EINVAL;
1569 }
1570
1571 /* Resulting wql is safe to unlink even if it has never been linked */
1572 kn->kn_hook = wait_queue_link_allocate();
1573 if (kn->kn_hook == NULL) {
1574 return EAGAIN;
1575 }
1576
1577 kn->kn_fop = &spec_filtops;
1578 kn->kn_hookid = vnode_vid(vp);
1579
1580 knote_markstayqueued(kn);
1581
1582 return 0;
1583 }
1584
1585 static void
1586 filt_specdetach(struct knote *kn)
1587 {
1588 kern_return_t ret;
1589
1590 /*
1591 * Given wait queue link and wait queue set, unlink. This is subtle.
1592 * If the device has been revoked from under us, selclearthread() will
1593 * have removed our link from the kqueue's wait queue set, which
1594 * wait_queue_set_unlink_one() will detect and handle.
1595 */
1596 ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
1597 if (ret != KERN_SUCCESS) {
1598 panic("filt_specdetach(): failed to unlink wait queue link.");
1599 }
1600
1601 (void)wait_queue_link_free(kn->kn_hook);
1602 kn->kn_hook = NULL;
1603 kn->kn_status &= ~KN_STAYQUEUED;
1604 }
1605
1606 static int
1607 filt_spec(struct knote *kn, long hint)
1608 {
1609 vnode_t vp;
1610 uthread_t uth;
1611 wait_queue_set_t old_wqs;
1612 vfs_context_t ctx;
1613 int selres;
1614 int error;
1615 int use_offset;
1616 dev_t dev;
1617 uint64_t flags;
1618
1619 assert(kn->kn_hook != NULL);
1620
1621 if (hint != 0) {
1622 panic("filt_spec(): nonzero hint?");
1623 }
1624
1625 uth = get_bsdthread_info(current_thread());
1626 ctx = vfs_context_current();
1627 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1628
1629 error = vnode_getwithvid(vp, kn->kn_hookid);
1630 if (error != 0) {
1631 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1632 return 1;
1633 }
1634
1635 dev = vnode_specrdev(vp);
1636 flags = cdevsw_flags[major(dev)];
1637 use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
1638 assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
1639
1640 /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
1641 old_wqs = uth->uu_wqset;
1642 uth->uu_wqset = kn->kn_kq->kq_wqs;
1643 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1644 uth->uu_wqset = old_wqs;
1645
1646 if (use_offset) {
1647 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
1648 kn->kn_data = 0;
1649 } else {
1650 kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
1651 }
1652 } else {
1653 kn->kn_data = selres;
1654 }
1655
1656 vnode_put(vp);
1657
1658 return (kn->kn_data != 0);
1659 }
1660
1661 static unsigned
1662 filt_specpeek(struct knote *kn)
1663 {
1664 vnode_t vp;
1665 uthread_t uth;
1666 wait_queue_set_t old_wqs;
1667 vfs_context_t ctx;
1668 int error, selres;
1669
1670 uth = get_bsdthread_info(current_thread());
1671 ctx = vfs_context_current();
1672 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1673
1674 error = vnode_getwithvid(vp, kn->kn_hookid);
1675 if (error != 0) {
1676 return 1; /* Just like VNOP_SELECT() on recycled vnode */
1677 }
1678
1679 /*
1680 * Why pass the link here? Because we may not have registered in the past...
1681 */
1682 old_wqs = uth->uu_wqset;
1683 uth->uu_wqset = kn->kn_kq->kq_wqs;
1684 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1685 uth->uu_wqset = old_wqs;
1686
1687 vnode_put(vp);
1688 return selres;
1689 }
1690