1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/namei.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/stat.h>
75 #include <sys/errno.h>
76 #include <sys/ioctl.h>
77 #include <sys/file.h>
78 #include <sys/user.h>
79 #include <sys/malloc.h>
80 #include <sys/disk.h>
81 #include <sys/uio_internal.h>
82 #include <sys/resource.h>
83 #include <miscfs/specfs/specdev.h>
84 #include <vfs/vfs_support.h>
85
86 #include <sys/kdebug.h>
87
88 /* XXX following prototypes should be in a header file somewhere */
89 extern dev_t chrtoblk(dev_t dev);
90 extern int iskmemdev(dev_t dev);
91 extern int bpfkqfilter(dev_t dev, struct knote *kn);
92 extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
93
94 struct vnode *speclisth[SPECHSZ];
95
96 /* symbolic sleep message strings for devices */
97 char devopn[] = "devopn";
98 char devio[] = "devio";
99 char devwait[] = "devwait";
100 char devin[] = "devin";
101 char devout[] = "devout";
102 char devioc[] = "devioc";
103 char devcls[] = "devcls";
104
105 #define VOPFUNC int (*)(void *)
106
107 int (**spec_vnodeop_p)(void *);
108 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
109 { &vnop_default_desc, (VOPFUNC)vn_default_error },
110 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
111 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */
112 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
113 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
114 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
115 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
116 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
117 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
118 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
119 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
120 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
121 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
122 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
123 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
124 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
125 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
126 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */
127 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
128 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
129 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
130 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
131 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
132 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
133 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
134 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
135 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
136 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
137 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
138 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
139 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
140 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
141 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
142 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
143 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
144 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
145 { (struct vnodeop_desc*)NULL, (int(*)())NULL }
146 };
147 struct vnodeopv_desc spec_vnodeop_opv_desc =
148 { &spec_vnodeop_p, spec_vnodeop_entries };
149
150
151 static void set_blocksize(vnode_t, dev_t);
152
153
154 /*
155 * Trivial lookup routine that always fails.
156 */
157 int
158 spec_lookup(struct vnop_lookup_args *ap)
159 {
160
161 *ap->a_vpp = NULL;
162 return (ENOTDIR);
163 }
164
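/*
 * Set the vnode's notion of the device block size (v_specsize).  If the
 * block driver provides a d_psize entry point and it reports a positive
 * value, use that; otherwise fall back to DEV_BSIZE.
 */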
165 static void
166 set_blocksize(struct vnode *vp, dev_t dev)
167 {
168 int (*size)(dev_t);
169 int rsize;
170
171 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
172 rsize = (*size)(dev);
173 if (rsize <= 0) /* did size fail? */
174 vp->v_specsize = DEV_BSIZE;
175 else
176 vp->v_specsize = rsize;
177 }
178 else
179 vp->v_specsize = DEV_BSIZE;
180 }
181
182 void
183 set_fsblocksize(struct vnode *vp)
184 {
185
186 if (vp->v_type == VBLK) {
187 dev_t dev = (dev_t)vp->v_rdev;
188 int maj = major(dev);
189
190 if ((u_int)maj >= (u_int)nblkdev)
191 return;
192
193 vnode_lock(vp);
194 set_blocksize(vp, dev);
195 vnode_unlock(vp);
196 }
197
198 }
199
200
201 /*
202 * Open a special file.
203 */
204 int
205 spec_open(struct vnop_open_args *ap)
206 {
207 struct proc *p = vfs_context_proc(ap->a_context);
208 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
209 struct vnode *vp = ap->a_vp;
210 dev_t bdev, dev = (dev_t)vp->v_rdev;
211 int maj = major(dev);
212 int error;
213
214 /*
215 * Don't allow open if fs is mounted -nodev.
216 */
217 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
218 return (ENXIO);
219
220 switch (vp->v_type) {
221
222 case VCHR:
223 if ((u_int)maj >= (u_int)nchrdev)
224 return (ENXIO);
225 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
226 /*
227 * When running in very secure mode, do not allow
228 * opens for writing of any disk character devices.
229 */
230 if (securelevel >= 2 && isdisk(dev, VCHR))
231 return (EPERM);
232 /*
233 * When running in secure mode, do not allow opens
234 * for writing of /dev/mem, /dev/kmem, or character
235 * devices whose corresponding block devices are
236 * currently mounted.
237 */
238 if (securelevel >= 1) {
239 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
240 return (error);
241 if (iskmemdev(dev))
242 return (EPERM);
243 }
244 }
245 if (cdevsw[maj].d_type == D_TTY) {
246 vnode_lock(vp);
247 vp->v_flag |= VISTTY;
248 vnode_unlock(vp);
249 }
250 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
251 return (error);
252
253 case VBLK:
254 if ((u_int)maj >= (u_int)nblkdev)
255 return (ENXIO);
256 /*
257 * When running in very secure mode, do not allow
258 * opens for writing of any disk block devices.
259 */
260 if (securelevel >= 2 && cred != FSCRED &&
261 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
262 return (EPERM);
263 /*
264 * Do not allow opens of block devices that are
265 * currently mounted.
266 */
267 if ( (error = vfs_mountedon(vp)) )
268 return (error);
269 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
270 if (!error) {
271 u_int64_t blkcnt;
272 u_int32_t blksize;
273 int setsize = 0;
274 u_int32_t size512 = 512;
275
276
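/*
 * Probe the media size: read the device's current block size, switch it
 * temporarily to 512-byte sectors so DKIOCGETBLOCKCOUNT reports the size
 * in 512-byte units, then restore the original block size.
 */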
277 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
278 /* Switch to 512 byte sectors (temporarily) */
279
280 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
281 /* Get the number of 512 byte physical blocks. */
282 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
283 setsize = 1;
284 }
285 }
286 /* If the original block size can't be restored, we can't recover */
287 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
288 error = ENXIO;
289 }
290
291
292 vnode_lock(vp);
293 set_blocksize(vp, dev);
294
295 /*
296 * Cache the size in bytes of the block device for later
297 * use by spec_write().
298 */
299 if (setsize)
300 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
301 else
302 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
303
304 vnode_unlock(vp);
305
306 }
307 return(error);
308 default:
309 panic("spec_open type");
310 }
311 return (0);
312 }
313
314 /*
315 * Vnode op for read
316 */
317 int
318 spec_read(struct vnop_read_args *ap)
319 {
320 struct vnode *vp = ap->a_vp;
321 struct uio *uio = ap->a_uio;
322 struct buf *bp;
323 daddr64_t bn, nextbn;
324 long bsize, bscale;
325 int devBlockSize=0;
326 int n, on;
327 int error = 0;
328 dev_t dev;
329
330 #if DIAGNOSTIC
331 if (uio->uio_rw != UIO_READ)
332 panic("spec_read mode");
333 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
334 panic("spec_read proc");
335 #endif
336 if (uio_resid(uio) == 0)
337 return (0);
338
339 switch (vp->v_type) {
340
341 case VCHR:
342 error = (*cdevsw[major(vp->v_rdev)].d_read)
343 (vp->v_rdev, uio, ap->a_ioflag);
344 return (error);
345
346 case VBLK:
347 if (uio->uio_offset < 0)
348 return (EINVAL);
349
350 dev = vp->v_rdev;
351
352 devBlockSize = vp->v_specsize;
353
354 if (devBlockSize > PAGE_SIZE)
355 return (EINVAL);
356
357 bscale = PAGE_SIZE / devBlockSize;
358 bsize = bscale * devBlockSize;
359
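/*
 * Round the offset down to a page-aligned device block (bn), read a full
 * PAGE_SIZE buffer, and copy out the portion the caller asked for.  When
 * the access looks sequential (the previous buffer read was the one
 * immediately preceding this), also issue a read-ahead for the next buffer.
 */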
360 do {
361 on = uio->uio_offset % bsize;
362
363 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
364
365 if (vp->v_speclastr + bscale == bn) {
366 nextbn = bn + bscale;
367 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
368 (int *)&bsize, 1, NOCRED, &bp);
369 } else
370 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
371
372 vnode_lock(vp);
373 vp->v_speclastr = bn;
374 vnode_unlock(vp);
375
376 n = bsize - buf_resid(bp);
377 if ((on > n) || error) {
378 if (!error)
379 error = EINVAL;
380 buf_brelse(bp);
381 return (error);
382 }
383 n = min((unsigned)(n - on), uio_resid(uio));
384
385 error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
386 if (n + on == bsize)
387 buf_markaged(bp);
388 buf_brelse(bp);
389 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
390 return (error);
391
392 default:
393 panic("spec_read type");
394 }
395 /* NOTREACHED */
396
397 return (0);
398 }
399
400 /*
401 * Vnode op for write
402 */
403 int
404 spec_write(struct vnop_write_args *ap)
405 {
406 struct vnode *vp = ap->a_vp;
407 struct uio *uio = ap->a_uio;
408 struct buf *bp;
409 daddr64_t bn;
410 int bsize, blkmask, bscale;
411 int io_sync;
412 int devBlockSize=0;
413 int n, on;
414 int error = 0;
415 dev_t dev;
416
417 #if DIAGNOSTIC
418 if (uio->uio_rw != UIO_WRITE)
419 panic("spec_write mode");
420 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
421 panic("spec_write proc");
422 #endif
423
424 switch (vp->v_type) {
425
426 case VCHR:
427 error = (*cdevsw[major(vp->v_rdev)].d_write)
428 (vp->v_rdev, uio, ap->a_ioflag);
429 return (error);
430
431 case VBLK:
432 if (uio_resid(uio) == 0)
433 return (0);
434 if (uio->uio_offset < 0)
435 return (EINVAL);
436
437 io_sync = (ap->a_ioflag & IO_SYNC);
438
439 dev = (vp->v_rdev);
440
441 devBlockSize = vp->v_specsize;
442 if (devBlockSize > PAGE_SIZE)
443 return(EINVAL);
444
445 bscale = PAGE_SIZE / devBlockSize;
446 blkmask = bscale - 1;
447 bsize = bscale * devBlockSize;
448
449
450 do {
451 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
452 on = uio->uio_offset % bsize;
453
454 n = min((unsigned)(bsize - on), uio_resid(uio));
455
456 /*
457 * Use buf_getblk() as an optimization IFF:
458 *
459 * 1) We are writing exactly a block on a block
460 * aligned boundary
461 * 2) We know the size of the device from spec_open
462 * 3) The write doesn't span the end of the device
463 *
464 * Otherwise, we fall back on buf_bread().
465 */
466 if (n == bsize &&
467 vp->v_specdevsize != (u_int64_t)0 &&
468 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
469 /* reduce the size of the write to what is there */
470 n = vp->v_specdevsize - uio->uio_offset;
471 }
472
473 if (n == bsize)
474 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
475 else
476 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
477
478 /* Translate downstream error for upstream, if needed */
479 if (!error)
480 error = (int)buf_error(bp);
481 if (error) {
482 buf_brelse(bp);
483 return (error);
484 }
485 n = min(n, bsize - buf_resid(bp));
486
487 error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
488 if (error) {
489 buf_brelse(bp);
490 return (error);
491 }
492 buf_markaged(bp);
493
494 if (io_sync)
495 error = buf_bwrite(bp);
496 else {
497 if ((n + on) == bsize)
498 error = buf_bawrite(bp);
499 else
500 error = buf_bdwrite(bp);
501 }
502 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
503 return (error);
504
505 default:
506 panic("spec_write type");
507 }
508 /* NOTREACHED */
509
510 return (0);
511 }
512
513 /*
514 * Device ioctl operation.
515 */
516 int
517 spec_ioctl(struct vnop_ioctl_args *ap)
518 {
519 proc_t p = vfs_context_proc(ap->a_context);
520 dev_t dev = ap->a_vp->v_rdev;
521 int retval = 0;
522
523 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
524 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
525
526 switch (ap->a_vp->v_type) {
527
528 case VCHR:
529 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
530 ap->a_fflag, p);
531 break;
532
533 case VBLK:
534 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
535 ap->a_fflag, p);
536 break;
537
538 default:
539 panic("spec_ioctl");
540 /* NOTREACHED */
541 }
542 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
543 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
544
545 return (retval);
546 }
547
548 int
549 spec_select(struct vnop_select_args *ap)
550 {
551 proc_t p = vfs_context_proc(ap->a_context);
552 dev_t dev;
553
554 switch (ap->a_vp->v_type) {
555
556 default:
557 return (1); /* XXX */
558
559 case VCHR:
560 dev = ap->a_vp->v_rdev;
561 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
562 }
563 }
564
565 int
566 spec_kqfilter(vnode_t vp, struct knote *kn)
567 {
568 dev_t dev;
569 int err = EINVAL;
570
571 /*
572 * For a few special kinds of devices, we can attach knotes.
573 * Each filter function must check whether the dev type matches it.
574 */
575 dev = vnode_specrdev(vp);
576
577 if (vnode_istty(vp)) {
578 /* We can hook into the slave side of a tty */
579 err = ptsd_kqfilter(dev, kn);
580 } else {
581 /* Try a bpf device, as defined in bsd/net/bpf.c */
582 err = bpfkqfilter(dev, kn);
583 }
584
585 return err;
586 }
587
588 /*
589 * Synch buffers associated with a block device
590 */
591 int
592 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
593 {
594 if (vp->v_type == VCHR)
595 return (0);
596 /*
597 * Flush all dirty buffers associated with a block device.
598 */
599 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
600
601 return (0);
602 }
603
604 int
605 spec_fsync(struct vnop_fsync_args *ap)
606 {
607 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
608 }
609
610 /*
611 * Just call the device strategy routine
612 */
613 extern int hard_throttle_on_root;
614 void IOSleep(int);
615
616 // a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
617 #define LOWPRI_INITIAL_WINDOW_MSECS 100
618 #define LOWPRI_WINDOW_MSECS_INC 50
619 #define LOWPRI_MAX_WINDOW_MSECS 200
620 #define LOWPRI_MAX_WAITING_MSECS 200
621 #define LOWPRI_SLEEP_INTERVAL 5
622
623
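/*
 * Per-device throttle state.  A static entry exists in _throttle_io_info[]
 * for each devbsdunit; file systems may instead allocate their own via
 * throttle_info_create() and attach it with throttle_info_mount_ref().
 */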
624 struct _throttle_io_info_t {
625 struct timeval last_normal_IO_timestamp;
626 struct timeval last_IO_timestamp;
627 SInt32 numthreads_throttling;
628 SInt32 refcnt;
629 SInt32 alloc;
630
631 };
632
633 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
634 int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
635 int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
636 int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
637 int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
638
639 #if 0
640 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
641 do { \
642 if ((debug_info)->alloc) \
643 printf("%s: "format, __FUNCTION__, ## args); \
644 } while(0)
645
646 #else
647 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
648 #endif
649
650 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
651 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_WINDOW_MSECS_INC, "");
652 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_MAX_WINDOW_MSECS, "");
653 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_MAX_WAITING_MSECS, "");
654
655 /*
656 * Release a reference; if the item was allocated and this was the last
657 * reference, free it.
658 *
659 * This routine always returns the old value.
660 */
661 static int
662 throttle_info_rel(struct _throttle_io_info_t *info)
663 {
664 SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
665
666 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
667 info, (int)(oldValue -1), info );
668
669 /* The reference count just went negative, very bad */
670 if (oldValue == 0)
671 panic("throttle info ref cnt went negative!");
672
673 /*
674 * Once reference count is zero, no one else should be able to take a
675 * reference
676 */
677 if ((info->refcnt == 0) && (info->alloc)) {
678 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
679 FREE(info, M_TEMP);
680 }
681 return oldValue;
682 }
683
684 /*
685 * Just take a reference on the throttle info structure.
686 *
687 * This routine always returns the old value.
688 */
689 static SInt32
690 throttle_info_ref(struct _throttle_io_info_t *info)
691 {
692 SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
693
694 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
695 info, (int)(oldValue + 1), info );
696 /* Allocated items should never have a reference of zero */
697 if (info->alloc && (oldValue == 0))
698 panic("Taking a reference without calling create throttle info!\n");
699
700 return oldValue;
701 }
702
703 /*
704 * KPI routine
705 *
706 * Create and take a reference on a throttle info structure and return a
707 * pointer for the file system to use when calling throttle_info_update.
708 * Calling file system must have a matching release for every create.
709 */
710 void *
711 throttle_info_create(void)
712 {
713 struct _throttle_io_info_t *info;
714
715 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
716 /* Should never happen but just in case */
717 if (info == NULL)
718 return NULL;
719 /* Mark that this one was allocated and needs to be freed */
720 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
721 info->alloc = TRUE;
722 /* Take a reference */
723 OSIncrementAtomic(&info->refcnt);
724 return info;
725 }
726
727 /*
728 * KPI routine
729 *
730 * Release the throttle info pointer when all references are gone. Should be
731 * called to release the reference taken by throttle_info_create.
732 */
733 void
734 throttle_info_release(void *throttle_info)
735 {
736 DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
737 (struct _throttle_io_info_t *)throttle_info,
738 (struct _throttle_io_info_t *)throttle_info);
739 if (throttle_info) /* Just to be careful */
740 throttle_info_rel(throttle_info);
741 }
742
743 /*
744 * KPI routine
745 *
746 * File systems that create an info structure need to call this routine in
747 * their mount routine (it is used by the cluster code). File systems that
748 * call this in their mount routine must call throttle_info_mount_rel in
749 * their unmount routine.
750 */
751 void
752 throttle_info_mount_ref(mount_t mp, void *throttle_info)
753 {
754 if ((throttle_info == NULL) || (mp == NULL))
755 return;
756 throttle_info_ref(throttle_info);
757 /* If the mount already holds a reference, release it before adding the new one */
758 if (mp->mnt_throttle_info)
759 throttle_info_rel(mp->mnt_throttle_info);
760 mp->mnt_throttle_info = throttle_info;
761 }
762
763 /*
764 * KPI routine
765 *
766 * File systems that call throttle_info_mount_ref must call this routine in
767 * their unmount routine.
768 */
769 void
770 throttle_info_mount_rel(mount_t mp)
771 {
772 if (mp->mnt_throttle_info)
773 throttle_info_rel(mp->mnt_throttle_info);
774 mp->mnt_throttle_info = NULL;
775 }
776
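/*
 * Illustrative sketch (compiled out): how a file system that allocates its
 * own throttle info would pair these KPI calls between mount and unmount.
 * The struct and function names (struct myfs_mount, mfs_throttle,
 * myfs_mount_throttle_setup, myfs_unmount_throttle_teardown) are
 * hypothetical and exist only for this example.
 */
#if 0
struct myfs_mount {
	void	*mfs_throttle;
};

static int
myfs_mount_throttle_setup(mount_t mp, struct myfs_mount *mfsp)
{
	/* one create ... */
	mfsp->mfs_throttle = throttle_info_create();
	if (mfsp->mfs_throttle == NULL)
		return (ENOMEM);
	/* ... plus a mount reference, dropped again at unmount time */
	throttle_info_mount_ref(mp, mfsp->mfs_throttle);
	return (0);
}

static void
myfs_unmount_throttle_teardown(mount_t mp, struct myfs_mount *mfsp)
{
	throttle_info_mount_rel(mp);
	if (mfsp->mfs_throttle != NULL)
		throttle_info_release(mfsp->mfs_throttle);	/* matches the create */
	mfsp->mfs_throttle = NULL;
}
#endif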
777 void
778 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
779 {
780 struct _throttle_io_info_t *info;
781
782 if (mp == NULL)
783 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
784 else if (mp->mnt_throttle_info == NULL)
785 info = &_throttle_io_info[mp->mnt_devbsdunit];
786 else
787 info = mp->mnt_throttle_info;
788
789 *tv = info->last_IO_timestamp;
790 }
791
792 void
793 update_last_io_time(mount_t mp)
794 {
795 struct _throttle_io_info_t *info;
796
797 if (mp == NULL)
798 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
799 else if (mp->mnt_throttle_info == NULL)
800 info = &_throttle_io_info[mp->mnt_devbsdunit];
801 else
802 info = mp->mnt_throttle_info;
803
804 microuptime(&info->last_IO_timestamp);
805 }
806
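/*
 * Return non-zero if low priority I/O should still be held back: true when
 * less than lowpri_window_msecs have elapsed since the last normal priority
 * I/O recorded for this throttle info.  A window of -1 selects the maximum
 * waiting time.
 */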
807 static int
808 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
809 {
810 struct _throttle_io_info_t *info = throttle_info;
811 struct timeval elapsed;
812 int elapsed_msecs;
813
814 microuptime(&elapsed);
815 timevalsub(&elapsed, &info->last_normal_IO_timestamp);
816 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
817
818 if (lowpri_window_msecs == -1) // use the max waiting time
819 lowpri_window_msecs = lowpri_max_waiting_msecs;
820
821 return elapsed_msecs < lowpri_window_msecs;
822 }
823
824 /*
825 * If we have a mount point and it has a throttle info pointer then
826 * use it to do the check, otherwise use the device unit number to find
827 * the correct throttle info array element.
828 */
829 int
830 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
831 {
832 void *info;
833
834 /* Should we just return zero if no mount point */
835 if (mp == NULL)
836 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
837 else if (mp->mnt_throttle_info == NULL)
838 info = &_throttle_io_info[mp->mnt_devbsdunit];
839 else
840 info = mp->mnt_throttle_info;
841 return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
842 }
843
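/*
 * Called on the way back out of the system call (or pagein path) that issued
 * low priority I/O.  If the current uthread was marked for throttling by
 * throttle_info_update() and ok_to_sleep is TRUE, sleep in
 * LOWPRI_SLEEP_INTERVAL millisecond increments until the throttle window has
 * passed or the maximum waiting time is exhausted, then clear the thread's
 * throttle state and drop its throttle info reference.
 */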
844 void throttle_lowpri_io(boolean_t ok_to_sleep)
845 {
846 int i;
847 int max_try_num;
848 struct uthread *ut;
849 struct _throttle_io_info_t *info;
850
851 ut = get_bsdthread_info(current_thread());
852
853 if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
854 goto done;
855
856 info = ut->uu_throttle_info;
857 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
858 ut->uu_lowpri_window, ok_to_sleep, 0, 0, 0);
859
860 if (ok_to_sleep == TRUE) {
861 max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, info->numthreads_throttling);
862
863 for (i=0; i<max_try_num; i++) {
864 if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
865 IOSleep(LOWPRI_SLEEP_INTERVAL);
866 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
867 } else {
868 break;
869 }
870 }
871 }
872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
873 ut->uu_lowpri_window, i * LOWPRI_SLEEP_INTERVAL, 0, 0, 0);
874 SInt32 oldValue;
875 oldValue = OSDecrementAtomic(&info->numthreads_throttling);
876
877 if (oldValue <= 0) {
878 panic("%s: numthreads negative", __func__);
879 }
880 done:
881 ut->uu_lowpri_window = 0;
882 if (ut->uu_throttle_info)
883 throttle_info_rel(ut->uu_throttle_info);
884 ut->uu_throttle_info = NULL;
885 }
886
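/*
 * Return the effective disk I/O policy for the current thread and pass the
 * current uthread back through *ut.  A thread-level policy other than
 * IOPOL_DEFAULT overrides the process-level policy.
 */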
887 int throttle_get_io_policy(struct uthread **ut)
888 {
889 int policy = IOPOL_DEFAULT;
890 proc_t p = current_proc();
891
892 *ut = get_bsdthread_info(current_thread());
893
894 if (p != NULL)
895 policy = p->p_iopol_disk;
896
897 if (*ut != NULL) {
898 // the I/O policy of the thread overrides that of the process
899 // unless the I/O policy of the thread is default
900 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
901 policy = (*ut)->uu_iopol_disk;
902 }
903 return policy;
904 }
905
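/*
 * KPI routine
 *
 * Note that an I/O is about to be issued against the given throttle info.
 * Normal (non-passive, non-throttled) I/O refreshes last_normal_IO_timestamp;
 * throttleable I/O instead charges a low priority window to the issuing
 * uthread, which throttle_lowpri_io() uses to delay the thread just before
 * it returns from the triggering system call or pagein.
 */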
906 void throttle_info_update(void *throttle_info, int flags)
907 {
908 struct _throttle_io_info_t *info = throttle_info;
909 struct uthread *ut;
910 int policy;
911 int is_throttleable_io = 0;
912 int is_passive_io = 0;
913 SInt32 oldValue;
914
915 if (!lowpri_IO_initial_window_msecs || (info == NULL))
916 return;
917 policy = throttle_get_io_policy(&ut);
918
919 switch (policy) {
920 case IOPOL_DEFAULT:
921 case IOPOL_NORMAL:
922 break;
923 case IOPOL_THROTTLE:
924 is_throttleable_io = 1;
925 break;
926 case IOPOL_PASSIVE:
927 is_passive_io = 1;
928 break;
929 default:
930 printf("unknown I/O policy %d", policy);
931 break;
932 }
933
934 if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
935 is_passive_io |= 1;
936
937 if (!is_throttleable_io) {
938 if (!is_passive_io){
939 microuptime(&info->last_normal_IO_timestamp);
940 }
941 } else if (ut) {
942 /*
943 * I'd really like to do the IOSleep here, but
944 * we may be holding all kinds of filesystem related locks
945 * and the pages for this I/O marked 'busy'...
946 * we don't want to cause a normal task to block on
947 * one of these locks while we're throttling a task marked
948 * for low priority I/O... we'll mark the uthread and
949 * do the delay just before we return from the system
950 * call that triggered this I/O or from vnode_pagein
951 */
952 if (ut->uu_lowpri_window == 0) {
953 ut->uu_throttle_info = info;
954 throttle_info_ref(ut->uu_throttle_info);
955 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
956
957 oldValue = OSIncrementAtomic(&info->numthreads_throttling);
958 if (oldValue < 0) {
959 panic("%s: numthreads negative", __func__);
960 }
961 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
962 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
963 } else {
964 /* The thread sends I/Os to different devices within the same system call */
965 if (ut->uu_throttle_info != info) {
966 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
967
968 // keep track of the numthreads in the right device
969 OSDecrementAtomic(&old_info->numthreads_throttling);
970 OSIncrementAtomic(&info->numthreads_throttling);
971
972 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
973 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
974 /* This thread no longer needs a reference on that throttle info */
975 throttle_info_rel(ut->uu_throttle_info);
976 ut->uu_throttle_info = info;
977 /* Need to take a reference on this throttle info */
978 throttle_info_ref(ut->uu_throttle_info);
979 }
980 int numthreads = MAX(1, info->numthreads_throttling);
981 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
982 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
983 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
984 }
985 }
986 }
987
988 int
989 spec_strategy(struct vnop_strategy_args *ap)
990 {
991 buf_t bp;
992 int bflags;
993 int policy;
994 dev_t bdev;
995 uthread_t ut;
996 size_t devbsdunit;
997 mount_t mp;
998
999 bp = ap->a_bp;
1000 bdev = buf_device(bp);
1001 bflags = buf_flags(bp);
1002 mp = buf_vnode(bp)->v_mount;
1003
1004 if (kdebug_enable) {
1005 int code = 0;
1006
1007 if (bflags & B_READ)
1008 code |= DKIO_READ;
1009 if (bflags & B_ASYNC)
1010 code |= DKIO_ASYNC;
1011
1012 if (bflags & B_META)
1013 code |= DKIO_META;
1014 else if (bflags & B_PAGEIO)
1015 code |= DKIO_PAGING;
1016
1017 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1018 bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
1019 }
1020 if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
1021 mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
1022 hard_throttle_on_root = 1;
1023
1024
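/*
 * Charge this I/O to the throttle info for the underlying device (or to the
 * catch-all slot when there is no mount), and tag the buffer if the issuing
 * thread is running under a throttled I/O policy.
 */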
1025 if (mp != NULL)
1026 devbsdunit = mp->mnt_devbsdunit;
1027 else
1028 devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
1029
1030 throttle_info_update(&_throttle_io_info[devbsdunit], bflags);
1031 if ((policy = throttle_get_io_policy(&ut)) == IOPOL_THROTTLE) {
1032 bp->b_flags |= B_THROTTLED_IO;
1033 }
1034
1035
1036 if ((bflags & B_READ) == 0) {
1037 microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
1038 if (mp) {
1039 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
1040 }
1041 } else if (mp) {
1042 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
1043 }
1044
1045 (*bdevsw[major(bdev)].d_strategy)(bp);
1046
1047 return (0);
1048 }
1049
1050
1051 /*
1052 * Block mapping is not supported for special files.
1053 */
1054 int
1055 spec_blockmap(__unused struct vnop_blockmap_args *ap)
1056 {
1057 return (ENOTSUP);
1058 }
1059
1060
1061 /*
1062 * Device close routine
1063 */
1064 int
1065 spec_close(struct vnop_close_args *ap)
1066 {
1067 struct vnode *vp = ap->a_vp;
1068 dev_t dev = vp->v_rdev;
1069 int (*devclose)(dev_t, int, int, struct proc *);
1070 int mode, error;
1071 int flags = ap->a_fflag;
1072 struct proc *p = vfs_context_proc(ap->a_context);
1073 struct session *sessp;
1074
1075 switch (vp->v_type) {
1076
1077 case VCHR:
1078 /*
1079 * Hack: a tty device that is a controlling terminal
1080 * has a reference from the session structure.
1081 * We cannot easily tell that a character device is
1082 * a controlling terminal, unless it is the closing
1083 * process' controlling terminal. In that case,
1084 * if the reference count is 1 (this is the very last
1085 * close), clear the session's tty fields and drop the vnode reference it held.
1086 */
1087 sessp = proc_session(p);
1088 if (sessp != SESSION_NULL) {
1089 if ((vcount(vp) == 1) &&
1090 (vp == sessp->s_ttyvp)) {
1091 session_lock(sessp);
1092 sessp->s_ttyvp = NULL;
1093 sessp->s_ttyvid = 0;
1094 sessp->s_ttyp = TTY_NULL;
1095 sessp->s_ttypgrpid = NO_PID;
1096 session_unlock(sessp);
1097 vnode_rele(vp);
1098 }
1099 session_rele(sessp);
1100 }
1101
1102 devclose = cdevsw[major(dev)].d_close;
1103 mode = S_IFCHR;
1104 /*
1105 * close on last reference or on vnode revoke call
1106 */
1107 if ((flags & IO_REVOKE) != 0)
1108 break;
1109 if (vcount(vp) > 0)
1110 return (0);
1111 break;
1112
1113 case VBLK:
1114 /*
1115 * Since every use (buffer, vnode, swap, blockmap)
1116 * holds a reference to the vnode, and because we mark
1117 * any other vnodes that alias this device, when the
1118 * sum of the reference counts on all the aliased
1119 * vnodes descends to zero, we are on last close.
1120 */
1121 if (vcount(vp) > 0)
1122 return (0);
1123
1124 /*
1125 * On last close of a block device (that isn't mounted)
1126 * we must invalidate any in core blocks, so that
1127 * we can, for instance, change floppy disks.
1128 */
1129 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
1130 return (error);
1131
1132 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1133 if (error)
1134 return (error);
1135
1136 devclose = bdevsw[major(dev)].d_close;
1137 mode = S_IFBLK;
1138 break;
1139
1140 default:
1141 panic("spec_close: not special");
1142 return(EBADF);
1143 }
1144
1145 return ((*devclose)(dev, flags, mode, p));
1146 }
1147
1148 /*
1149 * Return POSIX pathconf information applicable to special devices.
1150 */
1151 int
1152 spec_pathconf(struct vnop_pathconf_args *ap)
1153 {
1154
1155 switch (ap->a_name) {
1156 case _PC_LINK_MAX:
1157 *ap->a_retval = LINK_MAX;
1158 return (0);
1159 case _PC_MAX_CANON:
1160 *ap->a_retval = MAX_CANON;
1161 return (0);
1162 case _PC_MAX_INPUT:
1163 *ap->a_retval = MAX_INPUT;
1164 return (0);
1165 case _PC_PIPE_BUF:
1166 *ap->a_retval = PIPE_BUF;
1167 return (0);
1168 case _PC_CHOWN_RESTRICTED:
1169 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
1170 return (0);
1171 case _PC_VDISABLE:
1172 *ap->a_retval = _POSIX_VDISABLE;
1173 return (0);
1174 default:
1175 return (EINVAL);
1176 }
1177 /* NOTREACHED */
1178 }
1179
1180 /*
1181 * Special device failed operation
1182 */
1183 int
1184 spec_ebadf(__unused void *dummy)
1185 {
1186
1187 return (EBADF);
1188 }
1189
1190 /* Blktooff derives file offset from logical block number */
1191 int
1192 spec_blktooff(struct vnop_blktooff_args *ap)
1193 {
1194 struct vnode *vp = ap->a_vp;
1195
1196 switch (vp->v_type) {
1197 case VCHR:
1198 *ap->a_offset = (off_t)-1; /* failure */
1199 return (ENOTSUP);
1200
1201 case VBLK:
1202 printf("spec_blktooff: not implemented for VBLK\n");
1203 *ap->a_offset = (off_t)-1; /* failure */
1204 return (ENOTSUP);
1205
1206 default:
1207 panic("spec_blktooff type");
1208 }
1209 /* NOTREACHED */
1210
1211 return (0);
1212 }
1213
1214 /* Offtoblk derives logical block number from file offset */
1215 int
1216 spec_offtoblk(struct vnop_offtoblk_args *ap)
1217 {
1218 struct vnode *vp = ap->a_vp;
1219
1220 switch (vp->v_type) {
1221 case VCHR:
1222 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1223 return (ENOTSUP);
1224
1225 case VBLK:
1226 printf("spec_offtoblk: not implemented for VBLK\n");
1227 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1228 return (ENOTSUP);
1229
1230 default:
1231 panic("spec_offtoblk type");
1232 }
1233 /* NOTREACHED */
1234
1235 return (0);
1236 }