bsd/miscfs/specfs/spec_vnops.c
1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <vfs/vfs_support.h>
86 #include <kern/assert.h>
87 #include <kern/task.h>
88
89 #include <sys/kdebug.h>
90
91 /* XXX the following prototypes should be in a header file somewhere */
92 extern dev_t chrtoblk(dev_t dev);
93 extern int iskmemdev(dev_t dev);
94 extern int bpfkqfilter(dev_t dev, struct knote *kn);
95 extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
96
97 struct vnode *speclisth[SPECHSZ];
98
99 /* symbolic sleep message strings for devices */
100 char devopn[] = "devopn";
101 char devio[] = "devio";
102 char devwait[] = "devwait";
103 char devin[] = "devin";
104 char devout[] = "devout";
105 char devioc[] = "devioc";
106 char devcls[] = "devcls";
107
108 #define VOPFUNC int (*)(void *)
109
110 int (**spec_vnodeop_p)(void *);
111 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
112 { &vnop_default_desc, (VOPFUNC)vn_default_error },
113 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
114 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */
115 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
116 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
117 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
118 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
119 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
120 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
121 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
122 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
123 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
124 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
125 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
126 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
127 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
128 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
129 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */
130 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
131 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
132 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
133 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
134 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
135 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
136 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
137 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
138 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
139 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
140 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
141 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
142 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
143 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
144 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
145 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
146 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
147 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
148 { (struct vnodeop_desc*)NULL, (int(*)())NULL }
149 };
150 struct vnodeopv_desc spec_vnodeop_opv_desc =
151 { &spec_vnodeop_p, spec_vnodeop_entries };
152
153
154 static void set_blocksize(vnode_t, dev_t);
155
156
157 /*
158 * Trivial lookup routine that always fails.
159 */
160 int
161 spec_lookup(struct vnop_lookup_args *ap)
162 {
163
164 *ap->a_vpp = NULL;
165 return (ENOTDIR);
166 }
167
168 static void
169 set_blocksize(struct vnode *vp, dev_t dev)
170 {
171 int (*size)(dev_t);
172 int rsize;
173
174 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
175 rsize = (*size)(dev);
176 if (rsize <= 0) /* did size fail? */
177 vp->v_specsize = DEV_BSIZE;
178 else
179 vp->v_specsize = rsize;
180 }
181 else
182 vp->v_specsize = DEV_BSIZE;
183 }
184
185 void
186 set_fsblocksize(struct vnode *vp)
187 {
188
189 if (vp->v_type == VBLK) {
190 dev_t dev = (dev_t)vp->v_rdev;
191 int maj = major(dev);
192
193 if ((u_int)maj >= (u_int)nblkdev)
194 return;
195
196 vnode_lock(vp);
197 set_blocksize(vp, dev);
198 vnode_unlock(vp);
199 }
200
201 }
202
203
204 /*
205 * Open a special file.
206 */
207 int
208 spec_open(struct vnop_open_args *ap)
209 {
210 struct proc *p = vfs_context_proc(ap->a_context);
211 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
212 struct vnode *vp = ap->a_vp;
213 dev_t bdev, dev = (dev_t)vp->v_rdev;
214 int maj = major(dev);
215 int error;
216
217 /*
218 * Don't allow open if fs is mounted -nodev.
219 */
220 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
221 return (ENXIO);
222
223 switch (vp->v_type) {
224
225 case VCHR:
226 if ((u_int)maj >= (u_int)nchrdev)
227 return (ENXIO);
228 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
229 /*
230 * When running in very secure mode, do not allow
231 * opens for writing of any disk character devices.
232 */
233 if (securelevel >= 2 && isdisk(dev, VCHR))
234 return (EPERM);
235 /*
236 * When running in secure mode, do not allow opens
237 * for writing of /dev/mem, /dev/kmem, or character
238 * devices whose corresponding block devices are
239 * currently mounted.
240 */
241 if (securelevel >= 1) {
242 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
243 return (error);
244 if (iskmemdev(dev))
245 return (EPERM);
246 }
247 }
248 if (cdevsw[maj].d_type == D_TTY) {
249 vnode_lock(vp);
250 vp->v_flag |= VISTTY;
251 vnode_unlock(vp);
252 }
253
254 devsw_lock(dev, S_IFCHR);
255 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
256
257 if (error == 0) {
258 vp->v_specinfo->si_opencount++;
259 }
260
261 devsw_unlock(dev, S_IFCHR);
262 return (error);
263
264 case VBLK:
265 if ((u_int)maj >= (u_int)nblkdev)
266 return (ENXIO);
267 /*
268 * When running in very secure mode, do not allow
269 * opens for writing of any disk block devices.
270 */
271 if (securelevel >= 2 && cred != FSCRED &&
272 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
273 return (EPERM);
274 /*
275 * Do not allow opens of block devices that are
276 * currently mounted.
277 */
278 if ( (error = vfs_mountedon(vp)) )
279 return (error);
280
281 devsw_lock(dev, S_IFBLK);
282 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
283 if (!error) {
284 vp->v_specinfo->si_opencount++;
285 }
286 devsw_unlock(dev, S_IFBLK);
287
288 if (!error) {
289 u_int64_t blkcnt;
290 u_int32_t blksize;
291 int setsize = 0;
292 u_int32_t size512 = 512;
293
294
295 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
296 /* Switch to 512 byte sectors (temporarily) */
297
298 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
299 /* Get the number of 512 byte physical blocks. */
300 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
301 setsize = 1;
302 }
303 }
304 /* If it doesn't set back, we can't recover */
305 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
306 error = ENXIO;
307 }
308
309
310 vnode_lock(vp);
311 set_blocksize(vp, dev);
312
313 /*
314 * Cache the size in bytes of the block device for later
315 * use by spec_write().
316 */
317 if (setsize)
318 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
319 else
320 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
321
322 vnode_unlock(vp);
323
324 }
325 return(error);
326 default:
327 panic("spec_open type");
328 }
329 return (0);
330 }
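
/*
 * Minimal user-space sketch (not part of the kernel sources) of the same
 * DKIOC* queries spec_open() performs above.  The device path /dev/disk0,
 * and the assumption that the caller may open it, are placeholders.
 */
#if 0
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint32_t blksize = 0;
        uint64_t blkcnt = 0;
        int fd = open("/dev/disk0", O_RDONLY);  /* hypothetical device node */

        if (fd < 0)
                return 1;
        /* same ioctls spec_open() issues via VNOP_IOCTL() */
        if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == 0 &&
            ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == 0)
                printf("%u-byte sectors, %llu sectors\n",
                    blksize, (unsigned long long)blkcnt);
        close(fd);
        return 0;
}
#endif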
331
332 /*
333 * Vnode op for read
334 */
335 int
336 spec_read(struct vnop_read_args *ap)
337 {
338 struct vnode *vp = ap->a_vp;
339 struct uio *uio = ap->a_uio;
340 struct buf *bp;
341 daddr64_t bn, nextbn;
342 long bsize, bscale;
343 int devBlockSize=0;
344 int n, on;
345 int error = 0;
346 dev_t dev;
347
348 #if DIAGNOSTIC
349 if (uio->uio_rw != UIO_READ)
350 panic("spec_read mode");
351 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
352 panic("spec_read proc");
353 #endif
354 if (uio_resid(uio) == 0)
355 return (0);
356
357 switch (vp->v_type) {
358
359 case VCHR:
360 error = (*cdevsw[major(vp->v_rdev)].d_read)
361 (vp->v_rdev, uio, ap->a_ioflag);
362 return (error);
363
364 case VBLK:
365 if (uio->uio_offset < 0)
366 return (EINVAL);
367
368 dev = vp->v_rdev;
369
370 devBlockSize = vp->v_specsize;
371
372 if (devBlockSize > PAGE_SIZE)
373 return (EINVAL);
374
375 bscale = PAGE_SIZE / devBlockSize;
376 bsize = bscale * devBlockSize;
377
378 do {
379 on = uio->uio_offset % bsize;
380
381 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
382
383 if (vp->v_speclastr + bscale == bn) {
384 nextbn = bn + bscale;
385 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
386 (int *)&bsize, 1, NOCRED, &bp);
387 } else
388 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
389
390 vnode_lock(vp);
391 vp->v_speclastr = bn;
392 vnode_unlock(vp);
393
394 n = bsize - buf_resid(bp);
395 if ((on > n) || error) {
396 if (!error)
397 error = EINVAL;
398 buf_brelse(bp);
399 return (error);
400 }
401 n = min((unsigned)(n - on), uio_resid(uio));
402
403 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
404 if (n + on == bsize)
405 buf_markaged(bp);
406 buf_brelse(bp);
407 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
408 return (error);
409
410 default:
411 panic("spec_read type");
412 }
413 /* NOTREACHED */
414
415 return (0);
416 }
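
/*
 * Worked example of the block arithmetic above (illustrative, assuming
 * 512-byte device blocks and a 4096-byte PAGE_SIZE):
 *
 *	bscale = 4096 / 512 = 8			device blocks per buffer
 *	bsize  = 8 * 512   = 4096		bytes transferred per iteration
 *
 * For uio_offset == 6144:
 *	on = 6144 % 4096 = 2048			byte offset into the buffer
 *	bn = (6144 / 512) & ~(8 - 1) = 8	buffer-aligned device block
 *
 * so the read is staged through the 4 KB buffer that begins at device
 * block 8, starting 2048 bytes into it.
 */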
417
418 /*
419 * Vnode op for write
420 */
421 int
422 spec_write(struct vnop_write_args *ap)
423 {
424 struct vnode *vp = ap->a_vp;
425 struct uio *uio = ap->a_uio;
426 struct buf *bp;
427 daddr64_t bn;
428 int bsize, blkmask, bscale;
429 int io_sync;
430 int devBlockSize=0;
431 int n, on;
432 int error = 0;
433 dev_t dev;
434
435 #if DIAGNOSTIC
436 if (uio->uio_rw != UIO_WRITE)
437 panic("spec_write mode");
438 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
439 panic("spec_write proc");
440 #endif
441
442 switch (vp->v_type) {
443
444 case VCHR:
445 error = (*cdevsw[major(vp->v_rdev)].d_write)
446 (vp->v_rdev, uio, ap->a_ioflag);
447 return (error);
448
449 case VBLK:
450 if (uio_resid(uio) == 0)
451 return (0);
452 if (uio->uio_offset < 0)
453 return (EINVAL);
454
455 io_sync = (ap->a_ioflag & IO_SYNC);
456
457 dev = (vp->v_rdev);
458
459 devBlockSize = vp->v_specsize;
460 if (devBlockSize > PAGE_SIZE)
461 return(EINVAL);
462
463 bscale = PAGE_SIZE / devBlockSize;
464 blkmask = bscale - 1;
465 bsize = bscale * devBlockSize;
466
467
468 do {
469 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
470 on = uio->uio_offset % bsize;
471
472 n = min((unsigned)(bsize - on), uio_resid(uio));
473
474 /*
475 * Use buf_getblk() as an optimization IFF:
476 *
477 * 1) We are writing exactly a block on a block
478 * aligned boundary
479 * 2) We know the size of the device from spec_open
480 * 3) The write doesn't span the end of the device
481 *
482 * Otherwise, we fall back on buf_bread().
483 */
484 if (n == bsize &&
485 vp->v_specdevsize != (u_int64_t)0 &&
486 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
487 /* reduce the size of the write to what is there */
488 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
489 }
490
491 if (n == bsize)
492 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
493 else
494 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
495
496 /* Translate downstream error for upstream, if needed */
497 if (!error)
498 error = (int)buf_error(bp);
499 if (error) {
500 buf_brelse(bp);
501 return (error);
502 }
503 n = min(n, bsize - buf_resid(bp));
504
505 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
506 if (error) {
507 buf_brelse(bp);
508 return (error);
509 }
510 buf_markaged(bp);
511
512 if (io_sync)
513 error = buf_bwrite(bp);
514 else {
515 if ((n + on) == bsize)
516 error = buf_bawrite(bp);
517 else
518 error = buf_bdwrite(bp);
519 }
520 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
521 return (error);
522
523 default:
524 panic("spec_write type");
525 }
526 /* NOTREACHED */
527
528 return (0);
529 }
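
/*
 * Worked example of the buf_getblk()/buf_bread() choice above
 * (illustrative, 512-byte device blocks, 4096-byte bsize): a 4096-byte
 * write at offset 8192 has on == 0 and n == bsize, so the whole buffer
 * is overwritten and buf_getblk() avoids reading the block first; a
 * 1024-byte write at offset 8704 has on == 512 and n == 1024, so the
 * underlying buffer (bn == 16) must be read with buf_bread() before the
 * partial overwrite.
 */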
530
531 /*
532 * Device ioctl operation.
533 */
534 int
535 spec_ioctl(struct vnop_ioctl_args *ap)
536 {
537 proc_t p = vfs_context_proc(ap->a_context);
538 dev_t dev = ap->a_vp->v_rdev;
539 int retval = 0;
540
541 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
542 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
543
544 switch (ap->a_vp->v_type) {
545
546 case VCHR:
547 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
548 ap->a_fflag, p);
549 break;
550
551 case VBLK:
552 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
553 ap->a_fflag, p);
554 break;
555
556 default:
557 panic("spec_ioctl");
558 /* NOTREACHED */
559 }
560 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
561 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
562
563 return (retval);
564 }
565
566 int
567 spec_select(struct vnop_select_args *ap)
568 {
569 proc_t p = vfs_context_proc(ap->a_context);
570 dev_t dev;
571
572 switch (ap->a_vp->v_type) {
573
574 default:
575 return (1); /* XXX */
576
577 case VCHR:
578 dev = ap->a_vp->v_rdev;
579 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
580 }
581 }
582
583 static int filt_specattach(struct knote *kn);
584
585 int
586 spec_kqfilter(vnode_t vp, struct knote *kn)
587 {
588 dev_t dev;
589 int err = EINVAL;
590
591 /*
592 * For a few special kinds of devices, we can attach knotes.
593 * Each filter function must check whether the dev type matches it.
594 */
595 dev = vnode_specrdev(vp);
596
597 if (vnode_istty(vp)) {
598 /* We can hook into TTYs... */
599 err = filt_specattach(kn);
600 } else {
601 /* Try a bpf device, as defined in bsd/net/bpf.c */
602 err = bpfkqfilter(dev, kn);
603 }
604
605 return err;
606 }
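
/*
 * User-space sketch (not part of the kernel sources): registering a read
 * filter on a TTY character device reaches spec_kqfilter() above and
 * attaches via filt_specattach().  The /dev/tty path is an assumption.
 */
#if 0
#include <sys/event.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
        struct kevent kev;
        int kq = kqueue();
        int fd = open("/dev/tty", O_RDONLY);

        if (kq < 0 || fd < 0)
                return 1;
        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                return 1;
        /* later: kevent(kq, NULL, 0, &kev, 1, NULL) blocks until readable */
        close(fd);
        close(kq);
        return 0;
}
#endif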
607
608 /*
609 * Synch buffers associated with a block device
610 */
611 int
612 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
613 {
614 if (vp->v_type == VCHR)
615 return (0);
616 /*
617 * Flush all dirty buffers associated with a block device.
618 */
619 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
620
621 return (0);
622 }
623
624 int
625 spec_fsync(struct vnop_fsync_args *ap)
626 {
627 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
628 }
629
630 /*
631 * Just call the device strategy routine
632 */
633 extern int hard_throttle_on_root;
634 void IOSleep(int);
635
636 // a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
637 #define LOWPRI_INITIAL_WINDOW_MSECS 100
638 #define LOWPRI_WINDOW_MSECS_INC 50
639 #define LOWPRI_MAX_WINDOW_MSECS 200
640 #define LOWPRI_MAX_WAITING_MSECS 200
641
642 #if CONFIG_EMBEDDED
643 #define LOWPRI_SLEEP_INTERVAL 5
644 #else
645 #define LOWPRI_SLEEP_INTERVAL 2
646 #endif
647
648 struct _throttle_io_info_t {
649 struct timeval last_normal_IO_timestamp;
650 struct timeval last_IO_timestamp;
651 SInt32 numthreads_throttling;
652 SInt32 refcnt;
653 SInt32 alloc;
654 };
655
656 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
657 int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
658 int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
659 int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
660 int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
661
662 #if 0
663 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
664 do { \
665 if ((debug_info)->alloc) \
666 printf("%s: "format, __FUNCTION__, ## args); \
667 } while(0)
668
669 #else
670 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
671 #endif
672
673 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
674 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
675 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
676 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
677
678 /*
679 * throttled I/O helper function
680 * convert the index of the lowest set bit to a device index
681 */
682 int
683 num_trailing_0(uint64_t n)
684 {
685 /*
686 * since in most cases the number of trailing 0s is very small,
687 * we simply count sequentially from the lowest bit
688 */
689 if (n == 0)
690 return sizeof(n) * 8;
691 int count = 0;
692 while (!ISSET(n, 1)) {
693 n >>= 1;
694 ++count;
695 }
696 return count;
697 }
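
/*
 * Example (illustrative): a throttle_mask with only bit 3 set maps to
 * device index 3, i.e. num_trailing_0(0x8) == 3, which is how
 * throttle_info_ref_by_mask() below selects its _throttle_io_info slot.
 * num_trailing_0(0) returns 64 (no bits set).
 */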
698
699 /*
700 * Release the reference; if the item was allocated and this was the last
701 * reference, free it.
702 *
703 * This routine always returns the old value.
704 */
705 static int
706 throttle_info_rel(struct _throttle_io_info_t *info)
707 {
708 SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
709
710 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
711 info, (int)(oldValue -1), info );
712
713 /* The reference count just went negative, very bad */
714 if (oldValue == 0)
715 panic("throttle info ref cnt went negative!");
716
717 /*
718 * Once reference count is zero, no one else should be able to take a
719 * reference
720 */
721 if ((info->refcnt == 0) && (info->alloc)) {
722 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
723 FREE(info, M_TEMP);
724 }
725 return oldValue;
726 }
727
728 /*
729 * Just take a reference on the throttle info structure.
730 *
731 * This routine always returns the old value.
732 */
733 static SInt32
734 throttle_info_ref(struct _throttle_io_info_t *info)
735 {
736 SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
737
738 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
739 info, (int)(oldValue -1), info );
740 /* Allocated items should never have a reference of zero */
741 if (info->alloc && (oldValue == 0))
742 panic("Taking a reference without calling create throttle info!\n");
743
744 return oldValue;
745 }
746
747 /*
748 * KPI routine
749 *
750 * Create and take a reference on a throttle info structure and return a
751 * pointer for the file system to use when calling throttle_info_update.
752 * Calling file system must have a matching release for every create.
753 */
754 void *
755 throttle_info_create(void)
756 {
757 struct _throttle_io_info_t *info;
758
759 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
760 /* Should never happen but just in case */
761 if (info == NULL)
762 return NULL;
763 /* Mark that this one was allocated and needs to be freed */
764 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
765 info->alloc = TRUE;
766 /* Take a reference */
767 OSIncrementAtomic(&info->refcnt);
768 return info;
769 }
770
771 /*
772 * KPI routine
773 *
774 * Release the throttle info pointer if all the references are gone. Should be
775 * called to release the reference taken by throttle_info_create.
776 */
777 void
778 throttle_info_release(void *throttle_info)
779 {
780 DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
781 (struct _throttle_io_info_t *)throttle_info,
782 (struct _throttle_io_info_t *)throttle_info);
783 if (throttle_info) /* Just to be careful */
784 throttle_info_rel(throttle_info);
785 }
786
787 /*
788 * KPI routine
789 *
790 * File systems that create an info structure need to call this routine in
791 * their mount routine (used by cluster code). File systems that call this in
792 * their mount routines must call throttle_info_mount_rel in their unmount
793 * routines.
794 */
795 void
796 throttle_info_mount_ref(mount_t mp, void *throttle_info)
797 {
798 if ((throttle_info == NULL) || (mp == NULL))
799 return;
800 throttle_info_ref(throttle_info);
801 /* If the mount already holds a throttle info reference, release it before installing the new one */
802 if (mp->mnt_throttle_info)
803 throttle_info_rel(mp->mnt_throttle_info);
804 mp->mnt_throttle_info = throttle_info;
805 }
806
807 /*
808 * Private KPI routine
809 *
810 * return a handle for accessing throttle_info given a throttle_mask. The
811 * handle must be released by throttle_info_rel_by_mask
812 */
813 int
814 throttle_info_ref_by_mask(uint64_t throttle_mask,
815 throttle_info_handle_t *throttle_info_handle)
816 {
817 int dev_index;
818 struct _throttle_io_info_t *info;
819
820 if (throttle_info_handle == NULL)
821 return EINVAL;
822
823 dev_index = num_trailing_0(throttle_mask);
824 info = &_throttle_io_info[dev_index];
825 throttle_info_ref(info);
826 *(struct _throttle_io_info_t**)throttle_info_handle = info;
827 return 0;
828 }
829
830 /*
831 * Private KPI routine
832 *
833 * release the handle obtained by throttle_info_ref_by_mask
834 */
835 void
836 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
837 {
838 /* for now the handle is just a pointer to _throttle_io_info_t */
839 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
840 }
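
/*
 * Sketch of the mask-based variant above (illustrative; the function name
 * and the throttle_mask value are placeholders): take a handle for the
 * device selected by the lowest set bit, note the I/O, drop the handle.
 */
#if 0
static void
example_note_io(uint64_t throttle_mask)
{
        throttle_info_handle_t handle;

        if (throttle_info_ref_by_mask(throttle_mask, &handle) == 0) {
                throttle_info_update_by_mask(handle, 0);
                throttle_info_rel_by_mask(handle);
        }
}
#endif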
841
842 /*
843 * KPI routine
844 *
845 * File systems that call throttle_info_mount_ref must call this routine in
846 * their unmount routine.
847 */
848 void
849 throttle_info_mount_rel(mount_t mp)
850 {
851 if (mp->mnt_throttle_info)
852 throttle_info_rel(mp->mnt_throttle_info);
853 mp->mnt_throttle_info = NULL;
854 }
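
/*
 * Sketch of the mount-based throttle KPI lifecycle for a hypothetical
 * file system (the examplefs_* functions and example_throttle variable
 * are placeholders, not part of this file):
 */
#if 0
static void *example_throttle;          /* hypothetical per-fs handle */

static int
examplefs_mount(mount_t mp)
{
        example_throttle = throttle_info_create();
        if (example_throttle == NULL)
                return ENOMEM;
        /* publish the info on the mount for the cluster/spec layers */
        throttle_info_mount_ref(mp, example_throttle);
        return 0;
}

static void
examplefs_issue_io(int bflags)
{
        /* low-overhead bookkeeping call before each I/O */
        throttle_info_update(example_throttle, bflags);
        /* ...queue the I/O... */
}

static int
examplefs_unmount(mount_t mp)
{
        throttle_info_mount_rel(mp);              /* drop the mount's reference */
        throttle_info_release(example_throttle);  /* matches throttle_info_create() */
        example_throttle = NULL;
        return 0;
}
#endif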
855
856 void
857 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
858 {
859 struct _throttle_io_info_t *info;
860
861 if (mp == NULL)
862 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
863 else if (mp->mnt_throttle_info == NULL)
864 info = &_throttle_io_info[mp->mnt_devbsdunit];
865 else
866 info = mp->mnt_throttle_info;
867
868 *tv = info->last_IO_timestamp;
869 }
870
871 void
872 update_last_io_time(mount_t mp)
873 {
874 struct _throttle_io_info_t *info;
875
876 if (mp == NULL)
877 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
878 else if (mp->mnt_throttle_info == NULL)
879 info = &_throttle_io_info[mp->mnt_devbsdunit];
880 else
881 info = mp->mnt_throttle_info;
882
883 microuptime(&info->last_IO_timestamp);
884 }
885
886
887 #if CONFIG_EMBEDDED
888
889 int throttle_get_io_policy(struct uthread **ut)
890 {
891 int policy = IOPOL_DEFAULT;
892 proc_t p = current_proc();
893
894 *ut = get_bsdthread_info(current_thread());
895
896 if (p != NULL)
897 policy = p->p_iopol_disk;
898
899 if (*ut != NULL) {
900 // the I/O policy of the thread overrides that of the process
901 // unless the I/O policy of the thread is default
902 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
903 policy = (*ut)->uu_iopol_disk;
904 }
905 return policy;
906 }
907 #else
908
909 int throttle_get_io_policy(__unused struct uthread **ut)
910 {
911 *ut = get_bsdthread_info(current_thread());
912
913 return (proc_get_task_selfdiskacc());
914 }
915 #endif
916
917
918 static int
919 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
920 {
921 struct _throttle_io_info_t *info = throttle_info;
922 struct timeval elapsed;
923 int elapsed_msecs;
924 int policy;
925 struct uthread *ut;
926
927 policy = throttle_get_io_policy(&ut);
928
929 if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
930 return (0);
931
932 microuptime(&elapsed);
933 timevalsub(&elapsed, &info->last_normal_IO_timestamp);
934 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
935
936 if (lowpri_window_msecs == -1) // use the max waiting time
937 lowpri_window_msecs = lowpri_max_waiting_msecs;
938
939 return elapsed_msecs < lowpri_window_msecs;
940 }
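
/*
 * Worked example of the check above (illustrative): if the last normal
 * (non-throttled) I/O on the device completed 60 msecs ago and the
 * thread's window is 200 msecs, then 60 < 200 and the I/O will be
 * throttled; once 200 msecs pass with no normal I/O, low priority I/O
 * proceeds without delay.
 */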
941
942 /*
943 * If we have a mount point and it has a throttle info pointer then
944 * use it to do the check, otherwise use the device unit number to find
945 * the correct throttle info array element.
946 */
947 int
948 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
949 {
950 void *info;
951
952 /* Should we just return zero if there is no mount point? */
953 if (mp == NULL)
954 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
955 else if (mp->mnt_throttle_info == NULL)
956 info = &_throttle_io_info[mp->mnt_devbsdunit];
957 else
958 info = mp->mnt_throttle_info;
959 return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
960 }
961
962 uint32_t
963 throttle_lowpri_io(int sleep_amount)
964 {
965 int sleep_cnt = 0;
966 int numthreads_throttling;
967 int max_try_num;
968 struct uthread *ut;
969 struct _throttle_io_info_t *info;
970 int max_waiting_msecs;
971
972 ut = get_bsdthread_info(current_thread());
973
974 if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
975 goto done;
976
977 info = ut->uu_throttle_info;
978
979 if (sleep_amount != 0) {
980 #if CONFIG_EMBEDDED
981 max_waiting_msecs = lowpri_max_waiting_msecs;
982 #else
983 if (ut->uu_throttle_isssd == TRUE)
984 max_waiting_msecs = lowpri_max_waiting_msecs / 100;
985 else
986 max_waiting_msecs = lowpri_max_waiting_msecs;
987 #endif
988 if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
989 max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;
990
991 numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
992 max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);
993
994 for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
995 if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
996 if (sleep_cnt == 0) {
997 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
998 ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
999 }
1000 IOSleep(LOWPRI_SLEEP_INTERVAL);
1001 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
1002 } else {
1003 break;
1004 }
1005 }
1006 if (sleep_cnt) {
1007 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1008 ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
1009 }
1010 }
1011 SInt32 oldValue;
1012 oldValue = OSDecrementAtomic(&info->numthreads_throttling);
1013
1014 if (oldValue <= 0) {
1015 panic("%s: numthreads negative", __func__);
1016 }
1017 done:
1018 ut->uu_lowpri_window = 0;
1019 if (ut->uu_throttle_info)
1020 throttle_info_rel(ut->uu_throttle_info);
1021 ut->uu_throttle_info = NULL;
1022 ut->uu_throttle_bc = FALSE;
1023
1024 return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
1025 }
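
/*
 * Worked example of the sleep budget above (illustrative, non-SSD,
 * non-embedded defaults): with max_waiting_msecs = 200,
 * LOWPRI_SLEEP_INTERVAL = 2, three threads already throttling and
 * sleep_amount = 1,
 *
 *	numthreads_throttling = 3 + MIN(10, MAX(1, 1)) - 1 = 3
 *	max_try_num = 200 / 2 * MAX(1, 3) = 300 iterations
 *
 * so the thread sleeps in 2 msec slices for at most ~600 msecs before
 * giving up on the window.
 */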
1026
1027 /*
1028 * KPI routine
1029 *
1030 * Set a kernel thread's I/O policy. policy can be:
1031 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
1032 *
1033 * These policies are explained in the setiopolicy_np(3) man page.
1034 */
1035 void throttle_set_thread_io_policy(int policy)
1036 {
1037 #if !CONFIG_EMBEDDED
1038 proc_apply_thread_selfdiskacc(policy);
1039 #else /* !CONFIG_EMBEDDED */
1040 struct uthread *ut;
1041 ut = get_bsdthread_info(current_thread());
1042 ut->uu_iopol_disk = policy;
1043 #endif /* !CONFIG_EMBEDDED */
1044 }
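
/*
 * Sketch of how a kernel thread doing background work might use the KPI
 * above (illustrative; the worker function and its work loop are
 * placeholders):
 */
#if 0
static void
example_background_worker(void)
{
        /* subsequent I/O issued by this thread is treated as low priority */
        throttle_set_thread_io_policy(IOPOL_THROTTLE);

        /* ...issue maintenance I/O... */

        /* restore the default policy for this thread */
        throttle_set_thread_io_policy(IOPOL_DEFAULT);
}
#endif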
1045
1046
1047 static
1048 void throttle_info_reset_window(struct uthread *ut)
1049 {
1050 struct _throttle_io_info_t *info;
1051
1052 info = ut->uu_throttle_info;
1053
1054 OSDecrementAtomic(&info->numthreads_throttling);
1055 throttle_info_rel(info);
1056 ut->uu_throttle_info = NULL;
1057 ut->uu_lowpri_window = 0;
1058 }
1059
1060 static
1061 void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
1062 {
1063 SInt32 oldValue;
1064
1065 ut->uu_throttle_info = info;
1066 throttle_info_ref(info);
1067 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1068
1069 oldValue = OSIncrementAtomic(&info->numthreads_throttling);
1070 if (oldValue < 0) {
1071 panic("%s: numthreads negative", __func__);
1072 }
1073 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
1074 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
1075 ut->uu_throttle_isssd = isssd;
1076 ut->uu_throttle_bc = BC_throttle;
1077 }
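
/*
 * Worked example of the initial window computation above (illustrative,
 * default tunables): with lowpri_IO_initial_window_msecs = 100,
 * lowpri_IO_window_msecs_inc = 50 and two threads already throttling on
 * the device (oldValue == 2), the new thread starts with
 *
 *	uu_lowpri_window = 100 + 50 * 2 = 200 msecs
 *
 * i.e. the window grows with the number of threads being throttled.
 */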
1078
1079
1080 static
1081 void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
1082 {
1083 struct _throttle_io_info_t *info = throttle_info;
1084 struct uthread *ut;
1085 int policy;
1086 int is_throttleable_io = 0;
1087 int is_passive_io = 0;
1088
1089 if (!lowpri_IO_initial_window_msecs || (info == NULL))
1090 return;
1091 policy = throttle_get_io_policy(&ut);
1092
1093 switch (policy) {
1094 case IOPOL_DEFAULT:
1095 case IOPOL_NORMAL:
1096 break;
1097 case IOPOL_THROTTLE:
1098 is_throttleable_io = 1;
1099 break;
1100 case IOPOL_PASSIVE:
1101 is_passive_io = 1;
1102 break;
1103 default:
1104 printf("unknown I/O policy %d", policy);
1105 break;
1106 }
1107
1108 if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
1109 is_passive_io |= 1;
1110
1111 if (!is_throttleable_io) {
1112 if (!is_passive_io){
1113 microuptime(&info->last_normal_IO_timestamp);
1114 }
1115 } else if (ut) {
1116 /*
1117 * I'd really like to do the IOSleep here, but
1118 * we may be holding all kinds of filesystem related locks
1119 * and the pages for this I/O marked 'busy'...
1120 * we don't want to cause a normal task to block on
1121 * one of these locks while we're throttling a task marked
1122 * for low priority I/O... we'll mark the uthread and
1123 * do the delay just before we return from the system
1124 * call that triggered this I/O or from vnode_pagein
1125 */
1126 if (ut->uu_lowpri_window == 0)
1127 throttle_info_set_initial_window(ut, info, isssd, FALSE);
1128 else {
1129 /* The thread sends I/Os to different devices within the same system call */
1130 if (ut->uu_throttle_info != info) {
1131 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
1132
1133 // keep track of the numthreads in the right device
1134 OSDecrementAtomic(&old_info->numthreads_throttling);
1135 OSIncrementAtomic(&info->numthreads_throttling);
1136
1137 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
1138 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
1139 /* This thread no longer needs a reference on that throttle info */
1140 throttle_info_rel(ut->uu_throttle_info);
1141 ut->uu_throttle_info = info;
1142 /* Need to take a reference on this throttle info */
1143 throttle_info_ref(ut->uu_throttle_info);
1144 }
1145 int numthreads = MAX(1, info->numthreads_throttling);
1146 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
1147 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
1148 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
1149
1150 if (isssd == FALSE) {
1151 /*
1152 * we're here because we've actually issued I/Os to different devices...
1153 * if at least one of them was a non-SSD, then throttle the thread
1154 * using the policy for non-SSDs
1155 */
1156 ut->uu_throttle_isssd = FALSE;
1157 }
1158 }
1159 }
1160 }
1161
1162 /*
1163 * KPI routine
1164 *
1165 * This is usually called before every I/O and is used for throttled I/O
1166 * bookkeeping. This routine has low overhead and does not sleep.
1167 */
1168 void throttle_info_update(void *throttle_info, int flags)
1169 {
1170 throttle_info_update_internal(throttle_info, flags, FALSE);
1171 }
1172
1173 /*
1174 * KPI routine
1175 *
1176 * This is usually called before every I/O and is used for throttled I/O
1177 * bookkeeping. This routine has low overhead and does not sleep.
1178 */
1179 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
1180 {
1181 void *throttle_info = throttle_info_handle;
1182 /* for now we only use the lowest bit of the throttle mask, so the
1183 * handle is the same as the throttle_info. Later if we store a
1184 * set of throttle infos in the handle, we will want to loop through
1185 * them and call throttle_info_update in a loop
1186 */
1187 throttle_info_update(throttle_info, flags);
1188 }
1189
1190 extern int ignore_is_ssd;
1191
1192 int
1193 spec_strategy(struct vnop_strategy_args *ap)
1194 {
1195 buf_t bp;
1196 int bflags;
1197 int policy;
1198 dev_t bdev;
1199 uthread_t ut;
1200 mount_t mp;
1201 int strategy_ret;
1202 struct _throttle_io_info_t *throttle_info;
1203 boolean_t isssd = FALSE;
1204
1205 bp = ap->a_bp;
1206 bdev = buf_device(bp);
1207 mp = buf_vnode(bp)->v_mount;
1208
1209 policy = throttle_get_io_policy(&ut);
1210
1211 if (policy == IOPOL_THROTTLE) {
1212 bp->b_flags |= B_THROTTLED_IO;
1213 bp->b_flags &= ~B_PASSIVE;
1214 } else if (policy == IOPOL_PASSIVE)
1215 bp->b_flags |= B_PASSIVE;
1216
1217 bflags = bp->b_flags;
1218
1219 if (kdebug_enable) {
1220 int code = 0;
1221
1222 if (bflags & B_READ)
1223 code |= DKIO_READ;
1224 if (bflags & B_ASYNC)
1225 code |= DKIO_ASYNC;
1226
1227 if (bflags & B_META)
1228 code |= DKIO_META;
1229 else if (bflags & B_PAGEIO)
1230 code |= DKIO_PAGING;
1231
1232 if (bflags & B_THROTTLED_IO)
1233 code |= DKIO_THROTTLE;
1234 else if (bflags & B_PASSIVE)
1235 code |= DKIO_PASSIVE;
1236
1237 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1238 bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
1239 }
1240 if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
1241 mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
1242 hard_throttle_on_root = 1;
1243
1244 if (mp != NULL) {
1245 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
1246 isssd = TRUE;
1247 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
1248 } else
1249 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1250
1251 throttle_info_update_internal(throttle_info, bflags, isssd);
1252
1253 if ((bflags & B_READ) == 0) {
1254 microuptime(&throttle_info->last_IO_timestamp);
1255 if (mp) {
1256 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
1257 }
1258 } else if (mp) {
1259 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
1260 }
1261 /*
1262 * The BootCache may give us special information about
1263 * the IO, so it returns special values that we check
1264 * for here.
1265 *
1266 * IO_SATISFIED_BY_CACHE
1267 * The read has been satisfied by the boot cache. Don't
1268 * throttle the thread unnecessarily.
1269 *
1270 * IO_SHOULD_BE_THROTTLED
1271 * The boot cache is playing back a playlist and this IO
1272 * cut through. Throttle it so we're not cutting through
1273 * the boot cache too often.
1274 *
1275 * Note that typical strategy routines are defined with
1276 * a void return so we'll get garbage here. In the
1277 * unlikely case the garbage matches our special return
1278 * value, it's not a big deal since we're only adjusting
1279 * the throttling delay.
1280 */
1281 #define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
1282 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
1283 typedef int strategy_fcn_ret_t(struct buf *bp);
1284
1285 strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
1286
1287 if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
1288 /*
1289 * If this was a throttled IO satisfied by the boot cache,
1290 * don't delay the thread.
1291 */
1292 throttle_info_reset_window(ut);
1293
1294 } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
1295 /*
1296 * If the boot cache indicates this IO should be throttled,
1297 * delay the thread.
1298 */
1299 throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
1300 }
1301 return (0);
1302 }
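
/*
 * Sketch of a strategy routine that cooperates with the return-value
 * convention checked above (illustrative only; a hypothetical caching
 * driver, not the real BootCache, and example_cache_lookup /
 * example_issue_to_disk are placeholder helpers):
 */
#if 0
static int
example_cached_strategy(struct buf *bp)
{
        if (example_cache_lookup(bp)) {         /* hypothetical cache hit test */
                buf_biodone(bp);                /* complete the I/O from the cache */
                return IO_SATISFIED_BY_CACHE;   /* tell spec_strategy not to delay the thread */
        }
        example_issue_to_disk(bp);              /* hypothetical pass-through */
        return IO_SHOULD_BE_THROTTLED;          /* playlist cut-through: ask for throttling */
}
#endif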
1303
1304
1305 /*
1306 * Block mapping is not supported for special files.
1307 */
1308 int
1309 spec_blockmap(__unused struct vnop_blockmap_args *ap)
1310 {
1311 return (ENOTSUP);
1312 }
1313
1314
1315 /*
1316 * Device close routine
1317 */
1318 int
1319 spec_close(struct vnop_close_args *ap)
1320 {
1321 struct vnode *vp = ap->a_vp;
1322 dev_t dev = vp->v_rdev;
1323 int error = 0;
1324 int flags = ap->a_fflag;
1325 struct proc *p = vfs_context_proc(ap->a_context);
1326 struct session *sessp;
1327 int do_rele = 0;
1328
1329 switch (vp->v_type) {
1330
1331 case VCHR:
1332 /*
1333 * Hack: a tty device that is a controlling terminal
1334 * has a reference from the session structure.
1335 * We cannot easily tell that a character device is
1336 * a controlling terminal, unless it is the closing
1337 * process' controlling terminal. In that case,
1338 * if the reference count is 1 (this is the very
1339 * last close)
1340 */
1341 sessp = proc_session(p);
1342 if (sessp != SESSION_NULL) {
1343 if ((vcount(vp) == 1) &&
1344 (vp == sessp->s_ttyvp)) {
1345
1346 session_lock(sessp);
1347 if (vp == sessp->s_ttyvp) {
1348 sessp->s_ttyvp = NULL;
1349 sessp->s_ttyvid = 0;
1350 sessp->s_ttyp = TTY_NULL;
1351 sessp->s_ttypgrpid = NO_PID;
1352 do_rele = 1;
1353 }
1354 session_unlock(sessp);
1355
1356 if (do_rele) {
1357 vnode_rele(vp);
1358 }
1359 }
1360 session_rele(sessp);
1361 }
1362
1363 devsw_lock(dev, S_IFCHR);
1364
1365 vp->v_specinfo->si_opencount--;
1366
1367 if (vp->v_specinfo->si_opencount < 0) {
1368 panic("Negative open count?");
1369 }
1370 /*
1371 * close on last reference or on vnode revoke call
1372 */
1373 if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
1374 devsw_unlock(dev, S_IFCHR);
1375 return (0);
1376 }
1377
1378 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
1379
1380 devsw_unlock(dev, S_IFCHR);
1381 break;
1382
1383 case VBLK:
1384 /*
1385 * If there is more than one outstanding open, don't
1386 * send the close to the device.
1387 */
1388 devsw_lock(dev, S_IFBLK);
1389 if (vcount(vp) > 1) {
1390 vp->v_specinfo->si_opencount--;
1391 devsw_unlock(dev, S_IFBLK);
1392 return (0);
1393 }
1394 devsw_unlock(dev, S_IFBLK);
1395
1396 /*
1397 * On last close of a block device (that isn't mounted)
1398 * we must invalidate any in core blocks, so that
1399 * we can, for instance, change floppy disks.
1400 */
1401 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
1402 return (error);
1403
1404 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1405 if (error)
1406 return (error);
1407
1408 devsw_lock(dev, S_IFBLK);
1409
1410 vp->v_specinfo->si_opencount--;
1411
1412 if (vp->v_specinfo->si_opencount < 0) {
1413 panic("Negative open count?");
1414 }
1415
1416 if (vcount(vp) > 0) {
1417 devsw_unlock(dev, S_IFBLK);
1418 return (0);
1419 }
1420
1421 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
1422
1423 devsw_unlock(dev, S_IFBLK);
1424 break;
1425
1426 default:
1427 panic("spec_close: not special");
1428 return(EBADF);
1429 }
1430
1431 return error;
1432 }
1433
1434 /*
1435 * Return POSIX pathconf information applicable to special devices.
1436 */
1437 int
1438 spec_pathconf(struct vnop_pathconf_args *ap)
1439 {
1440
1441 switch (ap->a_name) {
1442 case _PC_LINK_MAX:
1443 *ap->a_retval = LINK_MAX;
1444 return (0);
1445 case _PC_MAX_CANON:
1446 *ap->a_retval = MAX_CANON;
1447 return (0);
1448 case _PC_MAX_INPUT:
1449 *ap->a_retval = MAX_INPUT;
1450 return (0);
1451 case _PC_PIPE_BUF:
1452 *ap->a_retval = PIPE_BUF;
1453 return (0);
1454 case _PC_CHOWN_RESTRICTED:
1455 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
1456 return (0);
1457 case _PC_VDISABLE:
1458 *ap->a_retval = _POSIX_VDISABLE;
1459 return (0);
1460 default:
1461 return (EINVAL);
1462 }
1463 /* NOTREACHED */
1464 }
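
/*
 * User-space sketch (not part of the kernel sources): pathconf(2) on a
 * device node ends up in spec_pathconf() above.  The /dev/tty path is an
 * assumption.
 */
#if 0
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
        long pipe_buf = pathconf("/dev/tty", _PC_PIPE_BUF);

        if (pipe_buf != -1)
                printf("PIPE_BUF on /dev/tty: %ld\n", pipe_buf);
        return 0;
}
#endif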
1465
1466 /*
1467 * Special device failed operation
1468 */
1469 int
1470 spec_ebadf(__unused void *dummy)
1471 {
1472
1473 return (EBADF);
1474 }
1475
1476 /* Blktooff derives file offset from logical block number */
1477 int
1478 spec_blktooff(struct vnop_blktooff_args *ap)
1479 {
1480 struct vnode *vp = ap->a_vp;
1481
1482 switch (vp->v_type) {
1483 case VCHR:
1484 *ap->a_offset = (off_t)-1; /* failure */
1485 return (ENOTSUP);
1486
1487 case VBLK:
1488 printf("spec_blktooff: not implemented for VBLK\n");
1489 *ap->a_offset = (off_t)-1; /* failure */
1490 return (ENOTSUP);
1491
1492 default:
1493 panic("spec_blktooff type");
1494 }
1495 /* NOTREACHED */
1496
1497 return (0);
1498 }
1499
1500 /* Offtoblk derives logical block number from file offset */
1501 int
1502 spec_offtoblk(struct vnop_offtoblk_args *ap)
1503 {
1504 struct vnode *vp = ap->a_vp;
1505
1506 switch (vp->v_type) {
1507 case VCHR:
1508 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1509 return (ENOTSUP);
1510
1511 case VBLK:
1512 printf("spec_offtoblk: not implemented for VBLK\n");
1513 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1514 return (ENOTSUP);
1515
1516 default:
1517 panic("spec_offtoblk type");
1518 }
1519 /* NOTREACHED */
1520
1521 return (0);
1522 }
1523
1524 static void filt_specdetach(struct knote *kn);
1525 static int filt_spec(struct knote *kn, long hint);
1526 static unsigned filt_specpeek(struct knote *kn);
1527
1528 struct filterops spec_filtops = {
1529 .f_isfd = 1,
1530 .f_attach = filt_specattach,
1531 .f_detach = filt_specdetach,
1532 .f_event = filt_spec,
1533 .f_peek = filt_specpeek
1534 };
1535
1536 static int
1537 filter_to_seltype(int16_t filter)
1538 {
1539 switch (filter) {
1540 case EVFILT_READ:
1541 return FREAD;
1542 case EVFILT_WRITE:
1543 return FWRITE;
1544 break;
1545 default:
1546 panic("filt_to_seltype(): invalid filter %d\n", filter);
1547 return 0;
1548 }
1549 }
1550
1551 static int
1552 filt_specattach(struct knote *kn)
1553 {
1554 vnode_t vp;
1555 dev_t dev;
1556
1557 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
1558
1559 assert(vnode_ischr(vp));
1560
1561 dev = vnode_specrdev(vp);
1562
1563 if (major(dev) >= nchrdev) {
1564 return ENXIO;
1565 }
1566
1567 if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
1568 return EINVAL;
1569 }
1570
1571 /* Resulting wql is safe to unlink even if it has never been linked */
1572 kn->kn_hook = wait_queue_link_allocate();
1573 if (kn->kn_hook == NULL) {
1574 return EAGAIN;
1575 }
1576
1577 kn->kn_fop = &spec_filtops;
1578 kn->kn_hookid = vnode_vid(vp);
1579
1580 knote_markstayqueued(kn);
1581
1582 return 0;
1583 }
1584
1585 static void
1586 filt_specdetach(struct knote *kn)
1587 {
1588 kern_return_t ret;
1589
1590 /*
1591 * Given wait queue link and wait queue set, unlink. This is subtle.
1592 * If the device has been revoked from under us, selclearthread() will
1593 * have removed our link from the kqueue's wait queue set, which
1594 * wait_queue_set_unlink_one() will detect and handle.
1595 */
1596 ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
1597 if (ret != KERN_SUCCESS) {
1598 panic("filt_specdetach(): failed to unlink wait queue link.");
1599 }
1600
1601 (void)wait_queue_link_free(kn->kn_hook);
1602 kn->kn_hook = NULL;
1603 kn->kn_status &= ~KN_STAYQUEUED;
1604 }
1605
1606 static int
1607 filt_spec(struct knote *kn, long hint)
1608 {
1609 vnode_t vp;
1610 uthread_t uth;
1611 wait_queue_set_t old_wqs;
1612 vfs_context_t ctx;
1613 int selres;
1614 int error;
1615 int use_offset;
1616 dev_t dev;
1617 uint64_t flags;
1618
1619 assert(kn->kn_hook != NULL);
1620
1621 if (hint != 0) {
1622 panic("filt_spec(): nonzero hint?");
1623 }
1624
1625 uth = get_bsdthread_info(current_thread());
1626 ctx = vfs_context_current();
1627 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1628
1629 error = vnode_getwithvid(vp, kn->kn_hookid);
1630 if (error != 0) {
1631 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1632 return 1;
1633 }
1634
1635 dev = vnode_specrdev(vp);
1636 flags = cdevsw_flags[major(dev)];
1637 use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
1638 assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
1639
1640 /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
1641 old_wqs = uth->uu_wqset;
1642 uth->uu_wqset = kn->kn_kq->kq_wqs;
1643 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1644 uth->uu_wqset = old_wqs;
1645
1646 if (use_offset) {
1647 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
1648 kn->kn_data = 0;
1649 } else {
1650 kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
1651 }
1652 } else {
1653 kn->kn_data = selres;
1654 }
1655
1656 vnode_put(vp);
1657
1658 return (kn->kn_data != 0);
1659 }
1660
1661 static unsigned
1662 filt_specpeek(struct knote *kn)
1663 {
1664 vnode_t vp;
1665 uthread_t uth;
1666 wait_queue_set_t old_wqs;
1667 vfs_context_t ctx;
1668 int error, selres;
1669
1670 uth = get_bsdthread_info(current_thread());
1671 ctx = vfs_context_current();
1672 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1673
1674 error = vnode_getwithvid(vp, kn->kn_hookid);
1675 if (error != 0) {
1676 return 1; /* Just like VNOP_SELECT() on recycled vnode */
1677 }
1678
1679 /*
1680 * Why pass the link here? Because we may not have registered in the past...
1681 */
1682 old_wqs = uth->uu_wqset;
1683 uth->uu_wqset = kn->kn_kq->kq_wqs;
1684 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1685 uth->uu_wqset = old_wqs;
1686
1687 vnode_put(vp);
1688 return selres;
1689 }
1690