1 /*
2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <vfs/vfs_support.h>
86 #include <kern/assert.h>
87 #include <kern/task.h>
88
89 #include <sys/kdebug.h>
90
91 /* XXX following four prototypes should be in a header file somewhere */
92 extern dev_t chrtoblk(dev_t dev);
93 extern int iskmemdev(dev_t dev);
94 extern int bpfkqfilter(dev_t dev, struct knote *kn);
95 extern int ptsd_kqfilter(dev_t dev, struct knote *kn);
96
97 struct vnode *speclisth[SPECHSZ];
98
99 /* symbolic sleep message strings for devices */
100 char devopn[] = "devopn";
101 char devio[] = "devio";
102 char devwait[] = "devwait";
103 char devin[] = "devin";
104 char devout[] = "devout";
105 char devioc[] = "devioc";
106 char devcls[] = "devcls";
107
108 #define VOPFUNC int (*)(void *)
109
110 int (**spec_vnodeop_p)(void *);
111 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
112 { &vnop_default_desc, (VOPFUNC)vn_default_error },
113 { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */
114 { &vnop_create_desc, (VOPFUNC)err_create }, /* create */
115 { &vnop_mknod_desc, (VOPFUNC)err_mknod }, /* mknod */
116 { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */
117 { &vnop_close_desc, (VOPFUNC)spec_close }, /* close */
118 { &vnop_access_desc, (VOPFUNC)spec_access }, /* access */
119 { &vnop_getattr_desc, (VOPFUNC)spec_getattr }, /* getattr */
120 { &vnop_setattr_desc, (VOPFUNC)spec_setattr }, /* setattr */
121 { &vnop_read_desc, (VOPFUNC)spec_read }, /* read */
122 { &vnop_write_desc, (VOPFUNC)spec_write }, /* write */
123 { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */
124 { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */
125 { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */
126 { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */
127 { &vnop_fsync_desc, (VOPFUNC)spec_fsync }, /* fsync */
128 { &vnop_remove_desc, (VOPFUNC)err_remove }, /* remove */
129 { &vnop_link_desc, (VOPFUNC)err_link }, /* link */
130 { &vnop_rename_desc, (VOPFUNC)err_rename }, /* rename */
131 { &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, /* mkdir */
132 { &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, /* rmdir */
133 { &vnop_symlink_desc, (VOPFUNC)err_symlink }, /* symlink */
134 { &vnop_readdir_desc, (VOPFUNC)err_readdir }, /* readdir */
135 { &vnop_readlink_desc, (VOPFUNC)err_readlink }, /* readlink */
136 { &vnop_inactive_desc, (VOPFUNC)nop_inactive }, /* inactive */
137 { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, /* reclaim */
138 { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */
139 { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */
140 { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */
141 { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, /* bwrite */
142 { &vnop_pagein_desc, (VOPFUNC)err_pagein }, /* Pagein */
143 { &vnop_pageout_desc, (VOPFUNC)err_pageout }, /* Pageout */
144 { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */
145 { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, /* blktooff */
146 { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, /* offtoblk */
147 { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, /* blockmap */
148 { (struct vnodeop_desc*)NULL, (int(*)())NULL }
149 };
150 struct vnodeopv_desc spec_vnodeop_opv_desc =
151 { &spec_vnodeop_p, spec_vnodeop_entries };
152
153
154 static void set_blocksize(vnode_t, dev_t);
155
156
157 struct _throttle_io_info_t {
158 struct timeval last_normal_IO_timestamp;
159 struct timeval last_IO_timestamp;
160 SInt32 numthreads_throttling;
161 SInt32 refcnt;
162 SInt32 alloc;
163 };
164
165 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
166
167 static void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd);
168
169
170
171 /*
172 * Trivial lookup routine that always fails.
173 */
174 int
175 spec_lookup(struct vnop_lookup_args *ap)
176 {
177
178 *ap->a_vpp = NULL;
179 return (ENOTDIR);
180 }
181
182 static void
183 set_blocksize(struct vnode *vp, dev_t dev)
184 {
185 int (*size)(dev_t);
186 int rsize;
187
188 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
189 rsize = (*size)(dev);
190 if (rsize <= 0) /* did size fail? */
191 vp->v_specsize = DEV_BSIZE;
192 else
193 vp->v_specsize = rsize;
194 }
195 else
196 vp->v_specsize = DEV_BSIZE;
197 }
198
199 void
200 set_fsblocksize(struct vnode *vp)
201 {
202
203 if (vp->v_type == VBLK) {
204 dev_t dev = (dev_t)vp->v_rdev;
205 int maj = major(dev);
206
207 if ((u_int)maj >= (u_int)nblkdev)
208 return;
209
210 vnode_lock(vp);
211 set_blocksize(vp, dev);
212 vnode_unlock(vp);
213 }
214
215 }
216
217
218 /*
219 * Open a special file.
220 */
221 int
222 spec_open(struct vnop_open_args *ap)
223 {
224 struct proc *p = vfs_context_proc(ap->a_context);
225 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
226 struct vnode *vp = ap->a_vp;
227 dev_t bdev, dev = (dev_t)vp->v_rdev;
228 int maj = major(dev);
229 int error;
230
231 /*
232 * Don't allow open if fs is mounted -nodev.
233 */
234 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
235 return (ENXIO);
236
237 switch (vp->v_type) {
238
239 case VCHR:
240 if ((u_int)maj >= (u_int)nchrdev)
241 return (ENXIO);
242 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
243 /*
244 * When running in very secure mode, do not allow
245 * opens for writing of any disk character devices.
246 */
247 if (securelevel >= 2 && isdisk(dev, VCHR))
248 return (EPERM);
249 /*
250 * When running in secure mode, do not allow opens
251 * for writing of /dev/mem, /dev/kmem, or character
252 * devices whose corresponding block devices are
253 * currently mounted.
254 */
255 if (securelevel >= 1) {
256 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
257 return (error);
258 if (iskmemdev(dev))
259 return (EPERM);
260 }
261 }
262 if (cdevsw[maj].d_type == D_TTY) {
263 vnode_lock(vp);
264 vp->v_flag |= VISTTY;
265 vnode_unlock(vp);
266 }
267
268 devsw_lock(dev, S_IFCHR);
269 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
270
271 if (error == 0) {
272 vp->v_specinfo->si_opencount++;
273 }
274
275 devsw_unlock(dev, S_IFCHR);
276
277 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
278 int isssd = 0;
279 uint64_t throttle_mask = 0;
280 uint32_t devbsdunit = 0;
281
282 if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
283
284 if (VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
285 /*
286 * as a reasonable approximation, only use the lowest bit of the mask
287 * to generate a disk unit number
288 */
289 devbsdunit = num_trailing_0(throttle_mask);
290
291 vnode_lock(vp);
292
293 vp->v_un.vu_specinfo->si_isssd = isssd;
294 vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
295 vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
296 vp->v_un.vu_specinfo->si_throttleable = 1;
297 vp->v_un.vu_specinfo->si_initted = 1;
298
299 vnode_unlock(vp);
300 }
301 }
302 if (vp->v_un.vu_specinfo->si_initted == 0) {
303 vnode_lock(vp);
304 vp->v_un.vu_specinfo->si_initted = 1;
305 vnode_unlock(vp);
306 }
307 }
308 return (error);
309
310 case VBLK:
311 if ((u_int)maj >= (u_int)nblkdev)
312 return (ENXIO);
313 /*
314 * When running in very secure mode, do not allow
315 * opens for writing of any disk block devices.
316 */
317 if (securelevel >= 2 && cred != FSCRED &&
318 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
319 return (EPERM);
320 /*
321 * Do not allow opens of block devices that are
322 * currently mounted.
323 */
324 if ( (error = vfs_mountedon(vp)) )
325 return (error);
326
327 devsw_lock(dev, S_IFBLK);
328 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
329 if (!error) {
330 vp->v_specinfo->si_opencount++;
331 }
332 devsw_unlock(dev, S_IFBLK);
333
334 if (!error) {
335 u_int64_t blkcnt;
336 u_int32_t blksize;
337 int setsize = 0;
338 u_int32_t size512 = 512;
339
340
341 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
342 /* Switch to 512 byte sectors (temporarily) */
343
344 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
345 /* Get the number of 512 byte physical blocks. */
346 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
347 setsize = 1;
348 }
349 }
350 /* If it doesn't set back, we can't recover */
351 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
352 error = ENXIO;
353 }
354
355
356 vnode_lock(vp);
357 set_blocksize(vp, dev);
358
359 /*
360 * Cache the size in bytes of the block device for later
361 * use by spec_write().
362 */
363 if (setsize)
364 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
365 else
366 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
367
368 vnode_unlock(vp);
369
370 }
371 return(error);
372 default:
373 panic("spec_open type");
374 }
375 return (0);
376 }
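
/*
 * Illustrative aside (not part of the kernel code path): the VBLK open
 * above derives the device size by temporarily switching the logical
 * block size to 512 bytes, reading the 512-byte block count, and caching
 * blkcnt * 512 in v_specdevsize for spec_write().  The sketch below,
 * guarded out of compilation, shows the same <sys/disk.h> ioctls issued
 * from user space; the device path "/dev/disk1" is hypothetical.
 */
#if 0	/* illustrative only -- never compiled */
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/disk1", O_RDONLY);		/* hypothetical device */
	uint32_t blksize = 0;
	uint64_t blkcnt = 0;

	if (fd < 0)
		return (1);
	if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == 0 &&
	    ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == 0)
		printf("device size: %llu bytes\n",
		    (unsigned long long)blkcnt * blksize);
	close(fd);
	return (0);
}
#endif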
377
378 /*
379 * Vnode op for read
380 */
381 int
382 spec_read(struct vnop_read_args *ap)
383 {
384 struct vnode *vp = ap->a_vp;
385 struct uio *uio = ap->a_uio;
386 struct buf *bp;
387 daddr64_t bn, nextbn;
388 long bsize, bscale;
389 int devBlockSize=0;
390 int n, on;
391 int error = 0;
392 dev_t dev;
393
394 #if DIAGNOSTIC
395 if (uio->uio_rw != UIO_READ)
396 panic("spec_read mode");
397 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
398 panic("spec_read proc");
399 #endif
400 if (uio_resid(uio) == 0)
401 return (0);
402
403 switch (vp->v_type) {
404
405 case VCHR:
406 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
407 struct _throttle_io_info_t *throttle_info;
408
409 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
410
411 throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);
412 }
413
414 error = (*cdevsw[major(vp->v_rdev)].d_read)
415 (vp->v_rdev, uio, ap->a_ioflag);
416
417 return (error);
418
419 case VBLK:
420 if (uio->uio_offset < 0)
421 return (EINVAL);
422
423 dev = vp->v_rdev;
424
425 devBlockSize = vp->v_specsize;
426
427 if (devBlockSize > PAGE_SIZE)
428 return (EINVAL);
429
430 bscale = PAGE_SIZE / devBlockSize;
431 bsize = bscale * devBlockSize;
432
433 do {
434 on = uio->uio_offset % bsize;
435
436 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
437
438 if (vp->v_speclastr + bscale == bn) {
439 nextbn = bn + bscale;
440 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
441 (int *)&bsize, 1, NOCRED, &bp);
442 } else
443 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
444
445 vnode_lock(vp);
446 vp->v_speclastr = bn;
447 vnode_unlock(vp);
448
449 n = bsize - buf_resid(bp);
450 if ((on > n) || error) {
451 if (!error)
452 error = EINVAL;
453 buf_brelse(bp);
454 return (error);
455 }
456 n = min((unsigned)(n - on), uio_resid(uio));
457
458 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
459 if (n + on == bsize)
460 buf_markaged(bp);
461 buf_brelse(bp);
462 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
463 return (error);
464
465 default:
466 panic("spec_read type");
467 }
468 /* NOTREACHED */
469
470 return (0);
471 }
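
/*
 * Worked example of the VBLK read math above, assuming a 512-byte device
 * block size and a 4096-byte PAGE_SIZE (both values illustrative):
 * bscale = 4096 / 512 = 8 and bsize = 4096.  For uio_offset = 5000,
 * on = 5000 % 4096 = 904 and bn = (5000 / 512) & ~7 = 8, so the loop
 * reads the 4 KB buffer covering device blocks 8..15 and copies out of
 * it starting 904 bytes in.  If the previous pass read the buffer at
 * block 0 (v_speclastr + bscale == bn), buf_breadn() also prefetches
 * the buffer starting at block 16.
 */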
472
473 /*
474 * Vnode op for write
475 */
476 int
477 spec_write(struct vnop_write_args *ap)
478 {
479 struct vnode *vp = ap->a_vp;
480 struct uio *uio = ap->a_uio;
481 struct buf *bp;
482 daddr64_t bn;
483 int bsize, blkmask, bscale;
484 int io_sync;
485 int devBlockSize=0;
486 int n, on;
487 int error = 0;
488 dev_t dev;
489
490 #if DIAGNOSTIC
491 if (uio->uio_rw != UIO_WRITE)
492 panic("spec_write mode");
493 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
494 panic("spec_write proc");
495 #endif
496
497 switch (vp->v_type) {
498
499 case VCHR:
500 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
501 struct _throttle_io_info_t *throttle_info;
502
503 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
504
505 throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd);
506
507 microuptime(&throttle_info->last_IO_timestamp);
508 }
509
510 error = (*cdevsw[major(vp->v_rdev)].d_write)
511 (vp->v_rdev, uio, ap->a_ioflag);
512
513 return (error);
514
515 case VBLK:
516 if (uio_resid(uio) == 0)
517 return (0);
518 if (uio->uio_offset < 0)
519 return (EINVAL);
520
521 io_sync = (ap->a_ioflag & IO_SYNC);
522
523 dev = (vp->v_rdev);
524
525 devBlockSize = vp->v_specsize;
526 if (devBlockSize > PAGE_SIZE)
527 return(EINVAL);
528
529 bscale = PAGE_SIZE / devBlockSize;
530 blkmask = bscale - 1;
531 bsize = bscale * devBlockSize;
532
533
534 do {
535 bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
536 on = uio->uio_offset % bsize;
537
538 n = min((unsigned)(bsize - on), uio_resid(uio));
539
540 /*
541 * Use buf_getblk() as an optimization IFF:
542 *
543 * 1) We are writing exactly a block on a block
544 * aligned boundary
545 * 2) We know the size of the device from spec_open
546 * 3) The write doesn't span the end of the device
547 *
548 * Otherwise, we fall back on buf_bread().
549 */
550 if (n == bsize &&
551 vp->v_specdevsize != (u_int64_t)0 &&
552 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
553 /* reduce the size of the write to what is there */
554 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
555 }
556
557 if (n == bsize)
558 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
559 else
560 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
561
562 /* Translate downstream error for upstream, if needed */
563 if (!error)
564 error = (int)buf_error(bp);
565 if (error) {
566 buf_brelse(bp);
567 return (error);
568 }
569 n = min(n, bsize - buf_resid(bp));
570
571 error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
572 if (error) {
573 buf_brelse(bp);
574 return (error);
575 }
576 buf_markaged(bp);
577
578 if (io_sync)
579 error = buf_bwrite(bp);
580 else {
581 if ((n + on) == bsize)
582 error = buf_bawrite(bp);
583 else
584 error = buf_bdwrite(bp);
585 }
586 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
587 return (error);
588
589 default:
590 panic("spec_write type");
591 }
592 /* NOTREACHED */
593
594 return (0);
595 }
596
597 /*
598 * Device ioctl operation.
599 */
600 int
601 spec_ioctl(struct vnop_ioctl_args *ap)
602 {
603 proc_t p = vfs_context_proc(ap->a_context);
604 dev_t dev = ap->a_vp->v_rdev;
605 int retval = 0;
606
607 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
608 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
609
610 switch (ap->a_vp->v_type) {
611
612 case VCHR:
613 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
614 ap->a_fflag, p);
615 break;
616
617 case VBLK:
618 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
619 ap->a_fflag, p);
620 break;
621
622 default:
623 panic("spec_ioctl");
624 /* NOTREACHED */
625 }
626 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
627 (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
628
629 return (retval);
630 }
631
632 int
633 spec_select(struct vnop_select_args *ap)
634 {
635 proc_t p = vfs_context_proc(ap->a_context);
636 dev_t dev;
637
638 switch (ap->a_vp->v_type) {
639
640 default:
641 return (1); /* XXX */
642
643 case VCHR:
644 dev = ap->a_vp->v_rdev;
645 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
646 }
647 }
648
649 static int filt_specattach(struct knote *kn);
650
651 int
652 spec_kqfilter(vnode_t vp, struct knote *kn)
653 {
654 dev_t dev;
655 int err = EINVAL;
656
657 /*
658 * For a few special kinds of devices, we can attach knotes.
659 * Each filter function must check whether the dev type matches it.
660 */
661 dev = vnode_specrdev(vp);
662
663 if (vnode_istty(vp)) {
664 /* We can hook into TTYs... */
665 err = filt_specattach(kn);
666 } else {
667 /* Try a bpf device, as defined in bsd/net/bpf.c */
668 err = bpfkqfilter(dev, kn);
669 }
670
671 return err;
672 }
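
/*
 * Illustrative aside: the knote attached by spec_kqfilter() is what a
 * user-space kevent() registration on a TTY character device ends up
 * exercising.  A minimal sketch follows, guarded out of compilation;
 * the "/dev/ttys000" path is hypothetical.
 */
#if 0	/* illustrative only -- never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/ttys000", O_RDONLY | O_NONBLOCK);	/* hypothetical tty */
	int kq = kqueue();
	struct kevent ev, out;

	if (fd < 0 || kq < 0)
		return (1);

	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)		/* register */
		return (1);
	if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)		/* wait for input */
		printf("%ld bytes readable\n", (long)out.data);

	close(kq);
	close(fd);
	return (0);
}
#endif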
673
674 /*
675 * Synch buffers associated with a block device
676 */
677 int
678 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
679 {
680 if (vp->v_type == VCHR)
681 return (0);
682 /*
683 * Flush all dirty buffers associated with a block device.
684 */
685 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
686
687 return (0);
688 }
689
690 int
691 spec_fsync(struct vnop_fsync_args *ap)
692 {
693 return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
694 }
695
696 /*
697 * Just call the device strategy routine
698 */
699 extern int hard_throttle_on_root;
700 void IOSleep(int);
701
702 // a low-priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
703 #define LOWPRI_INITIAL_WINDOW_MSECS 100
704 #define LOWPRI_WINDOW_MSECS_INC 50
705 #define LOWPRI_MAX_WINDOW_MSECS 200
706 #define LOWPRI_MAX_WAITING_MSECS 200
707
708 #if CONFIG_EMBEDDED
709 #define LOWPRI_SLEEP_INTERVAL 5
710 #else
711 #define LOWPRI_SLEEP_INTERVAL 2
712 #endif
713
714 int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS;
715 int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC;
716 int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS;
717 int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
718
719 #if 0
720 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
721 do { \
722 if ((debug_info)->alloc) \
723 printf("%s: "format, __FUNCTION__, ## args); \
724 } while(0)
725
726 #else
727 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
728 #endif
729
730 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
731 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_WINDOW_MSECS_INC, "");
732 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_MAX_WINDOW_MSECS, "");
733 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_MAX_WAITING_MSECS, "");
734
735 /*
736 * throttled I/O helper function
737 * convert the index of the lowest set bit to a device index
738 */
739 int
740 num_trailing_0(uint64_t n)
741 {
742 /*
743 * since in most cases the number of trailing 0s is very small,
744 * we simply count sequentially from the lowest bit
745 */
746 if (n == 0)
747 return sizeof(n) * 8;
748 int count = 0;
749 while (!ISSET(n, 1)) {
750 n >>= 1;
751 ++count;
752 }
753 return count;
754 }
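
/*
 * Example: a throttle_mask of 0x10 (only bit 4 set) yields 4, which is
 * then used as the index into _throttle_io_info[].  For nonzero input
 * this is equivalent to a count-trailing-zeros operation; the loop is
 * kept simple because the masks involved are small.
 */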
755
756 /*
757 * Release the reference and if the item was allocated and this is the last
758 * reference then free it.
759 *
760 * This routine always returns the old value.
761 */
762 static int
763 throttle_info_rel(struct _throttle_io_info_t *info)
764 {
765 SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
766
767 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
768 info, (int)(oldValue -1), info );
769
770 /* The reference count just went negative, very bad */
771 if (oldValue == 0)
772 panic("throttle info ref cnt went negative!");
773
774 /*
775 * Once reference count is zero, no one else should be able to take a
776 * reference
777 */
778 if ((info->refcnt == 0) && (info->alloc)) {
779 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
780 FREE(info, M_TEMP);
781 }
782 return oldValue;
783 }
784
785 /*
786 * Just take a reference on the throttle info structure.
787 *
788 * This routine always returns the old value.
789 */
790 static SInt32
791 throttle_info_ref(struct _throttle_io_info_t *info)
792 {
793 SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
794
795 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
796 info, (int)(oldValue -1), info );
797 /* Allocated items should never have a reference of zero */
798 if (info->alloc && (oldValue == 0))
799 panic("Taking a reference without calling create throttle info!\n");
800
801 return oldValue;
802 }
803
804 /*
805 * KPI routine
806 *
807 * Create and take a reference on a throttle info structure and return a
808 * pointer for the file system to use when calling throttle_info_update.
809 * Calling file system must have a matching release for every create.
810 */
811 void *
812 throttle_info_create(void)
813 {
814 struct _throttle_io_info_t *info;
815
816 MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
817 /* Should never happen but just in case */
818 if (info == NULL)
819 return NULL;
820 /* Mark that this one was allocated and needs to be freed */
821 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
822 info->alloc = TRUE;
823 /* Take a reference */
824 OSIncrementAtomic(&info->refcnt);
825 return info;
826 }
827
828 /*
829 * KPI routine
830 *
831 * Release the throttle info pointer if all the references are gone. Should be
832 * called to release the reference taken by throttle_info_create
833 */
834 void
835 throttle_info_release(void *throttle_info)
836 {
837 DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
838 (struct _throttle_io_info_t *)throttle_info,
839 (struct _throttle_io_info_t *)throttle_info);
840 if (throttle_info) /* Just to be careful */
841 throttle_info_rel(throttle_info);
842 }
843
844 /*
845 * KPI routine
846 *
847 * File Systems that create an info structure need to call this routine in
848 * their mount routine (used by cluster code). File Systems that call this in
849 * their mount routines must call throttle_info_mount_rel in their unmount
850 * routines.
851 */
852 void
853 throttle_info_mount_ref(mount_t mp, void *throttle_info)
854 {
855 if ((throttle_info == NULL) || (mp == NULL))
856 return;
857 throttle_info_ref(throttle_info);
858 /* If we already have a reference, release it before adding the new one */
859 if (mp->mnt_throttle_info)
860 throttle_info_rel(mp->mnt_throttle_info);
861 mp->mnt_throttle_info = throttle_info;
862 }
863
864 /*
865 * Private KPI routine
866 *
867 * return a handle for accessing throttle_info given a throttle_mask. The
868 * handle must be released by throttle_info_rel_by_mask
869 */
870 int
871 throttle_info_ref_by_mask(uint64_t throttle_mask,
872 throttle_info_handle_t *throttle_info_handle)
873 {
874 int dev_index;
875 struct _throttle_io_info_t *info;
876
877 if (throttle_info_handle == NULL)
878 return EINVAL;
879
880 dev_index = num_trailing_0(throttle_mask);
881 info = &_throttle_io_info[dev_index];
882 throttle_info_ref(info);
883 *(struct _throttle_io_info_t**)throttle_info_handle = info;
884 return 0;
885 }
886
887 /*
888 * Private KPI routine
889 *
890 * release the handle obtained by throttle_info_ref_by_mask
891 */
892 void
893 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
894 {
895 /* for now the handle is just a pointer to _throttle_io_info_t */
896 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
897 }
898
899 /*
900 * KPI routine
901 *
902 * File Systems that called throttle_info_mount_ref must call this routine in
903 * their unmount routine.
904 */
905 void
906 throttle_info_mount_rel(mount_t mp)
907 {
908 if (mp->mnt_throttle_info)
909 throttle_info_rel(mp->mnt_throttle_info);
910 mp->mnt_throttle_info = NULL;
911 }
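
/*
 * Illustrative aside: a sketch (function and variable names invented) of
 * how a file system would pair the throttle-info KPI calls above --
 * create + mount_ref at mount time, mount_rel + release at unmount time.
 * Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
examplefs_mount(mount_t mp, void **tip)
{
	void *ti = throttle_info_create();

	if (ti == NULL)
		return (ENOMEM);
	throttle_info_mount_ref(mp, ti);	/* mount now holds a reference */
	*tip = ti;				/* stash for unmount */
	return (0);
}

static int
examplefs_unmount(mount_t mp, void *ti)
{
	throttle_info_mount_rel(mp);		/* drop the mount's reference */
	throttle_info_release(ti);		/* drop the create reference */
	return (0);
}
#endif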
912
913 void
914 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
915 {
916 struct _throttle_io_info_t *info;
917
918 if (mp == NULL)
919 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
920 else if (mp->mnt_throttle_info == NULL)
921 info = &_throttle_io_info[mp->mnt_devbsdunit];
922 else
923 info = mp->mnt_throttle_info;
924
925 *tv = info->last_IO_timestamp;
926 }
927
928 void
929 update_last_io_time(mount_t mp)
930 {
931 struct _throttle_io_info_t *info;
932
933 if (mp == NULL)
934 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
935 else if (mp->mnt_throttle_info == NULL)
936 info = &_throttle_io_info[mp->mnt_devbsdunit];
937 else
938 info = mp->mnt_throttle_info;
939
940 microuptime(&info->last_IO_timestamp);
941 }
942
943
944 #if CONFIG_EMBEDDED
945
946 int throttle_get_io_policy(struct uthread **ut)
947 {
948 int policy = IOPOL_DEFAULT;
949 proc_t p = current_proc();
950
951 *ut = get_bsdthread_info(current_thread());
952
953 if (p != NULL)
954 policy = p->p_iopol_disk;
955
956 if (*ut != NULL) {
957 // the I/O policy of the thread overrides that of the process
958 // unless the I/O policy of the thread is default
959 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
960 policy = (*ut)->uu_iopol_disk;
961 }
962 return policy;
963 }
964 #else
965
966 int throttle_get_io_policy(__unused struct uthread **ut)
967 {
968 *ut = get_bsdthread_info(current_thread());
969
970 return (proc_get_task_selfdiskacc());
971 }
972 #endif
973
974
975 static int
976 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
977 {
978 struct _throttle_io_info_t *info = throttle_info;
979 struct timeval elapsed;
980 int elapsed_msecs;
981 int policy;
982 struct uthread *ut;
983
984 policy = throttle_get_io_policy(&ut);
985
986 if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
987 return (0);
988
989 microuptime(&elapsed);
990 timevalsub(&elapsed, &info->last_normal_IO_timestamp);
991 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
992
993 if (lowpri_window_msecs == -1) // use the max waiting time
994 lowpri_window_msecs = lowpri_max_waiting_msecs;
995
996 return elapsed_msecs < lowpri_window_msecs;
997 }
998
999 /*
1000 * If we have a mount point and it has a throttle info pointer then
1001 * use it to do the check, otherwise use the device unit number to find
1002 * the correct throttle info array element.
1003 */
1004 int
1005 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
1006 {
1007 void *info;
1008
1009 /* Should we just return zero if there is no mount point? */
1010 if (mp == NULL)
1011 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1012 else if (mp->mnt_throttle_info == NULL)
1013 info = &_throttle_io_info[mp->mnt_devbsdunit];
1014 else
1015 info = mp->mnt_throttle_info;
1016 return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
1017 }
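
/*
 * Worked example: with lowpri_window_msecs = 100 and the last
 * normal-priority I/O recorded 40 ms ago, elapsed_msecs (40) is below
 * the window and the caller will be throttled; once 100 ms pass with no
 * normal-priority I/O to that device, the check returns 0 and the
 * low-priority thread is allowed to proceed.
 */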
1018
1019 uint32_t
1020 throttle_lowpri_io(int sleep_amount)
1021 {
1022 int sleep_cnt = 0;
1023 int numthreads_throttling;
1024 int max_try_num;
1025 struct uthread *ut;
1026 struct _throttle_io_info_t *info;
1027 int max_waiting_msecs;
1028
1029 ut = get_bsdthread_info(current_thread());
1030
1031 if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
1032 goto done;
1033
1034 info = ut->uu_throttle_info;
1035
1036 if (sleep_amount != 0) {
1037 #if CONFIG_EMBEDDED
1038 max_waiting_msecs = lowpri_max_waiting_msecs;
1039 #else
1040 if (ut->uu_throttle_isssd == TRUE)
1041 max_waiting_msecs = lowpri_max_waiting_msecs / 100;
1042 else
1043 max_waiting_msecs = lowpri_max_waiting_msecs;
1044 #endif
1045 if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
1046 max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;
1047
1048 numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
1049 max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);
1050
1051 for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
1052 if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
1053 if (sleep_cnt == 0) {
1054 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
1055 ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
1056 }
1057 IOSleep(LOWPRI_SLEEP_INTERVAL);
1058 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
1059 } else {
1060 break;
1061 }
1062 }
1063 if (sleep_cnt) {
1064 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1065 ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
1066 }
1067 }
1068 SInt32 oldValue;
1069 oldValue = OSDecrementAtomic(&info->numthreads_throttling);
1070
1071 if (oldValue <= 0) {
1072 panic("%s: numthreads negative", __func__);
1073 }
1074 done:
1075 ut->uu_lowpri_window = 0;
1076 if (ut->uu_throttle_info)
1077 throttle_info_rel(ut->uu_throttle_info);
1078 ut->uu_throttle_info = NULL;
1079 ut->uu_throttle_bc = FALSE;
1080
1081 return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
1082 }
1083
1084 /*
1085 * KPI routine
1086 *
1087 * set a kernel thread's IO policy. policy can be:
1088 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
1089 *
1090 * explanations about these policies are in the man page of setiopolicy_np
1091 */
1092 void throttle_set_thread_io_policy(int policy)
1093 {
1094 #if !CONFIG_EMBEDDED
1095 proc_apply_thread_selfdiskacc(policy);
1096 #else /* !CONFIG_EMBEDDED */
1097 struct uthread *ut;
1098 ut = get_bsdthread_info(current_thread());
1099 ut->uu_iopol_disk = policy;
1100 #endif /* !CONFIG_EMBEDDED */
1101 }
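
/*
 * Illustrative aside: a sketch (function name invented) of a kernel
 * thread demoting its own I/O with the KPI above while doing background
 * work, then restoring the default policy.  Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static void
example_background_scan(void)
{
	throttle_set_thread_io_policy(IOPOL_THROTTLE);

	/* ... issue low-priority reads/writes here ... */

	throttle_set_thread_io_policy(IOPOL_NORMAL);
}
#endif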
1102
1103
1104 static
1105 void throttle_info_reset_window(struct uthread *ut)
1106 {
1107 struct _throttle_io_info_t *info;
1108
1109 info = ut->uu_throttle_info;
1110
1111 OSDecrementAtomic(&info->numthreads_throttling);
1112 throttle_info_rel(info);
1113 ut->uu_throttle_info = NULL;
1114 ut->uu_lowpri_window = 0;
1115 }
1116
1117 static
1118 void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
1119 {
1120 SInt32 oldValue;
1121
1122 ut->uu_throttle_info = info;
1123 throttle_info_ref(info);
1124 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1125
1126 oldValue = OSIncrementAtomic(&info->numthreads_throttling);
1127 if (oldValue < 0) {
1128 panic("%s: numthreads negative", __func__);
1129 }
1130 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
1131 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
1132 ut->uu_throttle_isssd = isssd;
1133 ut->uu_throttle_bc = BC_throttle;
1134 }
1135
1136
1137 static
1138 void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
1139 {
1140 struct _throttle_io_info_t *info = throttle_info;
1141 struct uthread *ut;
1142 int policy;
1143 int is_throttleable_io = 0;
1144 int is_passive_io = 0;
1145
1146 if (!lowpri_IO_initial_window_msecs || (info == NULL))
1147 return;
1148 policy = throttle_get_io_policy(&ut);
1149
1150 switch (policy) {
1151 case IOPOL_DEFAULT:
1152 case IOPOL_NORMAL:
1153 break;
1154 case IOPOL_THROTTLE:
1155 is_throttleable_io = 1;
1156 break;
1157 case IOPOL_PASSIVE:
1158 is_passive_io = 1;
1159 break;
1160 default:
1161 printf("unknown I/O policy %d", policy);
1162 break;
1163 }
1164
1165 if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
1166 is_passive_io |= 1;
1167
1168 if (!is_throttleable_io) {
1169 if (!is_passive_io){
1170 microuptime(&info->last_normal_IO_timestamp);
1171 }
1172 } else if (ut) {
1173 /*
1174 * I'd really like to do the IOSleep here, but
1175 * we may be holding all kinds of filesystem related locks
1176 * and the pages for this I/O marked 'busy'...
1177 * we don't want to cause a normal task to block on
1178 * one of these locks while we're throttling a task marked
1179 * for low priority I/O... we'll mark the uthread and
1180 * do the delay just before we return from the system
1181 * call that triggered this I/O or from vnode_pagein
1182 */
1183 if (ut->uu_lowpri_window == 0)
1184 throttle_info_set_initial_window(ut, info, isssd, FALSE);
1185 else {
1186 /* The thread sends I/Os to different devices within the same system call */
1187 if (ut->uu_throttle_info != info) {
1188 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
1189
1190 // keep track of the numthreads in the right device
1191 OSDecrementAtomic(&old_info->numthreads_throttling);
1192 OSIncrementAtomic(&info->numthreads_throttling);
1193
1194 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
1195 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
1196 /* This thread no longer needs a reference on that throttle info */
1197 throttle_info_rel(ut->uu_throttle_info);
1198 ut->uu_throttle_info = info;
1199 /* Need to take a reference on this throttle info */
1200 throttle_info_ref(ut->uu_throttle_info);
1201 }
1202 int numthreads = MAX(1, info->numthreads_throttling);
1203 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
1204 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
1205 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
1206
1207 if (isssd == FALSE) {
1208 /*
1209 * we're here because we've actually issued I/Os to different devices...
1210 * if at least one of them was a non-SSD, then throttle the thread
1211 * using the policy for non SSDs
1212 */
1213 ut->uu_throttle_isssd = FALSE;
1214 }
1215 }
1216 }
1217 }
1218
1219 /*
1220 * KPI routine
1221 *
1222 * this is usually called before every I/O, used for throttled I/O
1223 * book keeping. This routine has low overhead and does not sleep
1224 */
1225 void throttle_info_update(void *throttle_info, int flags)
1226 {
1227 throttle_info_update_internal(throttle_info, flags, FALSE);
1228 }
1229
1230 /*
1231 * KPI routine
1232 *
1233 * this is usually called before every I/O, used for throttled I/O
1234 * book keeping. This routine has low overhead and does not sleep
1235 */
1236 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
1237 {
1238 void *throttle_info = throttle_info_handle;
1239 /* for now we only use the lowest bit of the throttle mask, so the
1240 * handle is the same as the throttle_info. Later if we store a
1241 * set of throttle infos in the handle, we will want to loop through
1242 * them and call throttle_info_update in a loop
1243 */
1244 throttle_info_update(throttle_info, flags);
1245 }
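
/*
 * Illustrative aside: a sketch (names invented) of the per-I/O pattern
 * for a caller that only has a throttle mask, e.g. one obtained via
 * DKIOCGETTHROTTLEMASK: take a handle, do the bookkeeping before the
 * I/O, then drop the handle.  Any actual delay is applied later by
 * throttle_lowpri_io() as the thread leaves the system call.  Guarded
 * out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
example_issue_io(uint64_t throttle_mask, int bflags)
{
	throttle_info_handle_t handle;
	int error;

	error = throttle_info_ref_by_mask(throttle_mask, &handle);
	if (error)
		return (error);

	throttle_info_update_by_mask((void *)handle, bflags);
	/* ... issue the I/O here ... */

	throttle_info_rel_by_mask(handle);
	return (0);
}
#endif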
1246
1247 extern int ignore_is_ssd;
1248
1249 int
1250 spec_strategy(struct vnop_strategy_args *ap)
1251 {
1252 buf_t bp;
1253 int bflags;
1254 int policy;
1255 dev_t bdev;
1256 uthread_t ut;
1257 mount_t mp;
1258 int strategy_ret;
1259 struct _throttle_io_info_t *throttle_info;
1260 boolean_t isssd = FALSE;
1261
1262 bp = ap->a_bp;
1263 bdev = buf_device(bp);
1264 mp = buf_vnode(bp)->v_mount;
1265
1266 policy = throttle_get_io_policy(&ut);
1267
1268 if (policy == IOPOL_THROTTLE) {
1269 bp->b_flags |= B_THROTTLED_IO;
1270 bp->b_attr.ba_flags |= BA_THROTTLED_IO;
1271 bp->b_flags &= ~B_PASSIVE;
1272 } else if (policy == IOPOL_PASSIVE)
1273 bp->b_flags |= B_PASSIVE;
1274
1275 bflags = bp->b_flags;
1276
1277 if (kdebug_enable) {
1278 int code = 0;
1279
1280 if (bflags & B_READ)
1281 code |= DKIO_READ;
1282 if (bflags & B_ASYNC)
1283 code |= DKIO_ASYNC;
1284
1285 if (bflags & B_META)
1286 code |= DKIO_META;
1287 else if (bflags & B_PAGEIO)
1288 code |= DKIO_PAGING;
1289
1290 if (bflags & B_THROTTLED_IO)
1291 code |= DKIO_THROTTLE;
1292 else if (bflags & B_PASSIVE)
1293 code |= DKIO_PASSIVE;
1294
1295 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1296 bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
1297 }
1298 if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
1299 mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
1300 hard_throttle_on_root = 1;
1301
1302 if (mp != NULL) {
1303 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
1304 isssd = TRUE;
1305 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
1306 } else
1307 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1308
1309 throttle_info_update_internal(throttle_info, bflags, isssd);
1310
1311 if ((bflags & B_READ) == 0) {
1312 microuptime(&throttle_info->last_IO_timestamp);
1313 if (mp) {
1314 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
1315 }
1316 } else if (mp) {
1317 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
1318 }
1319 /*
1320 * The BootCache may give us special information about
1321 * the IO, so it returns special values that we check
1322 * for here.
1323 *
1324 * IO_SATISFIED_BY_CACHE
1325 * The read has been satisfied by the boot cache. Don't
1326 * throttle the thread unnecessarily.
1327 *
1328 * IO_SHOULD_BE_THROTTLED
1329 * The boot cache is playing back a playlist and this IO
1330 * cut through. Throttle it so we're not cutting through
1331 * the boot cache too often.
1332 *
1333 * Note that typical strategy routines are defined with
1334 * a void return so we'll get garbage here. In the
1335 * unlikely case the garbage matches our special return
1336 * value, it's not a big deal since we're only adjusting
1337 * the throttling delay.
1338 */
1339 #define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
1340 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
1341 typedef int strategy_fcn_ret_t(struct buf *bp);
1342
1343 strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
1344
1345 if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
1346 /*
1347 * If this was a throttled IO satisfied by the boot cache,
1348 * don't delay the thread.
1349 */
1350 throttle_info_reset_window(ut);
1351
1352 } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
1353 /*
1354 * If the boot cache indicates this IO should be throttled,
1355 * delay the thread.
1356 */
1357 throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
1358 }
1359 return (0);
1360 }
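
/*
 * Illustrative aside: a sketch (helper names invented) of a hypothetical
 * caching driver's strategy routine that participates in the BootCache
 * return convention checked above.  Ordinary strategy routines are
 * declared void and take no part in it.  Guarded out of compilation.
 */
#if 0	/* illustrative only -- never compiled */
static int
example_cache_strategy(struct buf *bp)
{
	if (example_cache_hit(bp)) {			/* invented helper */
		example_complete_from_cache(bp);	/* invented helper */
		return (IO_SATISFIED_BY_CACHE);
	}
	if (example_playback_active()) {		/* invented helper */
		example_pass_to_disk(bp);		/* invented helper */
		return (IO_SHOULD_BE_THROTTLED);
	}
	example_pass_to_disk(bp);			/* invented helper */
	return (0);
}
#endif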
1361
1362
1363 /*
1364 * Block map is not supported for special files; return ENOTSUP.
1365 */
1366 int
1367 spec_blockmap(__unused struct vnop_blockmap_args *ap)
1368 {
1369 return (ENOTSUP);
1370 }
1371
1372
1373 /*
1374 * Device close routine
1375 */
1376 int
1377 spec_close(struct vnop_close_args *ap)
1378 {
1379 struct vnode *vp = ap->a_vp;
1380 dev_t dev = vp->v_rdev;
1381 int error = 0;
1382 int flags = ap->a_fflag;
1383 struct proc *p = vfs_context_proc(ap->a_context);
1384 struct session *sessp;
1385 int do_rele = 0;
1386
1387 switch (vp->v_type) {
1388
1389 case VCHR:
1390 /*
1391 * Hack: a tty device that is a controlling terminal
1392 * has a reference from the session structure.
1393 * We cannot easily tell that a character device is
1394 * a controlling terminal, unless it is the closing
1395 * process' controlling terminal. In that case,
1396 * if the reference count is 1 (this is the very
1397 * last close), we clear the session's terminal fields and drop its
1398 * vnode reference.
1399 */
1399 sessp = proc_session(p);
1400 if (sessp != SESSION_NULL) {
1401 if ((vcount(vp) == 1) &&
1402 (vp == sessp->s_ttyvp)) {
1403
1404 session_lock(sessp);
1405 if (vp == sessp->s_ttyvp) {
1406 sessp->s_ttyvp = NULL;
1407 sessp->s_ttyvid = 0;
1408 sessp->s_ttyp = TTY_NULL;
1409 sessp->s_ttypgrpid = NO_PID;
1410 do_rele = 1;
1411 }
1412 session_unlock(sessp);
1413
1414 if (do_rele) {
1415 vnode_rele(vp);
1416 }
1417 }
1418 session_rele(sessp);
1419 }
1420
1421 devsw_lock(dev, S_IFCHR);
1422
1423 vp->v_specinfo->si_opencount--;
1424
1425 if (vp->v_specinfo->si_opencount < 0) {
1426 panic("Negative open count?");
1427 }
1428 /*
1429 * close on last reference or on vnode revoke call
1430 */
1431 if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
1432 devsw_unlock(dev, S_IFCHR);
1433 return (0);
1434 }
1435
1436 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
1437
1438 devsw_unlock(dev, S_IFCHR);
1439 break;
1440
1441 case VBLK:
1442 /*
1443 * If there is more than one outstanding open, don't
1444 * send the close to the device.
1445 */
1446 devsw_lock(dev, S_IFBLK);
1447 if (vcount(vp) > 1) {
1448 vp->v_specinfo->si_opencount--;
1449 devsw_unlock(dev, S_IFBLK);
1450 return (0);
1451 }
1452 devsw_unlock(dev, S_IFBLK);
1453
1454 /*
1455 * On last close of a block device (that isn't mounted)
1456 * we must invalidate any in core blocks, so that
1457 * we can, for instance, change floppy disks.
1458 */
1459 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
1460 return (error);
1461
1462 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
1463 if (error)
1464 return (error);
1465
1466 devsw_lock(dev, S_IFBLK);
1467
1468 vp->v_specinfo->si_opencount--;
1469
1470 if (vp->v_specinfo->si_opencount < 0) {
1471 panic("Negative open count?");
1472 }
1473
1474 if (vcount(vp) > 0) {
1475 devsw_unlock(dev, S_IFBLK);
1476 return (0);
1477 }
1478
1479 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
1480
1481 devsw_unlock(dev, S_IFBLK);
1482 break;
1483
1484 default:
1485 panic("spec_close: not special");
1486 return(EBADF);
1487 }
1488
1489 return error;
1490 }
1491
1492 /*
1493 * Return POSIX pathconf information applicable to special devices.
1494 */
1495 int
1496 spec_pathconf(struct vnop_pathconf_args *ap)
1497 {
1498
1499 switch (ap->a_name) {
1500 case _PC_LINK_MAX:
1501 *ap->a_retval = LINK_MAX;
1502 return (0);
1503 case _PC_MAX_CANON:
1504 *ap->a_retval = MAX_CANON;
1505 return (0);
1506 case _PC_MAX_INPUT:
1507 *ap->a_retval = MAX_INPUT;
1508 return (0);
1509 case _PC_PIPE_BUF:
1510 *ap->a_retval = PIPE_BUF;
1511 return (0);
1512 case _PC_CHOWN_RESTRICTED:
1513 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
1514 return (0);
1515 case _PC_VDISABLE:
1516 *ap->a_retval = _POSIX_VDISABLE;
1517 return (0);
1518 default:
1519 return (EINVAL);
1520 }
1521 /* NOTREACHED */
1522 }
1523
1524 /*
1525 * Special device failed operation
1526 */
1527 int
1528 spec_ebadf(__unused void *dummy)
1529 {
1530
1531 return (EBADF);
1532 }
1533
1534 /* Blktooff derives file offset from logical block number */
1535 int
1536 spec_blktooff(struct vnop_blktooff_args *ap)
1537 {
1538 struct vnode *vp = ap->a_vp;
1539
1540 switch (vp->v_type) {
1541 case VCHR:
1542 *ap->a_offset = (off_t)-1; /* failure */
1543 return (ENOTSUP);
1544
1545 case VBLK:
1546 printf("spec_blktooff: not implemented for VBLK\n");
1547 *ap->a_offset = (off_t)-1; /* failure */
1548 return (ENOTSUP);
1549
1550 default:
1551 panic("spec_blktooff type");
1552 }
1553 /* NOTREACHED */
1554
1555 return (0);
1556 }
1557
1558 /* Offtoblk derives logical block number from file offset */
1559 int
1560 spec_offtoblk(struct vnop_offtoblk_args *ap)
1561 {
1562 struct vnode *vp = ap->a_vp;
1563
1564 switch (vp->v_type) {
1565 case VCHR:
1566 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1567 return (ENOTSUP);
1568
1569 case VBLK:
1570 printf("spec_offtoblk: not implemented for VBLK\n");
1571 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1572 return (ENOTSUP);
1573
1574 default:
1575 panic("spec_offtoblk type");
1576 }
1577 /* NOTREACHED */
1578
1579 return (0);
1580 }
1581
1582 static void filt_specdetach(struct knote *kn);
1583 static int filt_spec(struct knote *kn, long hint);
1584 static unsigned filt_specpeek(struct knote *kn);
1585
1586 struct filterops spec_filtops = {
1587 .f_isfd = 1,
1588 .f_attach = filt_specattach,
1589 .f_detach = filt_specdetach,
1590 .f_event = filt_spec,
1591 .f_peek = filt_specpeek
1592 };
1593
1594 static int
1595 filter_to_seltype(int16_t filter)
1596 {
1597 switch (filter) {
1598 case EVFILT_READ:
1599 return FREAD;
1600 case EVFILT_WRITE:
1601 return FWRITE;
1602 break;
1603 default:
1604 panic("filter_to_seltype(): invalid filter %d\n", filter);
1605 return 0;
1606 }
1607 }
1608
1609 static int
1610 filt_specattach(struct knote *kn)
1611 {
1612 vnode_t vp;
1613 dev_t dev;
1614
1615 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
1616
1617 assert(vnode_ischr(vp));
1618
1619 dev = vnode_specrdev(vp);
1620
1621 if (major(dev) >= nchrdev) {
1622 return ENXIO;
1623 }
1624
1625 if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
1626 return EINVAL;
1627 }
1628
1629 /* Resulting wql is safe to unlink even if it has never been linked */
1630 kn->kn_hook = wait_queue_link_allocate();
1631 if (kn->kn_hook == NULL) {
1632 return EAGAIN;
1633 }
1634
1635 kn->kn_fop = &spec_filtops;
1636 kn->kn_hookid = vnode_vid(vp);
1637
1638 knote_markstayqueued(kn);
1639
1640 return 0;
1641 }
1642
1643 static void
1644 filt_specdetach(struct knote *kn)
1645 {
1646 kern_return_t ret;
1647
1648 /*
1649 * Given wait queue link and wait queue set, unlink. This is subtle.
1650 * If the device has been revoked from under us, selclearthread() will
1651 * have removed our link from the kqueue's wait queue set, which
1652 * wait_queue_set_unlink_one() will detect and handle.
1653 */
1654 ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
1655 if (ret != KERN_SUCCESS) {
1656 panic("filt_specdetach(): failed to unlink wait queue link.");
1657 }
1658
1659 (void)wait_queue_link_free(kn->kn_hook);
1660 kn->kn_hook = NULL;
1661 kn->kn_status &= ~KN_STAYQUEUED;
1662 }
1663
1664 static int
1665 filt_spec(struct knote *kn, long hint)
1666 {
1667 vnode_t vp;
1668 uthread_t uth;
1669 wait_queue_set_t old_wqs;
1670 vfs_context_t ctx;
1671 int selres;
1672 int error;
1673 int use_offset;
1674 dev_t dev;
1675 uint64_t flags;
1676
1677 assert(kn->kn_hook != NULL);
1678
1679 if (hint != 0) {
1680 panic("filt_spec(): nonzero hint?");
1681 }
1682
1683 uth = get_bsdthread_info(current_thread());
1684 ctx = vfs_context_current();
1685 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1686
1687 error = vnode_getwithvid(vp, kn->kn_hookid);
1688 if (error != 0) {
1689 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1690 return 1;
1691 }
1692
1693 dev = vnode_specrdev(vp);
1694 flags = cdevsw_flags[major(dev)];
1695 use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
1696 assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
1697
1698 /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
1699 old_wqs = uth->uu_wqset;
1700 uth->uu_wqset = kn->kn_kq->kq_wqs;
1701 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1702 uth->uu_wqset = old_wqs;
1703
1704 if (use_offset) {
1705 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
1706 kn->kn_data = 0;
1707 } else {
1708 kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
1709 }
1710 } else {
1711 kn->kn_data = selres;
1712 }
1713
1714 vnode_put(vp);
1715
1716 return (kn->kn_data != 0);
1717 }
1718
1719 static unsigned
1720 filt_specpeek(struct knote *kn)
1721 {
1722 vnode_t vp;
1723 uthread_t uth;
1724 wait_queue_set_t old_wqs;
1725 vfs_context_t ctx;
1726 int error, selres;
1727
1728 uth = get_bsdthread_info(current_thread());
1729 ctx = vfs_context_current();
1730 vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
1731
1732 error = vnode_getwithvid(vp, kn->kn_hookid);
1733 if (error != 0) {
1734 return 1; /* Just like VNOP_SELECT() on recycled vnode */
1735 }
1736
1737 /*
1738 * Why pass the link here? Because we may not have registered in the past...
1739 */
1740 old_wqs = uth->uu_wqset;
1741 uth->uu_wqset = kn->kn_kq->kq_wqs;
1742 selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
1743 uth->uu_wqset = old_wqs;
1744
1745 vnode_put(vp);
1746 return selres;
1747 }
1748