/*
 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>

/* XXX following prototypes should be in a header file somewhere */
extern dev_t	chrtoblk(dev_t dev);
extern int	iskmemdev(dev_t dev);
extern int	bpfkqfilter(dev_t dev, struct knote *kn);
extern int	ptsd_kqfilter(dev_t dev, struct knote *kn);

extern int ignore_is_ssd;

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char	devopn[] = "devopn";
char	devio[] = "devio";
char	devwait[] = "devwait";
char	devin[] = "devin";
char	devout[] = "devout";
char	devioc[] = "devioc";
char	devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup },		/* lookup */
	{ &vnop_create_desc, (VOPFUNC)err_create },		/* create */
	{ &vnop_mknod_desc, (VOPFUNC)err_mknod },		/* mknod */
	{ &vnop_open_desc, (VOPFUNC)spec_open },		/* open */
	{ &vnop_close_desc, (VOPFUNC)spec_close },		/* close */
	{ &vnop_access_desc, (VOPFUNC)spec_access },		/* access */
	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr },		/* getattr */
	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr },		/* setattr */
	{ &vnop_read_desc, (VOPFUNC)spec_read },		/* read */
	{ &vnop_write_desc, (VOPFUNC)spec_write },		/* write */
	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },		/* ioctl */
	{ &vnop_select_desc, (VOPFUNC)spec_select },		/* select */
	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke },		/* revoke */
	{ &vnop_mmap_desc, (VOPFUNC)err_mmap },			/* mmap */
	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync },		/* fsync */
	{ &vnop_remove_desc, (VOPFUNC)err_remove },		/* remove */
	{ &vnop_link_desc, (VOPFUNC)err_link },			/* link */
	{ &vnop_rename_desc, (VOPFUNC)err_rename },		/* rename */
	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir },		/* mkdir */
	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir },		/* rmdir */
	{ &vnop_symlink_desc, (VOPFUNC)err_symlink },		/* symlink */
	{ &vnop_readdir_desc, (VOPFUNC)err_readdir },		/* readdir */
	{ &vnop_readlink_desc, (VOPFUNC)err_readlink },		/* readlink */
	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive },		/* inactive */
	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },		/* reclaim */
	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy },	/* strategy */
	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },	/* pathconf */
	{ &vnop_advlock_desc, (VOPFUNC)err_advlock },		/* advlock */
	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },		/* bwrite */
	{ &vnop_pagein_desc, (VOPFUNC)err_pagein },		/* Pagein */
	{ &vnop_pageout_desc, (VOPFUNC)err_pageout },		/* Pageout */
	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile },		/* Copyfile */
	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },	/* blktooff */
	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },	/* offtoblk */
	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },	/* blockmap */
	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);


#define THROTTLE_LEVEL_NONE	-1
#define THROTTLE_LEVEL_TIER0	 0

#define THROTTLE_LEVEL_THROTTLED 1
#define THROTTLE_LEVEL_TIER1	 1
#define THROTTLE_LEVEL_TIER2	 2

#define THROTTLE_LEVEL_START	 0
#define THROTTLE_LEVEL_END	 2


struct _throttle_io_info_t {
	struct timeval	throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	struct timeval	throttle_last_write_timestamp;
	struct timeval	throttle_start_IO_period_timestamp;

	TAILQ_HEAD( , uthread) throttle_uthlist;	/* List of throttled uthreads */

	lck_mtx_t	throttle_lock;
	thread_call_t	throttle_timer_call;
	int32_t		throttle_timer_running;
	int32_t		throttle_io_count;
	int32_t		throttle_io_count_begin;
	int32_t		throttle_io_period;
	uint32_t	throttle_io_period_num;
	int32_t		throttle_refcnt;
	int32_t		throttle_alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];

static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd);
static int throttle_get_thread_throttle_level(uthread_t ut, int policy);

__private_extern__ int32_t throttle_legacy_process_count = 0;

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{

	*ap->a_vpp = NULL;
	return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
	int (*size)(dev_t);
	int rsize;

	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
		rsize = (*size)(dev);
		if (rsize <= 0)		/* did size fail? */
			vp->v_specsize = DEV_BSIZE;
		else
			vp->v_specsize = rsize;
	}
	else
		vp->v_specsize = DEV_BSIZE;
}

void
set_fsblocksize(struct vnode *vp)
{

	if (vp->v_type == VBLK) {
		dev_t dev = (dev_t)vp->v_rdev;
		int maj = major(dev);

		if ((u_int)maj >= (u_int)nblkdev)
			return;

		vnode_lock(vp);
		set_blocksize(vp, dev);
		vnode_unlock(vp);
	}

}


/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}

		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);

		if (error == 0 && (D_TYPEMASK & cdevsw[maj].d_type) == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
			int isssd = 0;
			uint64_t throttle_mask = 0;
			uint32_t devbsdunit = 0;

			if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {

				if (throttle_mask != 0 &&
				    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
					/*
					 * as a reasonable approximation, only use the lowest bit of the mask
					 * to generate a disk unit number
					 */
					devbsdunit = num_trailing_0(throttle_mask);

					vnode_lock(vp);

					vp->v_un.vu_specinfo->si_isssd = isssd;
					vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
					vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
					vp->v_un.vu_specinfo->si_throttleable = 1;
					vp->v_un.vu_specinfo->si_initted = 1;

					vnode_unlock(vp);
				}
			}
			if (vp->v_un.vu_specinfo->si_initted == 0) {
				vnode_lock(vp);
				vp->v_un.vu_specinfo->si_initted = 1;
				vnode_unlock(vp);
			}
		}
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && isdisk(dev, VBLK))
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;


			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
					error = ENXIO;
			}


			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
				vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);

		}
		return(error);
	default:
		panic("spec_open type");
	}
	return (0);
}

/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE)
			return (EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
				    (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			n = min((unsigned)(n - on), uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}

/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
			struct _throttle_io_info_t *throttle_info;

			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);

			microuptime(&throttle_info->throttle_last_write_timestamp);
		}
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);

		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return(EINVAL);

		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;


		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are writing exactly a block on a block-
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The write doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread() to do a
			 * read-modify-write.
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the read to what is there */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			if (io_sync)
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}

/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev = ap->a_vp->v_rdev;
	int retval = 0;

	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

	switch (ap->a_vp->v_type) {

	case VCHR:
		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, p);
		break;

	case VBLK:
		if (kdebug_enable) {
			if (ap->a_command == DKIOCUNMAP) {
				dk_unmap_t *unmap;
				dk_extent_t *extent;
				uint32_t i;

				unmap = (dk_unmap_t *)ap->a_data;
				extent = unmap->extents;

				for (i = 0; i < unmap->extentsCount; i++, extent++) {
					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);
				}
			}
		}
		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
		break;

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
	    (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

	return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
	proc_t p = vfs_context_proc(ap->a_context);
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (1);		/* XXX */

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
	}
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
	dev_t dev;
	int err = EINVAL;

	/*
	 * For a few special kinds of devices, we can attach knotes.
	 * Each filter function must check whether the dev type matches it.
	 */
	dev = vnode_specrdev(vp);

	if (vnode_istty(vp)) {
		/* We can hook into TTYs... */
		err = filt_specattach(kn);
	} else {
		/* Try a bpf device, as defined in bsd/net/bpf.c */
		err = bpfkqfilter(dev, kn);
	}

	return err;
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

	return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}


/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

void throttle_init(void);


#define LOWPRI_THROTTLE_WINDOW_MSECS		500
#define LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS	200
#define LOWPRI_IO_PERIOD_MSECS			200
#define LOWPRI_IO_PERIOD_SSD_MSECS		 20
#define LOWPRI_TIMER_PERIOD_MSECS		 10


int lowpri_throttle_window_msecs = LOWPRI_THROTTLE_WINDOW_MSECS;
int lowpri_legacy_throttle_window_msecs = LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS;
int lowpri_io_period_msecs = LOWPRI_IO_PERIOD_MSECS;
int lowpri_io_period_ssd_msecs = LOWPRI_IO_PERIOD_SSD_MSECS;
int lowpri_timer_period_msecs = LOWPRI_TIMER_PERIOD_MSECS;

/*
 * If a process requiring legacy iothrottle behavior is running on the
 * system, use legacy limits for throttle window and max IO size.
 */
#if CONFIG_EMBEDDED
#define THROTTLE_WINDOW (lowpri_throttle_window_msecs)
#else
#define THROTTLE_WINDOW (throttle_legacy_process_count == 0 ? lowpri_throttle_window_msecs : lowpri_legacy_throttle_window_msecs)
#endif
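
/*
 * Worked example for THROTTLE_WINDOW (editorial note, using the default
 * tunables above): with no legacy-iothrottle processes registered, the
 * window is lowpri_throttle_window_msecs (500 ms); once any process has
 * called throttle_legacy_process_incr(), it shrinks to
 * lowpri_legacy_throttle_window_msecs (200 ms) until the matching
 * throttle_legacy_process_decr() drops the count back to zero. On
 * CONFIG_EMBEDDED builds the window is always the non-legacy value.
 */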

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)	\
	do {							\
		if ((debug_info)->alloc)			\
			printf("%s: "format, __FUNCTION__, ## args); \
	} while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_legacy_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_ssd_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_timer_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_timer_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_process_count, CTLFLAG_RD | CTLFLAG_LOCKED, &throttle_legacy_process_count, 0, "");

static lck_grp_t	*throttle_mtx_grp;
static lck_attr_t	*throttle_mtx_attr;
static lck_grp_attr_t	*throttle_mtx_grp_attr;


/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply count sequentially from the lowest bit
	 */
	if (n == 0)
		return sizeof(n) * 8;
	int count = 0;
	while (!ISSET(n, 1)) {
		n >>= 1;
		++count;
	}
	return count;
}
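
/*
 * Example: a throttle_mask of 0x8 has its lowest set bit at position 3, so
 * num_trailing_0(0x8) == 3 and that device's I/O is accounted to
 * _throttle_io_info[3]. A mask of 0 returns 64 (sizeof(uint64_t) * 8),
 * which is why callers such as spec_open() check for a zero mask first.
 */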


/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );

	/* The reference count just went negative, very bad */
	if (oldValue == 0)
		panic("throttle info ref cnt went negative!");

	/*
	 * Once reference count is zero, no one else should be able to take a
	 * reference
	 */
	if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);

		lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);
		FREE(info, M_TEMP);
	}
	return oldValue;
}


/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);

	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
		info, (int)(oldValue -1), info );
	/* Allocated items should never have a reference of zero */
	if (info->throttle_alloc && (oldValue == 0))
		panic("Taking a reference without calling create throttle info!\n");

	return oldValue;
}


/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 * to free it
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count)
{
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	uint64_t	deadline;

	if (update_io_count == TRUE) {
		info->throttle_io_count_begin = info->throttle_io_count;
		info->throttle_io_period_num++;

		microuptime(&info->throttle_start_IO_period_timestamp);
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW) {
			/*
			 * we had an I/O occur in this level within
			 * our throttle window, so we need to
			 * make sure the timer continues to run
			 */
			break;
		}
	}
	if (throttle_level >= THROTTLE_LEVEL_END) {
		/*
		 * we're outside all of the throttle windows...
		 * don't start a new timer
		 */
		info->throttle_timer_running = 0;

		return (THROTTLE_LEVEL_END);
	}
	if (info->throttle_timer_running == 0) {
		/*
		 * take a reference for the timer
		 */
		throttle_info_ref(info);

		info->throttle_timer_running = 1;
	}
	clock_interval_to_deadline(lowpri_timer_period_msecs, 1000000, &deadline);

	thread_call_enter_delayed(info->throttle_timer_call, deadline);

	return (throttle_level);
}


static void
throttle_timer(struct _throttle_io_info_t *info)
{
	uthread_t	ut, utlist;
	struct timeval	elapsed;
	uint64_t	elapsed_msecs;
	int		throttle_level;
	boolean_t	update_io_count = FALSE;
	boolean_t	need_wakeup = FALSE;
	boolean_t	need_release = FALSE;

	lck_mtx_lock(&info->throttle_lock);

	microuptime(&elapsed);
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_period) {
		/*
		 * we're closing out the current IO period...
		 * if we have a waiting thread, wake it up
		 * after we have reset the I/O window info
		 */
		need_wakeup = TRUE;
		update_io_count = TRUE;
	}
	if ((throttle_level = throttle_timer_start(info, update_io_count)) == THROTTLE_LEVEL_END) {
		/*
		 * we are now outside of the throttle window
		 * for all throttle levels...
		 *
		 * the timer is not restarted in this case, so
		 * we need to get rid of the reference we took when
		 * we started up the timer... we can't do this
		 * until we are entirely done playing with 'info'
		 */
		need_release = TRUE;
	}

	TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist, uu_throttlelist, utlist) {
		/*
		 * if we are now outside of the throttle window, release
		 * all of the currently blocked threads; otherwise
		 * look for threads that have had their IO policy changed
		 * by someone else and are no longer throttleable, or that
		 * are no longer at the current throttle level, and
		 * unblock them
		 */
		if (throttle_level == THROTTLE_LEVEL_END || throttle_get_thread_throttle_level(ut, -1) <= throttle_level) {

			TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
			ut->uu_on_throttlelist = 0;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	if (need_wakeup && !TAILQ_EMPTY(&info->throttle_uthlist)) {
		/*
		 * we've entered a new I/O period and we're still
		 * in the throttle window, so wakeup the next guy in line
		 */
		ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist);
		TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
		ut->uu_on_throttlelist = 0;

		wakeup(&ut->uu_on_throttlelist);
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (need_release == TRUE)
		throttle_info_rel(info);
}


void
throttle_init(void)
{
	struct _throttle_io_info_t *info;
	int	i;

	/*
	 * allocate lock group attribute and group
	 */
	throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
	throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	throttle_mtx_attr = lck_attr_alloc_init();

	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
		info = &_throttle_io_info[i];

		lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

		TAILQ_INIT(&info->throttle_uthlist);
	}
}


/*
 * KPI routine
 *
 * wakeup and remove the specified thread from the throttle queue
 * if it's no longer in a throttleable state...
 * takes a valid uthread (which may or may not be on the
 * throttle queue) as input
 */
void
unthrottle_thread(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ((info = ut->uu_throttle_info) == NULL)
		return;

	lck_mtx_lock(&info->throttle_lock);

	if (ut->uu_on_throttlelist && throttle_get_thread_throttle_level(ut, -1) <= THROTTLE_LEVEL_THROTTLED) {
		TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
		ut->uu_on_throttlelist = 0;

		wakeup(&ut->uu_on_throttlelist);
	}
	lck_mtx_unlock(&info->throttle_lock);
}


/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
	struct _throttle_io_info_t *info;

	MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
	/* Should never happen but just in case */
	if (info == NULL)
		return NULL;
	/* Mark that this one was allocated and needs to be freed */
	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
	info->throttle_alloc = TRUE;

	lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

	TAILQ_INIT(&info->throttle_uthlist);

	/* Take a reference */
	OSIncrementAtomic(&info->throttle_refcnt);
	return info;
}
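
/*
 * Usage sketch (illustrative only; this shows the pairing contract the KPI
 * comments in this file describe, not code that exists elsewhere in XNU):
 *
 *	void *ti = throttle_info_create();	// creation reference
 *	throttle_info_mount_ref(mp, ti);	// in the mount path
 *	...
 *	throttle_info_update(ti, 0);		// before each throttleable I/O
 *	...
 *	throttle_info_mount_rel(mp);		// in the unmount path
 *	throttle_info_release(ti);		// drop the creation reference
 */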

/*
 * KPI routine
 *
 * Release the throttle info pointer when all the references are gone. Should
 * be called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
	DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
		(struct _throttle_io_info_t *)throttle_info,
		(struct _throttle_io_info_t *)throttle_info);
	if (throttle_info) /* Just to be careful */
		throttle_info_rel(throttle_info);
}

/*
 * KPI routine
 *
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by the cluster code). File systems that call
 * this in their mount routines must call throttle_info_mount_rel in their
 * unmount routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
	if ((throttle_info == NULL) || (mp == NULL))
		return;
	throttle_info_ref(throttle_info);

	/*
	 * We may already have a reference; release it before adding the new one
	 */
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = throttle_info;
}

/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask. The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
{
	int	dev_index;
	struct _throttle_io_info_t *info;

	if (throttle_info_handle == NULL)
		return EINVAL;

	dev_index = num_trailing_0(throttle_mask);
	info = &_throttle_io_info[dev_index];
	throttle_info_ref(info);
	*(struct _throttle_io_info_t**)throttle_info_handle = info;

	return 0;
}

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
	/*
	 * for now the handle is just a pointer to _throttle_io_info_t
	 */
	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
}
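
/*
 * Sketch of the by-mask variant (illustrative): given a throttle_mask that
 * a driver reported via DKIOCGETTHROTTLEMASK, a caller outside the mount
 * machinery can do:
 *
 *	throttle_info_handle_t h;
 *
 *	if (throttle_info_ref_by_mask(mask, &h) == 0) {
 *		throttle_info_update_by_mask(h, 0);	// account an I/O
 *		throttle_info_rel_by_mask(h);		// drop the reference
 *	}
 */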

/*
 * KPI routine
 *
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routine.
 */
void
throttle_info_mount_rel(mount_t mp)
{
	if (mp->mnt_throttle_info)
		throttle_info_rel(mp->mnt_throttle_info);
	mp->mnt_throttle_info = NULL;
}

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	*tv = info->throttle_last_write_timestamp;
}

void
update_last_io_time(mount_t mp)
{
	struct _throttle_io_info_t *info;

	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	microuptime(&info->throttle_last_write_timestamp);
}


int
throttle_get_io_policy(uthread_t *ut)
{
	*ut = get_bsdthread_info(current_thread());

	return (proc_get_task_selfdiskacc());
}



static int
throttle_get_thread_throttle_level(uthread_t ut, int policy)
{
	int	thread_throttle_level = THROTTLE_LEVEL_NONE;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	if (policy == -1)
		policy = proc_get_diskacc(ut->uu_thread);

	switch (policy) {

	case IOPOL_DEFAULT:
	case IOPOL_NORMAL:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		/* FALLTHROUGH: normal threads still get the BootCache check below */
	case IOPOL_PASSIVE:
		if (ut->uu_throttle_bc == TRUE)
			thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		printf("unknown I/O policy %d", policy);
		break;
	}
	return (thread_throttle_level);
}
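
/*
 * Example mappings from the switch above: IOPOL_THROTTLE yields
 * THROTTLE_LEVEL_TIER2 and IOPOL_UTILITY yields THROTTLE_LEVEL_TIER1.
 * IOPOL_DEFAULT and IOPOL_NORMAL yield THROTTLE_LEVEL_TIER0, while
 * IOPOL_PASSIVE leaves the level at THROTTLE_LEVEL_NONE; in either of
 * those cases a thread flagged by the BootCache (uu_throttle_bc == TRUE)
 * is demoted to THROTTLE_LEVEL_TIER2 by the shared fallthrough check.
 */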


static int
throttle_io_will_be_throttled_internal(void * throttle_info)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	uint64_t elapsed_msecs;
	int	thread_throttle_level;
	int	throttle_level;

	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL, -1)) < THROTTLE_LEVEL_THROTTLED)
		return (0);

	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * that affect the throttle level of this thread,
		 * so go ahead and treat as normal I/O
		 */
		return (0);
	}
	if (info->throttle_io_count != info->throttle_io_count_begin) {
		/*
		 * we've already issued at least one throttleable I/O
		 * in the current I/O window, so avoid issuing another one
		 */
		return (2);
	}
	/*
	 * we're in the throttle window, so
	 * cut the I/O size back
	 */
	return (1);
}

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
{
	void	*info;

	/*
	 * Should we just return zero if no mount point
	 */
	if (mp == NULL)
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	else if (mp->mnt_throttle_info == NULL)
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	else
		info = mp->mnt_throttle_info;

	return throttle_io_will_be_throttled_internal(info);
}


uint32_t
throttle_lowpri_io(int sleep_amount)
{
	uthread_t ut;
	struct _throttle_io_info_t *info;
	int	throttle_type = 0;
	int	sleep_cnt = 0;
	int	locked = 0;
	uint32_t throttle_io_period_num = 0;
	boolean_t insert_tail = TRUE;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window == 0)
		return (0);

	info = ut->uu_throttle_info;

	if ((sleep_amount == 0) || (info == NULL))
		goto done;

	if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
		sleep_amount = 0;

	throttle_io_period_num = info->throttle_io_period_num;

	while ( (throttle_type = throttle_io_will_be_throttled_internal(info)) ) {

		if (throttle_type == 1) {
			if (sleep_amount == 0)
				break;
			if (info->throttle_io_period_num < throttle_io_period_num)
				break;
			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
				break;
		}
		if (!locked) {
			lck_mtx_lock(&info->throttle_lock);
			locked = 1;
		}
		if (info->throttle_timer_running == 0) {
			/*
			 * try to start the timer since it's
			 * currently not running. on failure, no
			 * timer reference to drop since it wasn't started
			 */
			if (throttle_timer_start(info, TRUE) == THROTTLE_LEVEL_END)
				goto done;
		}
		if (sleep_cnt == 0) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
			    ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
		}
		if (ut->uu_on_throttlelist == 0) {
			if (insert_tail == TRUE)
				TAILQ_INSERT_TAIL(&info->throttle_uthlist, ut, uu_throttlelist);
			else
				TAILQ_INSERT_HEAD(&info->throttle_uthlist, ut, uu_throttlelist);

			ut->uu_on_throttlelist = 1;
		}
		msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);

		sleep_cnt++;

		if (sleep_amount == 0)
			insert_tail = FALSE;
		else if (info->throttle_io_period_num < throttle_io_period_num ||
			 (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
			insert_tail = FALSE;
			sleep_amount = 0;
		}
	}
done:
	if (ut->uu_on_throttlelist) {
		if (!locked) {
			lck_mtx_lock(&info->throttle_lock);
			locked = 1;
		}
		if (ut->uu_on_throttlelist) {
			TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);

			ut->uu_on_throttlelist = 0;
		}
	}
	if (locked)
		lck_mtx_unlock(&info->throttle_lock);

	if (sleep_cnt)
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		    ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
	if (info)
		throttle_info_rel(info);

	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = FALSE;
	ut->uu_lowpri_window = 0;

	return (sleep_cnt);
}
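
/*
 * Call-pattern sketch (illustrative): callers such as the system call
 * return path check the uthread's uu_lowpri_window and, if it is set,
 * perform the deferred throttle sleep:
 *
 *	if (ut->uu_lowpri_window)
 *		throttle_lowpri_io(1);
 *
 * A sleep_amount of 0 just tears down the window state without blocking;
 * a nonzero value bounds how many I/O periods the thread may be held.
 */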

/*
 * KPI routine
 *
 * set a kernel thread's IO policy. policy can be:
 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 *
 * explanations about these policies are in the man page of setiopolicy_np
 */
void throttle_set_thread_io_policy(int policy)
{
	proc_apply_thread_selfdiskacc(policy);
}
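
/*
 * Example (illustrative): a kernel thread doing background maintenance
 * I/O can demote itself around the work and restore itself afterwards:
 *
 *	throttle_set_thread_io_policy(IOPOL_THROTTLE);
 *	// ... issue low-priority I/O ...
 *	throttle_set_thread_io_policy(IOPOL_NORMAL);
 */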


static
void throttle_info_reset_window(uthread_t ut)
{
	struct _throttle_io_info_t *info;

	if ( (info = ut->uu_throttle_info) ) {
		throttle_info_rel(info);

		ut->uu_throttle_info = NULL;
		ut->uu_lowpri_window = 0;
		ut->uu_throttle_bc = FALSE;
	}
}

static
void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle)
{
	if (ut->uu_throttle_info == NULL) {

		ut->uu_throttle_info = info;
		throttle_info_ref(info);
		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );

		ut->uu_lowpri_window = THROTTLE_WINDOW;
		ut->uu_throttle_bc = BC_throttle;
	}
}


static
void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd)
{
	int	thread_throttle_level;

	if (THROTTLE_WINDOW == 0)
		return;

	if (ut == NULL)
		ut = get_bsdthread_info(current_thread());

	thread_throttle_level = throttle_get_thread_throttle_level(ut, policy);

	if (thread_throttle_level == THROTTLE_LEVEL_TIER0 && ISSET(flags, B_PASSIVE))
		thread_throttle_level = THROTTLE_LEVEL_NONE;

	if (thread_throttle_level != THROTTLE_LEVEL_NONE)
		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);

	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		if (info->throttle_io_period == 0) {

			if (isssd == TRUE)
				info->throttle_io_period = lowpri_io_period_ssd_msecs;
			else
				info->throttle_io_period = lowpri_io_period_msecs;

			if (info->throttle_io_period < lowpri_timer_period_msecs)
				info->throttle_io_period = lowpri_timer_period_msecs;
		}
		OSAddAtomic(1, &info->throttle_io_count);

		throttle_info_set_initial_window(ut, info, FALSE);
	}
}

void throttle_info_update_by_mount(mount_t mp)
{
	struct _throttle_io_info_t *info;
	uthread_t ut;
	boolean_t isssd = FALSE;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window)
		return;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	if (info->throttle_io_period == 0) {

		if (isssd == TRUE)
			info->throttle_io_period = lowpri_io_period_ssd_msecs;
		else
			info->throttle_io_period = lowpri_io_period_msecs;

		if (info->throttle_io_period < lowpri_timer_period_msecs)
			info->throttle_io_period = lowpri_timer_period_msecs;
	}
	throttle_info_set_initial_window(ut, info, FALSE);
}


/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update(void *throttle_info, int flags)
{
	if (throttle_info)
		throttle_info_update_internal(throttle_info, NULL, -1, flags, FALSE);
}

/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * bookkeeping. This routine has low overhead and does not sleep
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;

	/*
	 * for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info. Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info, flags);
}


int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	uint64_t elapsed_msecs;
	int	throttle_level;
	int	thread_throttle_level;

	switch (policy) {

	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		break;
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		if (elapsed_msecs < (uint64_t)THROTTLE_WINDOW)
			break;
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * so go ahead and treat as normal I/O
		 */
		return (0);
	}
	/*
	 * we're in the throttle window
	 */
	return (1);
}

void
throttle_legacy_process_incr(void)
{
	OSIncrementAtomic(&throttle_legacy_process_count);
}

void
throttle_legacy_process_decr(void)
{
	OSDecrementAtomic(&throttle_legacy_process_count);
}


int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp;
	int	bflags;
	int	policy;
	dev_t	bdev;
	uthread_t ut;
	mount_t mp;
	int	strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;
#if !CONFIG_EMBEDDED
	proc_t	curproc = current_proc();
#endif /* !CONFIG_EMBEDDED */

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;

	policy = throttle_get_io_policy(&ut);

	if (bp->b_flags & B_META)
		bp->b_attr.ba_flags |= BA_META;

	if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY) {
		bp->b_flags |= B_THROTTLED_IO;
		bp->b_attr.ba_flags |= BA_THROTTLED_IO;
		bp->b_flags &= ~B_PASSIVE;
	} else if (policy == IOPOL_PASSIVE)
		bp->b_flags |= B_PASSIVE;

#if !CONFIG_EMBEDDED
	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
		bp->b_attr.ba_flags |= BA_DELAYIDLESLEEP;
#endif /* !CONFIG_EMBEDDED */

	bflags = bp->b_flags;

	if (kdebug_enable) {
		int	code = 0;

		if (bflags & B_READ)
			code |= DKIO_READ;
		if (bflags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bflags & B_META)
			code |= DKIO_META;
		else if (bflags & B_PAGEIO)
			code |= DKIO_PAGING;

		if (bflags & B_THROTTLED_IO)
			code |= DKIO_THROTTLE;
		else if (bflags & B_PASSIVE)
			code |= DKIO_PASSIVE;

		if (bp->b_attr.ba_flags & BA_NOCACHE)
			code |= DKIO_NOCACHE;

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
		    bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
	}
	if (((bflags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
	    mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
		hard_throttle_on_root = 1;

	if (mp != NULL) {
		if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
			isssd = TRUE;
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];

	throttle_info_update_internal(throttle_info, ut, policy, bflags, isssd);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->throttle_last_write_timestamp);

		if (mp) {
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE	((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED	((int)0xcafebeef)
	typedef	int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);

	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, TRUE);
	}
	return (0);
}

/*
 * Block mapping is not supported for special files, so this always
 * fails with ENOTSUP.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return (ENOTSUP);
}


/*
 * Device close routine
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	int do_rele = 0;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal. In that case,
		 * if the reference count is 1 (this is the very
		 * last close)
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
				struct tty *tp;

				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					tp = SESSION_TP(sessp);
					sessp->s_ttyvp = NULL;
					sessp->s_ttyvid = 0;
					sessp->s_ttyp = TTY_NULL;
					sessp->s_ttypgrpid = NO_PID;
					do_rele = 1;
				}
				session_unlock(sessp);

				if (do_rele) {
					vnode_rele(vp);
					if (NULL != tp)
						ttyfree(tp);
				}
			}
			session_rele(sessp);
		}

		devsw_lock(dev, S_IFCHR);

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (c, %u, %u)", major(dev), minor(dev));

		/*
		 * close always, or close on last reference, or close on revoke
		 */
		if ((D_TRACKCLOSE & cdevsw[major(dev)].d_type) != 0 ||
		    vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return (0);
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);

		devsw_lock(dev, S_IFBLK);

		if (--vp->v_specinfo->si_opencount < 0)
			panic("negative open count (b, %u, %u)", major(dev), minor(dev));

		if (vcount(vp) == 0)
			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return(EBADF);
	}

	return error;
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(struct vnop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 200112;		/* _POSIX_CHOWN_RESTRICTED */
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Special device failed operation
 */
int
spec_ebadf(__unused void *dummy)
{

	return (EBADF);
}

/* Blktooff derives file offset from logical block number */
int
spec_blktooff(struct vnop_blktooff_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_blktooff: not implemented for VBLK\n");
		*ap->a_offset = (off_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_blktooff type");
	}
	/* NOTREACHED */

	return (0);
}

/* Offtoblk derives logical block number from file offset */
int
spec_offtoblk(struct vnop_offtoblk_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VCHR:
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	case VBLK:
		printf("spec_offtoblk: not implemented for VBLK\n");
		*ap->a_lblkno = (daddr64_t)-1;	/* failure */
		return (ENOTSUP);

	default:
		panic("spec_offtoblk type");
	}
	/* NOTREACHED */

	return (0);
}

static void filt_specdetach(struct knote *kn);
static int filt_spec(struct knote *kn, long hint);
static unsigned filt_specpeek(struct knote *kn);

struct filterops spec_filtops = {
	.f_isfd = 1,
	.f_attach = filt_specattach,
	.f_detach = filt_specdetach,
	.f_event = filt_spec,
	.f_peek = filt_specpeek
};

static int
filter_to_seltype(int16_t filter)
{
	switch (filter) {
	case EVFILT_READ:
		return FREAD;
	case EVFILT_WRITE:
		return FWRITE;
	default:
		panic("filt_to_seltype(): invalid filter %d\n", filter);
		return 0;
	}
}

static int
filt_specattach(struct knote *kn)
{
	vnode_t vp;
	dev_t dev;

	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

	if (major(dev) >= nchrdev) {
		return ENXIO;
	}

	if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
		return EINVAL;
	}

	/* Resulting wql is safe to unlink even if it has never been linked */
	kn->kn_hook = wait_queue_link_allocate();
	if (kn->kn_hook == NULL) {
		return EAGAIN;
	}

	kn->kn_fop = &spec_filtops;
	kn->kn_hookid = vnode_vid(vp);

	knote_markstayqueued(kn);

	return 0;
}

static void
filt_specdetach(struct knote *kn)
{
	kern_return_t ret;

	/*
	 * Given wait queue link and wait queue set, unlink. This is subtle.
	 * If the device has been revoked from under us, selclearthread() will
	 * have removed our link from the kqueue's wait queue set, which
	 * wait_queue_set_unlink_one() will detect and handle.
	 */
	ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
	if (ret != KERN_SUCCESS) {
		panic("filt_specdetach(): failed to unlink wait queue link.");
	}

	(void)wait_queue_link_free(kn->kn_hook);
	kn->kn_hook = NULL;
	kn->kn_status &= ~KN_STAYQUEUED;
}

static int
filt_spec(struct knote *kn, long hint)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int selres;
	int error;
	int use_offset;
	dev_t dev;
	uint64_t flags;

	assert(kn->kn_hook != NULL);

	if (hint != 0) {
		panic("filt_spec(): nonzero hint?");
	}

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		return 1;
	}

	dev = vnode_specrdev(vp);
	flags = cdevsw_flags[major(dev)];
	use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
	assert((flags & CDEVSW_SELECT_KQUEUE) != 0);

	/* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	if (use_offset) {
		if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
			kn->kn_data = 0;
		} else {
			kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
		}
	} else {
		kn->kn_data = selres;
	}

	vnode_put(vp);

	return (kn->kn_data != 0);
}

static unsigned
filt_specpeek(struct knote *kn)
{
	vnode_t vp;
	uthread_t uth;
	wait_queue_set_t old_wqs;
	vfs_context_t ctx;
	int error, selres;

	uth = get_bsdthread_info(current_thread());
	ctx = vfs_context_current();
	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

	error = vnode_getwithvid(vp, kn->kn_hookid);
	if (error != 0) {
		return 1; /* Just like VNOP_SELECT() on recycled vnode */
	}

	/*
	 * Why pass the link here? Because we may not have registered in the past...
	 */
	old_wqs = uth->uu_wqset;
	uth->uu_wqset = kn->kn_kq->kq_wqs;
	selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
	uth->uu_wqset = old_wqs;

	vnode_put(vp);
	return selres;
}