bsd/miscfs/specfs/spec_vnops.c

   1 /*
   2  * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993, 1995
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/kauth.h>
  67 #include <sys/systm.h>
  68 #include <sys/kernel.h>
  69 #include <sys/conf.h>
  70 #include <sys/buf_internal.h>
  71 #include <sys/mount_internal.h>
  72 #include <sys/vnode_internal.h>
  73 #include <sys/file_internal.h>
  74 #include <sys/namei.h>
  75 #include <sys/stat.h>
  76 #include <sys/errno.h>
  77 #include <sys/ioctl.h>
  78 #include <sys/file.h>
  79 #include <sys/user.h>
  80 #include <sys/malloc.h>
  81 #include <sys/disk.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/resource.h>
  84 #include <machine/machine_routines.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <vfs/vfs_support.h>
  87
  88 #include <kern/assert.h>
  89 #include <kern/task.h>
  90 #include <kern/sched_prim.h>
  91 #include <kern/thread.h>
  92 #include <kern/policy_internal.h>
  93 #include <kern/timer_call.h>
  94
  95 #include <pexpert/pexpert.h>
  96
  97 #include <sys/kdebug.h>
  98
  99 /* XXX the following prototypes should be in a header file somewhere */
 100 extern dev_t    chrtoblk(dev_t dev);
 101 extern boolean_t        iskmemdev(dev_t dev);
 102 extern int      bpfkqfilter(dev_t dev, struct knote *kn);
 103 extern int      ptsd_kqfilter(dev_t dev, struct knote *kn);
 104
 105 extern int ignore_is_ssd;
 106
     /* SPECHSZ vnode-pointer slots; presumably hash chain heads for device vnodes -- TODO confirm against specdev.h */
 107 struct vnode *speclisth[SPECHSZ];
 108
 109 /* symbolic sleep message strings for devices */
 110 char    devopn[] = "devopn";
 111 char    devio[] = "devio";
 112 char    devwait[] = "devwait";
 113 char    devin[] = "devin";
 114 char    devout[] = "devout";
 115 char    devioc[] = "devioc";
 116 char    devcls[] = "devcls";
 117
     /* Cast applied to every handler stored in the operations table below. */
 118 #define VOPFUNC int (*)(void *)
 119
     /*
      * Vnode operations vector for special (device) files.
      *
      * Each entry pairs an operation descriptor with the routine that services
      * it.  Operations implemented in this file use the spec_* handlers; the
      * remaining ones are routed to err_* or nop_* routines, and any operation
      * not listed falls to vn_default_error through vnop_default_desc.  The
      * table is terminated by an entry whose two members are both NULL.
      */
 120 int (**spec_vnodeop_p)(void *);
 121 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 122         { &vnop_default_desc, (VOPFUNC)vn_default_error },
 123         { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
 124         { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
 125         { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
 126         { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
 127         { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
 128         { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
 129         { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
 130         { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
 131         { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
 132         { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
 133         { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
 134         { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
 135         { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
 136         { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
 137         { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
 138         { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
 139         { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
 140         { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
 141         { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
 142         { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
 143         { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
 144         { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
 145         { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
 146         { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
 147         { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
 148         { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
 149         { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
 150         { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
 151         { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
 152         { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
 153         { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
 154         { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
 155         { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
 156         { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
 157         { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
 158         { (struct vnodeop_desc*)NULL, (int(*)())NULL }
 159 };
     /* Ties the operations table above to the spec_vnodeop_p vector pointer. */
 160 struct vnodeopv_desc spec_vnodeop_opv_desc =
 161         { &spec_vnodeop_p, spec_vnodeop_entries };
 162
 163
 164 static void set_blocksize(vnode_t, dev_t);
 165
     /*
      * Low-priority I/O throttling tunables, in milliseconds, for tiers 1-3:
      * a window length per tier, plus an I/O period per tier with a separate
      * (shorter) set of periods for solid-state devices.
      */
 166 #define LOWPRI_TIER1_WINDOW_MSECS         25
 167 #define LOWPRI_TIER2_WINDOW_MSECS         100
 168 #define LOWPRI_TIER3_WINDOW_MSECS         500
 169
 170 #define LOWPRI_TIER1_IO_PERIOD_MSECS      40
 171 #define LOWPRI_TIER2_IO_PERIOD_MSECS      85
 172 #define LOWPRI_TIER3_IO_PERIOD_MSECS      200
 173
 174 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
 175 #define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
 176 #define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25
 177
 178
     /*
      * Runtime copies of the tunables above.  Each table has
      * THROTTLE_LEVEL_END + 1 slots; slot 0 is 0 and the next three slots hold
      * the tier 1-3 values.  The debug sysctls address these tables with the
      * THROTTLE_LEVEL_TIERn constants.
      */
 179 int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
 180         0,
 181         LOWPRI_TIER1_WINDOW_MSECS,
 182         LOWPRI_TIER2_WINDOW_MSECS,
 183         LOWPRI_TIER3_WINDOW_MSECS,
 184 };
 185
 186 int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
 187         0,
 188         LOWPRI_TIER1_IO_PERIOD_MSECS,
 189         LOWPRI_TIER2_IO_PERIOD_MSECS,
 190         LOWPRI_TIER3_IO_PERIOD_MSECS,
 191 };
 192
 193 int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
 194         0,
 195         LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
 196         LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
 197         LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
 198 };
 199
 200
     /* One counter per throttle level; zero-initialized (no initializer given). */
 201 int     throttled_count[THROTTLE_LEVEL_END + 1];
 202
     /*
      * Per-device I/O throttling state.  spec_read() and spec_write() pick the
      * entry for a character disk with si_devbsdunit as the index into
      * _throttle_io_info[].  Members declared as arrays carry one slot per
      * throttle level (THROTTLE_LEVEL_END + 1 slots).
      */
 203 struct _throttle_io_info_t {
 204         lck_mtx_t       throttle_lock;
 205
 206         struct timeval  throttle_last_write_timestamp;  /* stamped with microuptime() by spec_write() on throttleable disks */
 207         struct timeval  throttle_min_timer_deadline;
 208         struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
 209         struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
 210         pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
 211         struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
 212         int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];
 213
 214         TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];        /* Lists of throttled uthreads */
 215         int             throttle_next_wake_level;
 216
 217         thread_call_t   throttle_timer_call;
 218         int32_t throttle_timer_ref;
 219         int32_t throttle_timer_active;
 220
 221         int32_t throttle_io_count;
 222         int32_t throttle_io_count_begin;
 223         int    *throttle_io_periods;    /* presumably points at one of the throttle_io_period*_msecs tables -- TODO confirm */
 224         uint32_t throttle_io_period_num;
 225
 226         int32_t throttle_refcnt;
 227         int32_t throttle_alloc;
 228         int32_t throttle_disabled;
 229         int32_t throttle_is_fusion_with_priority;
 230 };
 231
     /* Statically allocated throttle state, LOWPRI_MAX_NUM_DEV entries. */
 232 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
 233
 234
     /* Initialized to 1; NOTE(review): presumably a global on/off switch for low-priority throttling -- confirm at the use sites. */
 235 int     lowpri_throttle_enabled = 1;
 236
 237
 238 static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
 239 static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
 240 static int throttle_get_thread_throttle_level(uthread_t ut);
 241 static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
 242
 243 /*
 244  * Trivial lookup routine that always fails.
 245  */
 246 int
 247 spec_lookup(struct vnop_lookup_args *ap)
 248 {
 249
 250         *ap->a_vpp = NULL;
 251         return (ENOTDIR);
 252 }
 253
 254 static void
 255 set_blocksize(struct vnode *vp, dev_t dev)
 256 {
 257     int (*size)(dev_t);
 258     int rsize;
 259
 260     if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
 261         rsize = (*size)(dev);
 262         if (rsize <= 0)        /* did size fail? */
 263             vp->v_specsize = DEV_BSIZE;
 264         else
 265             vp->v_specsize = rsize;
 266     }
 267     else
 268             vp->v_specsize = DEV_BSIZE;
 269 }
 270
 271 void
 272 set_fsblocksize(struct vnode *vp)
 273 {
 274
 275         if (vp->v_type == VBLK) {
 276                 dev_t dev = (dev_t)vp->v_rdev;
 277                 int maj = major(dev);
 278
 279                 if ((u_int)maj >= (u_int)nblkdev)
 280                         return;
 281
 282                 vnode_lock(vp);
 283                 set_blocksize(vp, dev);
 284                 vnode_unlock(vp);
 285         }
 286
 287 }
 288
 289
 290 /*
 291  * Open a special file.
 292  */
 293 int
 294 spec_open(struct vnop_open_args *ap)
 295 {
 296         struct proc *p = vfs_context_proc(ap->a_context);
 297         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
 298         struct vnode *vp = ap->a_vp;
 299         dev_t bdev, dev = (dev_t)vp->v_rdev;
 300         int maj = major(dev);
 301         int error;
 302
 303         /*
 304          * Don't allow open if fs is mounted -nodev.
 305          */
 306         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
 307                 return (ENXIO);
 308
 309         switch (vp->v_type) {
 310
 311         case VCHR:
 312                 if ((u_int)maj >= (u_int)nchrdev)
 313                         return (ENXIO);
 314                 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
 315                         /*
 316                          * When running in very secure mode, do not allow
 317                          * opens for writing of any disk character devices.
 318                          */
 319                         if (securelevel >= 2 && isdisk(dev, VCHR))
 320                                 return (EPERM);
 321
 322                         /* Never allow writing to /dev/mem or /dev/kmem */
 323                         if (iskmemdev(dev))
 324                                 return (EPERM);
 325                         /*
 326                          * When running in secure mode, do not allow opens for
 327                          * writing of character devices whose corresponding block
 328                          * devices are currently mounted.
 329                          */
 330                         if (securelevel >= 1) {
 331                                 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
 332                                         return (error);
 333                         }
 334                 }
 335
 336                 devsw_lock(dev, S_IFCHR);
 337                 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
 338
 339                 if (error == 0) {
 340                         vp->v_specinfo->si_opencount++;
 341                 }
 342
 343                 devsw_unlock(dev, S_IFCHR);
 344
 345                 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
 346                         int     isssd = 0;
 347                         uint64_t throttle_mask = 0;
 348                         uint32_t devbsdunit = 0;
 349
 350                         if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
 351
 352                                 if (throttle_mask != 0 &&
 353                                     VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
 354                                         /*
 355                                          * as a reasonable approximation, only use the lowest bit of the mask
 356                                          * to generate a disk unit number
 357                                          */
 358                                         devbsdunit = num_trailing_0(throttle_mask);
 359
 360                                         vnode_lock(vp);
 361
 362                                         vp->v_un.vu_specinfo->si_isssd = isssd;
 363                                         vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
 364                                         vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
 365                                         vp->v_un.vu_specinfo->si_throttleable = 1;
 366                                         vp->v_un.vu_specinfo->si_initted = 1;
 367
 368                                         vnode_unlock(vp);
 369                                 }
 370                         }
 371                         if (vp->v_un.vu_specinfo->si_initted == 0) {
 372                                 vnode_lock(vp);
 373                                 vp->v_un.vu_specinfo->si_initted = 1;
 374                                 vnode_unlock(vp);
 375                         }
 376                 }
 377                 return (error);
 378
 379         case VBLK:
 380                 if ((u_int)maj >= (u_int)nblkdev)
 381                         return (ENXIO);
 382                 /*
 383                  * When running in very secure mode, do not allow
 384                  * opens for writing of any disk block devices.
 385                  */
 386                 if (securelevel >= 2 && cred != FSCRED &&
 387                     (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
 388                         return (EPERM);
 389                 /*
 390                  * Do not allow opens of block devices that are
 391                  * currently mounted.
 392                  */
 393                 if ( (error = vfs_mountedon(vp)) )
 394                         return (error);
 395
 396                 devsw_lock(dev, S_IFBLK);
 397                 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
 398                 if (!error) {
 399                         vp->v_specinfo->si_opencount++;
 400                 }
 401                 devsw_unlock(dev, S_IFBLK);
 402
 403                 if (!error) {
 404                     u_int64_t blkcnt;
 405                     u_int32_t blksize;
 406                         int setsize = 0;
 407                         u_int32_t size512 = 512;
 408
 409
 410                     if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
 411                                 /* Switch to 512 byte sectors (temporarily) */
 412
 413                                 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
 414                                 /* Get the number of 512 byte physical blocks. */
 415                                 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
 416                                                 setsize = 1;
 417                                 }
 418                                 }
 419                                 /* If it doesn't set back, we can't recover */
 420                                 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
 421                                 error = ENXIO;
 422                     }
 423
 424
 425                         vnode_lock(vp);
 426                     set_blocksize(vp, dev);
 427
 428                     /*
 429                      * Cache the size in bytes of the block device for later
 430                      * use by spec_write().
 431                      */
 432                         if (setsize)
 433                                 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
 434                         else
 435                         vp->v_specdevsize = (u_int64_t)0;       /* Default: Can't get */
 436
 437                         vnode_unlock(vp);
 438
 439                 }
 440                 return(error);
 441         default:
 442                 panic("spec_open type");
 443         }
 444         return (0);
 445 }
 446
 447 /*
 448  * Vnode op for read
 449  */
 450 int
 451 spec_read(struct vnop_read_args *ap)
 452 {
 453         struct vnode *vp = ap->a_vp;
 454         struct uio *uio = ap->a_uio;
 455         struct buf *bp;
 456         daddr64_t bn, nextbn;
 457         long bsize, bscale;
 458         int devBlockSize=0;
 459         int n, on;
 460         int error = 0;
 461         dev_t dev;
 462
 463 #if DIAGNOSTIC
 464         if (uio->uio_rw != UIO_READ)
 465                 panic("spec_read mode");
 466         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 467                 panic("spec_read proc");
 468 #endif
 469         if (uio_resid(uio) == 0)
 470                 return (0);
 471
 472         switch (vp->v_type) {
 473
 474         case VCHR:
 475                 {
 476                         struct _throttle_io_info_t *throttle_info = NULL;
 477                         int thread_throttle_level;
 478                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 479                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 480                                 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 481                 }
 482                 error = (*cdevsw[major(vp->v_rdev)].d_read)
 483                         (vp->v_rdev, uio, ap->a_ioflag);
 484
 485                         if (throttle_info) {
 486                                 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 487                         }
 488
 489                 return (error);
 490                 }
 491
 492         case VBLK:
 493                 if (uio->uio_offset < 0)
 494                         return (EINVAL);
 495
 496                 dev = vp->v_rdev;
 497
 498                 devBlockSize = vp->v_specsize;
 499
 500                 if (devBlockSize > PAGE_SIZE)
 501                         return (EINVAL);
 502
 503                 bscale = PAGE_SIZE / devBlockSize;
 504                 bsize = bscale * devBlockSize;
 505
 506                 do {
 507                         on = uio->uio_offset % bsize;
 508
 509                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
 510
 511                         if (vp->v_speclastr + bscale == bn) {
 512                                 nextbn = bn + bscale;
 513                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
 514                                                (int *)&bsize, 1, NOCRED, &bp);
 515                         } else
 516                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
 517
 518                         vnode_lock(vp);
 519                         vp->v_speclastr = bn;
 520                         vnode_unlock(vp);
 521
 522                         n = bsize - buf_resid(bp);
 523                         if ((on > n) || error) {
 524                                 if (!error)
 525                                         error = EINVAL;
 526                                 buf_brelse(bp);
 527                                 return (error);
 528                         }
 529                         n = min((unsigned)(n  - on), uio_resid(uio));
 530
 531                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 532                         if (n + on == bsize)
 533                                 buf_markaged(bp);
 534                         buf_brelse(bp);
 535                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 536                 return (error);
 537
 538         default:
 539                 panic("spec_read type");
 540         }
 541         /* NOTREACHED */
 542
 543         return (0);
 544 }
 545
 546 /*
 547  * Vnode op for write
 548  */
 549 int
 550 spec_write(struct vnop_write_args *ap)
 551 {
 552         struct vnode *vp = ap->a_vp;
 553         struct uio *uio = ap->a_uio;
 554         struct buf *bp;
 555         daddr64_t bn;
 556         int bsize, blkmask, bscale;
 557         int io_sync;
 558         int devBlockSize=0;
 559         int n, on;
 560         int error = 0;
 561         dev_t dev;
 562
 563 #if DIAGNOSTIC
 564         if (uio->uio_rw != UIO_WRITE)
 565                 panic("spec_write mode");
 566         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 567                 panic("spec_write proc");
 568 #endif
 569
 570         switch (vp->v_type) {
 571
 572         case VCHR:
 573                 {
 574                         struct _throttle_io_info_t *throttle_info = NULL;
 575                         int thread_throttle_level;
 576                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 577                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 578
 579                                 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 580
 581                         microuptime(&throttle_info->throttle_last_write_timestamp);
 582                 }
 583                 error = (*cdevsw[major(vp->v_rdev)].d_write)
 584                         (vp->v_rdev, uio, ap->a_ioflag);
 585
 586                         if (throttle_info) {
 587                                 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 588                         }
 589
 590                 return (error);
 591                 }
 592
 593         case VBLK:
 594                 if (uio_resid(uio) == 0)
 595                         return (0);
 596                 if (uio->uio_offset < 0)
 597                         return (EINVAL);
 598
 599                 io_sync = (ap->a_ioflag & IO_SYNC);
 600
 601                 dev = (vp->v_rdev);
 602
 603                 devBlockSize = vp->v_specsize;
 604                 if (devBlockSize > PAGE_SIZE)
 605                         return(EINVAL);
 606
 607                 bscale = PAGE_SIZE / devBlockSize;
 608                 blkmask = bscale - 1;
 609                 bsize = bscale * devBlockSize;
 610
 611
 612                 do {
 613                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
 614                         on = uio->uio_offset % bsize;
 615
 616                         n = min((unsigned)(bsize - on), uio_resid(uio));
 617
 618                         /*
 619                          * Use buf_getblk() as an optimization IFF:
 620                          *
 621                          * 1)   We are reading exactly a block on a block
 622                          *      aligned boundary
 623                          * 2)   We know the size of the device from spec_open
 624                          * 3)   The read doesn't span the end of the device
 625                          *
 626                          * Otherwise, we fall back on buf_bread().
 627                          */
 628                         if (n == bsize &&
 629                             vp->v_specdevsize != (u_int64_t)0 &&
 630                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
 631                             /* reduce the size of the read to what is there */
 632                             n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
 633                         }
 634
 635                         if (n == bsize)
 636                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
 637                         else
 638                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
 639
 640                         /* Translate downstream error for upstream, if needed */
 641                         if (!error)
 642                                 error = (int)buf_error(bp);
 643                         if (error) {
 644                                 buf_brelse(bp);
 645                                 return (error);
 646                         }
 647                         n = min(n, bsize - buf_resid(bp));
 648
 649                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 650                         if (error) {
 651                                 buf_brelse(bp);
 652                                 return (error);
 653                         }
 654                         buf_markaged(bp);
 655
 656                         if (io_sync)
 657                                 error = buf_bwrite(bp);
 658                         else {
 659                                 if ((n + on) == bsize)
 660                                         error = buf_bawrite(bp);
 661                                 else
 662                                         error = buf_bdwrite(bp);
 663                         }
 664                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 665                 return (error);
 666
 667         default:
 668                 panic("spec_write type");
 669         }
 670         /* NOTREACHED */
 671
 672         return (0);
 673 }
 674
 675 /*
 676  * Device ioctl operation.
 677  */
 678 int
 679 spec_ioctl(struct vnop_ioctl_args *ap)
 680 {
 681         proc_t p = vfs_context_proc(ap->a_context);
 682         dev_t dev = ap->a_vp->v_rdev;
 683         int     retval = 0;
 684
 685         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
 686                 dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
 687
 688         switch (ap->a_vp->v_type) {
 689
 690         case VCHR:
 691                 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 692                                                        ap->a_fflag, p);
 693                 break;
 694
 695         case VBLK:
 696                 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
 697                 if (!retval && ap->a_command == DKIOCSETBLOCKSIZE)
 698                         ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
 699                 break;
 700
 701         default:
 702                 panic("spec_ioctl");
 703                 /* NOTREACHED */
 704         }
 705         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
 706                 dev, ap->a_command, ap->a_fflag, retval, 0);
 707
 708         return (retval);
 709 }
 710
 711 int
 712 spec_select(struct vnop_select_args *ap)
 713 {
 714         proc_t p = vfs_context_proc(ap->a_context);
 715         dev_t dev;
 716
 717         switch (ap->a_vp->v_type) {
 718
 719         default:
 720                 return (1);             /* XXX */
 721
 722         case VCHR:
 723                 dev = ap->a_vp->v_rdev;
 724                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
 725         }
 726 }
 727
 728 static int filt_specattach(struct knote *kn);
 729 /* Attach knote kn to the character device behind vp; returns the bpf or the generic attach result. */
 730 int
 731 spec_kqfilter(vnode_t vp, struct knote *kn)
 732 {
 733         dev_t dev;
 734
 735         assert(vnode_ischr(vp));
 736
 737         dev = vnode_specrdev(vp);
 738
 739 #if NETWORKING
 740         /*
 741          * Try a bpf device, as defined in bsd/net/bpf.c
 742          * If it doesn't error out the attach, then it
 743          * claimed it. Otherwise, fall through and try
 744          * a regular spec attach.
 745          */
 746         int32_t tmp_flags = kn->kn_flags;
 747         int64_t tmp_data = kn->kn_data;
 748         int res;
 749         /* kn_flags/kn_data were saved above so that a failed bpf attempt can be undone below */
 750         res = bpfkqfilter(dev, kn);
 751         if ((kn->kn_flags & EV_ERROR) == 0) {
 752                 return res;
 753         }
 754         kn->kn_flags = tmp_flags;
 755         kn->kn_data = tmp_data;
 756 #endif
 757
 758         /* Try to attach to other char special devices */
 759         return filt_specattach(kn);
 760 }
 761
 762 /*
 763  * Push the dirty buffers of a block device out; character devices are a no-op.
 764  */
 765 int
 766 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
 767 {
 768         if (vp->v_type != VCHR) {
 769                 /* second argument is non-zero only for MNT_WAIT / MNT_DWAIT requests */
 770                 int wait = (waitfor == MNT_WAIT || waitfor == MNT_DWAIT);
 771
 772                 buf_flushdirtyblks(vp, wait, 0, "spec_fsync");
 773         }
 774
 775         return (0);
 776 }
 777
 778 int
 779 spec_fsync(struct vnop_fsync_args *ap)
 780 {
 781         /* VNOP wrapper: unpack the argument block and delegate to the worker. */
 782         return (spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context));
 783 }
 784
 785 /*
 786  * Just call the device strategy routine
 787  */
 788 void throttle_init(void);
 789 /* NOTE(review): the comment above does not describe throttle_init(); it looks stale -- confirm */
 790 /* NOTE(review): the disabled macro below tests ->alloc, other code here uses ->throttle_alloc -- confirm before enabling */
 791 #if 0
 792 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
 793         do {                                                    \
 794                if ((debug_info)->alloc)                           \
 795                printf("%s: "format, __FUNCTION__, ## args);     \
 796        } while(0)
 797
 798 #else
 799 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
 800 #endif
 801
 802 /* Tunables under the _debug sysctl node: per-tier windows, I/O periods (default and SSD tables), global enable flag. */
 803 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 804 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 805 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 806
 807 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 808 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 809 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 810
 811 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 812 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 813 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 814
 815 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
 816
 817 /* Lock group and attributes used for every throttle_lock; allocated in throttle_init(). */
 818 static lck_grp_t        *throttle_lock_grp;
 819 static lck_attr_t       *throttle_lock_attr;
 820 static lck_grp_attr_t   *throttle_lock_grp_attr;
 821
 822
 823 /*
 824  * Throttled I/O helper: number of zero bits below the lowest set bit of n
 825  * (callers use it as a device index); an all-zero n yields the bit width.
 826  */
 827 int
 828 num_trailing_0(uint64_t n)
 829 {
 830         int zeros;
 831
 832         /* nothing set at all: report every bit position as a trailing zero */
 833         if (n == 0)
 834                 return sizeof(n) * 8;
 835         /*
 836          * the lowest set bit is normally near the bottom of the mask,
 837          * so a plain linear scan upward from bit 0 is cheap enough
 838          */
 839         for (zeros = 0; !ISSET(n, 1); zeros++)
 840                 n >>= 1;
 841         return zeros;
 842 }
 843
 844
 845 /*
 846  * Release a reference on info.  The caller that drops the last reference
 847  * to an info marked throttle_alloc destroys its lock and frees it; an info
 848  * without throttle_alloc is never freed here.
 849  * This routine always returns the old value (the count before the release).
 850  */
 851 static int
 852 throttle_info_rel(struct _throttle_io_info_t *info)
 853 {
 854         SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
 855
 856         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 857                 info, (int)(oldValue -1), info );
 858
 859         /* The reference count just went negative, very bad */
 860         if (oldValue == 0)
 861                 panic("throttle info ref cnt went negative!");
 862
 863         /*
 864          * Only the caller whose decrement took the count from 1 to 0 may free.
 865          * Re-reading throttle_refcnt here could touch freed memory or free twice.
 866          */
 867         if ((oldValue == 1) && (info->throttle_alloc)) {
 868                 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
 869
 870                 lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
 871                 FREE(info, M_TEMP);
 872         }
 873         return oldValue;
 874 }
 875
 876
 877 /*
 878  * Take one more reference on the throttle info structure; panics if an
 879  * info marked throttle_alloc is referenced while its count is still zero.
 880  * This routine always returns the old value (the count before the call).
 881  */
 882 static SInt32
 883 throttle_info_ref(struct _throttle_io_info_t *info)
 884 {
 885         SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
 886         /* the count after an increment is oldValue + 1; trace that value */
 887         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 888                 info, (int)(oldValue + 1), info );
 889         /* Allocated items should never have a reference of zero */
 890         if (info->throttle_alloc && (oldValue == 0))
 891                 panic("Taking a reference without calling create throttle info!\n");
 892
 893         return oldValue;
 894 }
 895
 896 /*
 897  * (Re)arm the throttle timer for info; on entry the throttle_lock is held...
 898  * this function is responsible for taking the timer's reference on the
 899  * info structure (throttle_timer gives it back), which will keep it from
 900  * going away while the timer is running if it happens to have been
 901  * dynamically allocated by a network filesystem kext which is now trying
 902  * to free it.  update_io_count == TRUE opens a new I/O period stamped "now".
 903  * Returns the first level whose window (or in-flight I/O) still throttles a
 904  * tier with waiting threads, or THROTTLE_LEVEL_END when none does.
 905  */
 906 static uint32_t
 907 throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
 908 {
 909         struct timeval  elapsed;
 910         struct timeval  now;
 911         struct timeval  period;
 912         uint64_t        elapsed_msecs;
 913         int             throttle_level;
 914         int             level;
 915         int             msecs;
 916         boolean_t       throttled = FALSE;
 917         boolean_t       need_timer = FALSE;
 918
 919         microuptime(&now);
 920
 921         if (update_io_count == TRUE) {
 922                 info->throttle_io_count_begin = info->throttle_io_count;
 923                 info->throttle_io_period_num++;
 924                 /* restart the I/O period of every level from wakelevel down to THROTTLE_LEVEL_THROTTLED */
 925                 while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
 926                         info->throttle_start_IO_period_timestamp[wakelevel--] = now;
 927
 928                 info->throttle_min_timer_deadline = now;
 929
 930                 msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
 931                 period.tv_sec = msecs / 1000;
 932                 period.tv_usec = (msecs % 1000) * 1000;
 933
 934                 timevaladd(&info->throttle_min_timer_deadline, &period);
 935         }
 936         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
 937                 /* msecs elapsed since this level's window start timestamp */
 938                 elapsed = now;
 939                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
 940                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
 941
 942                 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
 943
 944                         if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
 945
 946                                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
 947                                         /*
 948                                          * we had an I/O occur at a higher priority tier within
 949                                          * this tier's throttle window
 950                                          */
 951                                         throttled = TRUE;
 952                                 }
 953                                 /*
 954                                  * we assume that the windows are the same or longer
 955                                  * as we drop through the throttling tiers...  thus
 956                                  * we can stop looking once we run into a tier with
 957                                  * threads to schedule regardless of whether it's
 958                                  * still in its throttling window or not
 959                                  */
 960                                 break;
 961                         }
 962                 }
 963                 if (throttled == TRUE)
 964                         break;
 965         }
 966         if (throttled == TRUE) {
 967                 uint64_t        deadline = 0;
 968                 struct timeval  target;
 969                 struct timeval  min_target;
 970
 971                 /*
 972                  * we've got at least one tier still in a throttled window
 973                  * so we need a timer running... compute the next deadline
 974                  * and schedule it
 975                  */
 976                 for (level = throttle_level+1; level <= THROTTLE_LEVEL_END; level++) {
 977
 978                         if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
 979                                 continue;
 980
 981                         target = info->throttle_start_IO_period_timestamp[level];
 982
 983                         msecs = info->throttle_io_periods[level];
 984                         period.tv_sec = msecs / 1000;
 985                         period.tv_usec = (msecs % 1000) * 1000;
 986
 987                         timevaladd(&target, &period);
 988
 989                         if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
 990                                 min_target = target;
 991                                 need_timer = TRUE;
 992                         }
 993                 }
 994                 if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
 995                         if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
 996                                 min_target = info->throttle_min_timer_deadline;
 997                 }
 998                 /* the check above keeps the target from preceding throttle_min_timer_deadline while that is still ahead of now */
 999                 if (info->throttle_timer_active) {
1000                         if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1001                                 /*
1002                                  * couldn't kill the timer because it's already
1003                                  * been dispatched, so don't try to start a new
1004                                  * one... once we drop the lock, the timer will
1005                                  * proceed and eventually re-run this function
1006                                  */
1007                                 need_timer = FALSE;
1008                         } else
1009                                 info->throttle_timer_active = 0;
1010                 }
1011                 if (need_timer == TRUE) {
1012                         /*
1013                          * This is defined as an int (32-bit) rather than a 64-bit
1014                          * value because it would need a really big period in the
1015                          * order of ~500 days to overflow this. So, we let this be
1016                          * 32-bit which allows us to use the clock_interval_to_deadline()
1017                          * routine.
1018                          */
1019                         int     target_msecs;
1020
1021                         if (info->throttle_timer_ref == 0) {
1022                                 /*
1023                                  * take a reference for the timer
1024                                  */
1025                                 throttle_info_ref(info);
1026
1027                                 info->throttle_timer_ref = 1;
1028                         }
1029                         elapsed = min_target;
1030                         timevalsub(&elapsed, &now);
1031                         target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
1032
1033                         if (target_msecs <= 0) {
1034                                 /*
1035                                  * we may have computed a deadline slightly in the past
1036                                  * due to various factors... if so, just set the timer
1037                                  * to go off in the near future (we don't need to be precise)
1038                                  */
1039                                 target_msecs = 1;
1040                         }
1041                         clock_interval_to_deadline(target_msecs, 1000000, &deadline);
1042
1043                         thread_call_enter_delayed(info->throttle_timer_call, deadline);
1044                         info->throttle_timer_active = 1;
1045                 }
1046         }
1047         return (throttle_level);
1048 }
1049 /* Thread-call handler for info: wakes throttled threads whose I/O period has expired, */
1050 /* re-arms the timer via throttle_timer_start(), and drops the timer's reference once no timer is active. */
1051 static void
1052 throttle_timer(struct _throttle_io_info_t *info)
1053 {
1054         uthread_t       ut, utlist;
1055         struct timeval  elapsed;
1056         struct timeval  now;
1057         uint64_t        elapsed_msecs;
1058         int             throttle_level;
1059         int             level;
1060         int             wake_level;
1061         caddr_t         wake_address = NULL;
1062         boolean_t       update_io_count = FALSE;
1063         boolean_t       need_wakeup = FALSE;
1064         boolean_t       need_release = FALSE;
1065
1066         ut = NULL;
1067         lck_mtx_lock(&info->throttle_lock);
1068
1069         info->throttle_timer_active = 0;
1070         microuptime(&now);
1071
1072         elapsed = now;
1073         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
1074         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1075         /* look for a thread to release only once the THROTTLE_LEVEL_THROTTLED I/O period has run out */
1076         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
1077
1078                 wake_level = info->throttle_next_wake_level;
1079                 /* walk the levels downward from throttle_next_wake_level, wrapping from START back to END */
1080                 for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
1081
1082                         elapsed = now;
1083                         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
1084                         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1085
1086                         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1087                                 /*
1088                                  * we're closing out the current IO period...
1089                                  * if we have a waiting thread, wake it up
1090                                  * after we have reset the I/O window info
1091                                  */
1092                                 need_wakeup = TRUE;
1093                                 update_io_count = TRUE;
1094
1095                                 info->throttle_next_wake_level = wake_level - 1;
1096
1097                                 if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
1098                                         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1099
1100                                 break;
1101                         }
1102                         wake_level--;
1103
1104                         if (wake_level == THROTTLE_LEVEL_START)
1105                                 wake_level = THROTTLE_LEVEL_END;
1106                 }
1107         }
1108         if (need_wakeup == TRUE) {
1109                 if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1110
1111                         ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
1112                         TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
1113                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1114                         ut->uu_is_throttled = FALSE;
1115
1116                         wake_address = (caddr_t)&ut->uu_on_throttlelist;
1117                 }
1118         } else
1119                 wake_level = THROTTLE_LEVEL_START;
1120         /* re-evaluate the timer before waking anyone; the returned level bounds the wake-all loop below */
1121         throttle_level = throttle_timer_start(info, update_io_count, wake_level);
1122
1123         if (wake_address != NULL)
1124                 wakeup(wake_address);
1125
1126         for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
1127
1128                 TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
1129
1130                         TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
1131                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1132                         ut->uu_is_throttled = FALSE;
1133
1134                         wakeup(&ut->uu_on_throttlelist);
1135                 }
1136         }
1137         if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
1138                 info->throttle_timer_ref = 0;
1139                 need_release = TRUE;
1140         }
1141         lck_mtx_unlock(&info->throttle_lock);
1142         /* the timer's reference is dropped only after the lock is released; throttle_info_rel() may free info */
1143         if (need_release == TRUE)
1144                 throttle_info_rel(info);
1145 }
1146 /* Queue ut on info's wait list for mylevel (tail or head per insert_tail); the timer is re-evaluated if that list was empty. */
1147 /* Returns THROTTLE_LEVEL_START, or throttle_timer_start()'s level when it ran; on THROTTLE_LEVEL_END ut is taken back off the list. */
1148 static int
1149 throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
1150 {
1151         boolean_t start_timer = FALSE;
1152         int level = THROTTLE_LEVEL_START;
1153
1154         if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1155                 info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1156                 start_timer = TRUE;
1157         }
1158
1159         if (insert_tail == TRUE)
1160                 TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1161         else
1162                 TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1163
1164         ut->uu_on_throttlelist = mylevel;
1165         /* start_timer is TRUE only when the list was empty before this insert */
1166         if (start_timer == TRUE) {
1167                 /* we may need to start or rearm the timer */
1168                 level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1169
1170                 if (level == THROTTLE_LEVEL_END) {
1171                         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1172                                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1173
1174                                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1175                         }
1176                 }
1177         }
1178         return (level);
1179 }
1180
1181 static void
1182 throttle_init_throttle_window(void)
1183 {
1184         /* per-tier lookup keys, listed in the order they are consulted */
1185         static const struct {
1186                 int             tier;
1187                 const char      *dt_prop;
1188                 const char      *boot_arg;
1189         } window_keys[] = {
1190                 { THROTTLE_LEVEL_TIER1, "kern.io_throttle_window_tier1", "io_throttle_window_tier1" },
1191                 { THROTTLE_LEVEL_TIER2, "kern.io_throttle_window_tier2", "io_throttle_window_tier2" },
1192                 { THROTTLE_LEVEL_TIER3, "kern.io_throttle_window_tier3", "io_throttle_window_tier3" },
1193         };
1194         const unsigned int nkeys = sizeof(window_keys) / sizeof(window_keys[0]);
1195         unsigned int idx;
1196         int msecs;
1197
1198         /*
1199          * Precedence, lowest to highest: global defaults, device tree
1200          * properties, boot-args.  All values are specified in msecs.
1201          */
1202
1203         /* device-tree properties replace the global defaults */
1204         for (idx = 0; idx < nkeys; idx++) {
1205                 if (PE_get_default(window_keys[idx].dt_prop, &msecs, sizeof(msecs)))
1206                         throttle_windows_msecs[window_keys[idx].tier] = msecs;
1207         }
1208         /* boot-args replace whatever is in effect so far */
1209         for (idx = 0; idx < nkeys; idx++) {
1210                 if (PE_parse_boot_argn(window_keys[idx].boot_arg, &msecs, sizeof(msecs)))
1211                         throttle_windows_msecs[window_keys[idx].tier] = msecs;
1212         }
1213 }
1214
1215 static void
1216 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1217 {
1218         /* per-tier lookup keys, listed in the order they are consulted */
1219         static const struct {
1220                 int             tier;
1221                 const char      *dt_prop;
1222                 const char      *boot_arg;
1223         } period_keys[] = {
1224                 { THROTTLE_LEVEL_TIER1, "kern.io_throttle_period_tier1", "io_throttle_period_tier1" },
1225                 { THROTTLE_LEVEL_TIER2, "kern.io_throttle_period_tier2", "io_throttle_period_tier2" },
1226                 { THROTTLE_LEVEL_TIER3, "kern.io_throttle_period_tier3", "io_throttle_period_tier3" },
1227         };
1228         const unsigned int nkeys = sizeof(period_keys) / sizeof(period_keys[0]);
1229         unsigned int idx;
1230         int msecs;
1231
1232         /*
1233          * Precedence, lowest to highest: global defaults, device tree
1234          * properties, boot-args.  All values are specified in msecs.
1235          *
1236          * Start from the global defaults.  throttle_io_periods points straight
1237          * at the chosen global table, so the overrides below are stored in it.
1238          */
1239         info->throttle_io_periods =
1240             ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0))
1241                 ? &throttle_io_period_ssd_msecs[0]
1242                 : &throttle_io_period_msecs[0];
1243
1244         /* device-tree properties replace the global defaults */
1245         for (idx = 0; idx < nkeys; idx++) {
1246                 if (PE_get_default(period_keys[idx].dt_prop, &msecs, sizeof(msecs)))
1247                         info->throttle_io_periods[period_keys[idx].tier] = msecs;
1248         }
1249         /* boot-args replace whatever is in effect so far */
1250         for (idx = 0; idx < nkeys; idx++) {
1251                 if (PE_parse_boot_argn(period_keys[idx].boot_arg, &msecs, sizeof(msecs)))
1252                         info->throttle_io_periods[period_keys[idx].tier] = msecs;
1253         }
1254 }
1255
1256 #if CONFIG_IOSCHED
1257 extern  void vm_io_reprioritize_init(void);
1258 int     iosched_enabled = 1;
1259 #endif
1260 /* Set up the throttle lock group/attributes and every static _throttle_io_info slot (lock, timer call, wait lists). */
1261 void
1262 throttle_init(void)
1263 {
1264         struct _throttle_io_info_t *info;
1265         int     i;
1266         int     level;
1267 #if CONFIG_IOSCHED
1268         int     iosched;
1269 #endif
1270         /*
1271          * allocate lock group attribute and group
1272          */
1273         throttle_lock_grp_attr = lck_grp_attr_alloc_init();
1274         throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
1275
1276         /* Update throttle parameters based on device tree configuration */
1277         throttle_init_throttle_window();
1278
1279         /*
1280          * allocate the lock attribute
1281          */
1282         throttle_lock_attr = lck_attr_alloc_init();
1283
1284         for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1285                 info = &_throttle_io_info[i];
1286                 /* one mutex and one timer call per slot; the slot itself is the argument passed to throttle_timer */
1287                 lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1288                 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1289
1290                 for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1291                         TAILQ_INIT(&info->throttle_uthlist[level]);
1292                         info->throttle_last_IO_pid[level] = 0;
1293                         info->throttle_inflight_count[level] = 0;
1294                 }
1295                 info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1296                 info->throttle_disabled = 0;
1297                 info->throttle_is_fusion_with_priority = 0;
1298         }
1299 #if CONFIG_IOSCHED
1300         if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1301                 iosched_enabled = iosched;
1302         }
1303         if (iosched_enabled) {
1304                 /* Initialize I/O Reprioritization mechanism */
1305                 vm_io_reprioritize_init();
1306         }
1307 #endif
1308 }
1309
1310 void
1311 sys_override_io_throttle(int flag)
1312 {
1313         switch (flag) {
1314         case THROTTLE_IO_ENABLE:  lowpri_throttle_enabled = 1; break;
1315         case THROTTLE_IO_DISABLE: lowpri_throttle_enabled = 0; break;
1316         default:                  break;  /* unrecognised: leave the setting alone */
1317         }
1318 }
1319
1320 int rethrottle_wakeups = 0;
1321 /* rethrottle_wakeups counts the wakeups issued by rethrottle_thread() below */
1322 /*
1323  * the uu_rethrottle_lock is used to synchronize this function
1324  * with "throttle_lowpri_io" which is where a throttled thread
1325  * will block... that function will grab this lock before beginning
1326  * its decision making process concerning the need to block, and
1327  * hold it through the assert_wait.  When that thread is awakened
1328  * for any reason (timer or rethrottle), it will reacquire the
1329  * uu_rethrottle_lock before determining if it really is ok for
1330  * it to now run.  This is the point at which the thread could
1331  * enter a different throttling queue and reblock or return from
1332  * the throttle w/o having waited out its entire throttle if
1333  * the rethrottle has now moved it out of any currently
1334  * active throttle window.
1335  *
1336  *
1337  * NOTES:
1338  * 1 - This may be called with the task lock held.
1339  * 2 - This may be called with preemption and interrupts disabled
1340  *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1341  * 3 - This cannot safely dereference uu_throttle_info, as it may
1342  *     get deallocated out from under us
1343  */
1344
1345 void
1346 rethrottle_thread(uthread_t ut)
1347 {
1348         /*
1349          * If uthread doesn't have throttle state, then there's no chance
1350          * of it needing a rethrottle.
1351          */
1352         if (ut->uu_throttle_info == NULL)
1353                 return;
1354         /* interrupts stay disabled for as long as the spin lock is held; the previous state is restored at the end */
1355         boolean_t s = ml_set_interrupts_enabled(FALSE);
1356         lck_spin_lock(&ut->uu_rethrottle_lock);
1357
1358         if (ut->uu_is_throttled == FALSE)
1359                 ut->uu_was_rethrottled = TRUE;
1360         else {
1361                 int my_new_level = throttle_get_thread_throttle_level(ut);
1362
1363                 if (my_new_level != ut->uu_on_throttlelist) {
1364                         /*
1365                          * ut is currently blocked (as indicated by
1366                          * ut->uu_is_throttled == TRUE)
1367                          * and we're changing its throttle level, so
1368                          * we need to wake it up.
1369                          */
1370                         ut->uu_is_throttled = FALSE;
1371                         wakeup(&ut->uu_on_throttlelist);
1372
1373                         rethrottle_wakeups++;
1374                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0);
1375                 }
1376         }
1377         lck_spin_unlock(&ut->uu_rethrottle_lock);
1378         ml_set_interrupts_enabled(s);
1379 }
1380
1381
1382 /*
1383  * KPI routine
1384  *
1385  * Create and take a reference on a throttle info structure and return a
1386  * pointer for the file system to use when calling throttle_info_update.
1387  * Calling file system must have a matching release for every create.
1388  */
1389 void *
1390 throttle_info_create(void)
1391 {
1392         struct _throttle_io_info_t *info;
1393         int     level;
1394         /* M_ZERO: every field not set explicitly below starts out as zero */
1395         MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
1396         /* Should never happen but just in case */
1397         if (info == NULL)
1398                 return NULL;
1399         /* Mark that this one was allocated and needs to be freed */
1400         DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1401         info->throttle_alloc = TRUE;
1402
1403         lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1404         info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1405
1406         for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1407                 TAILQ_INIT(&info->throttle_uthlist[level]);
1408         }
1409         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1410
1411         /* Take a reference */
1412         OSIncrementAtomic(&info->throttle_refcnt);
1413         return info;
1414 }
1415
1416 /*
1417  * KPI routine
1418  *
1419  * Drop the reference obtained from throttle_info_create(); the structure is
1420  * freed once its last reference goes away.
1421  */
1422 void
1423 throttle_info_release(void *throttle_info)
1424 {
1425         struct _throttle_io_info_t *info = throttle_info;
1426         DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n", info, info);
1427         if (info == NULL)       /* Just to be careful */
1428                 return;
1429         throttle_info_rel(info);
1430 }
1431
1432 /*
1433  * KPI routine
1434  *
1435  * A file system that created its own info structure calls this from its
1436  * mount routine (the cluster code relies on it).  Every such call must be
1437  * paired with throttle_info_mount_rel in the unmount routine.  Nothing
1438  * happens when either argument is NULL.
1439  */
1440 void
1441 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1442 {
1443         if ((mp != NULL) && (throttle_info != NULL)) {
1444                 /*
1445                  * reference the new info first, then let go of whatever
1446                  * the mount was holding before installing the new pointer
1447                  */
1448                 throttle_info_ref(throttle_info);
1449                 if (mp->mnt_throttle_info != NULL)
1450                         throttle_info_rel(mp->mnt_throttle_info);
1451                 mp->mnt_throttle_info = throttle_info;
1452         }
1453 }
1454
1455 /*
1456  * Private KPI routine
1457  *
1458  * return a handle for accessing throttle_info given a throttle_mask.  The
1459  * handle must be released by throttle_info_rel_by_mask.  Returns 0, or
1460  * EINVAL (handle untouched) for a NULL handle pointer or an empty mask.
1461  */
1462 int
1463 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1464 {
1465         int     dev_index;
1466         struct _throttle_io_info_t *info;
1467
1468         if ((throttle_info_handle == NULL) || (throttle_mask == 0))
1469                 return EINVAL;
1470         /* an empty mask would make num_trailing_0() return the bit width, an index no mask bit can name */
1471         dev_index = num_trailing_0(throttle_mask);
1472         info = &_throttle_io_info[dev_index];
1473         throttle_info_ref(info);
1474         *(struct _throttle_io_info_t**)throttle_info_handle = info;
1475
1476         return 0;
1477 }
1477
1478 /*
1479  * Private KPI routine
1480  *
1481  * give back the handle handed out by throttle_info_ref_by_mask
1482  */
1483 void
1484 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1485 {
1486         /* the opaque handle is, for now, simply the info structure's address */
1487         struct _throttle_io_info_t *info;
1488         info = (struct _throttle_io_info_t*)throttle_info_handle;
1489         throttle_info_rel(info);
1490 }
1491
1492 /*
1493  * KPI routine
1494  *
1495  * Counterpart of throttle_info_mount_ref: a file system that called it
1496  * must call this routine from its umount routine.
1497  */
1498 void
1499 throttle_info_mount_rel(mount_t mp)
1500 {
1501         if (mp->mnt_throttle_info != NULL)      /* nothing to drop otherwise */
1502                 throttle_info_rel(mp->mnt_throttle_info);
1503         mp->mnt_throttle_info = NULL;
1504 }
1505
1506 void
1507 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1508 {
1509         struct _throttle_io_info_t *info;
1510
1511         if (mp == NULL)
1512                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1513         else if (mp->mnt_throttle_info == NULL)
1514                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1515         else
1516                 info = mp->mnt_throttle_info;
1517
1518         *tv = info->throttle_last_write_timestamp;
1519 }
1520
1521 void
1522 update_last_io_time(mount_t mp)
1523 {
1524         struct _throttle_io_info_t *info;
1525
1526         if (mp == NULL)
1527                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1528         else if (mp->mnt_throttle_info == NULL)
1529                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1530         else
1531                 info = mp->mnt_throttle_info;
1532
1533         microuptime(&info->throttle_last_write_timestamp);
1534         if (mp != NULL)
1535                 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1536 }
1537
1538
1539 int
1540 throttle_get_io_policy(uthread_t *ut)
1541 {
1542         if (ut != NULL)
1543                 *ut = get_bsdthread_info(current_thread());
1544
1545         return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO));
1546 }
1547
1548 int
1549 throttle_get_passive_io_policy(uthread_t *ut)
1550 {
1551         if (ut != NULL)
1552                 *ut = get_bsdthread_info(current_thread());
1553
1554         return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO));
1555 }
1556
1557
1558 static int
1559 throttle_get_thread_throttle_level(uthread_t ut)
1560 {
1561         uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1562         int io_tier = throttle_get_io_policy(ut_p);
1563
1564         return throttle_get_thread_throttle_level_internal(ut, io_tier);
1565 }
1566
1567 /*
1568  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1569  */
1570 static int
1571 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier) {
1572         int thread_throttle_level = io_tier;
1573         int user_idle_level;
1574
1575         assert(ut != NULL);
1576
1577         /* Bootcache misses should always be throttled */
1578         if (ut->uu_throttle_bc == TRUE)
1579                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
1580
1581         /*
1582          * Issue tier3 I/O as tier2 when the user is idle
1583          * to allow maintenance tasks to make more progress.
1584          *
1585          * Assume any positive idle level is enough... for now it's
1586          * only ever 0 or 128 but this is not defined anywhere.
1587          */
1588         if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1589                 user_idle_level = timer_get_user_idle_level();
1590                 if (user_idle_level > 0) {
1591                         thread_throttle_level--;
1592                 }
1593         }
1594
1595         return (thread_throttle_level);
1596 }
1597
1598 /*
1599  * I/O will be throttled if either of the following are true:
1600  *   - Higher tiers have in-flight I/O
1601  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1602  *
1603  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1604  */
1605 static int
1606 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1607 {
1608         struct _throttle_io_info_t *info = throttle_info;
1609         struct timeval elapsed;
1610         struct timeval now;
1611         uint64_t elapsed_msecs;
1612         int     thread_throttle_level;
1613         int     throttle_level;
1614
1615         if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED)
1616                 return (THROTTLE_DISENGAGED);
1617
1618         microuptime(&now);
1619
1620         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1621                 if (info->throttle_inflight_count[throttle_level]) {
1622                         break;
1623                 }
1624                 elapsed = now;
1625                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1626                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1627
1628                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
1629                         break;
1630         }
1631         if (throttle_level >= thread_throttle_level) {
1632                 /*
1633                  * we're beyond all of the throttle windows
1634                  * that affect the throttle level of this thread,
1635                  * so go ahead and treat as normal I/O
1636                  */
1637                 return (THROTTLE_DISENGAGED);
1638         }
1639         if (mylevel)
1640                 *mylevel = thread_throttle_level;
1641         if (throttling_level)
1642                 *throttling_level = throttle_level;
1643
1644         if (info->throttle_io_count != info->throttle_io_count_begin) {
1645                 /*
1646                  * we've already issued at least one throttleable I/O
1647                  * in the current I/O window, so avoid issuing another one
1648                  */
1649                 return (THROTTLE_NOW);
1650         }
1651         /*
1652          * we're in the throttle window, so
1653          * cut the I/O size back
1654          */
1655         return (THROTTLE_ENGAGED);
1656 }
1657
1658 /*
1659  * If we have a mount point and it has a throttle info pointer then
1660  * use it to do the check, otherwise use the device unit number to find
1661  * the correct throttle info array element.
1662  */
1663 int
1664 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1665 {
1666         struct _throttle_io_info_t      *info;
1667
1668         /*
1669          * Should we just return zero if no mount point
1670          */
1671         if (mp == NULL)
1672                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1673         else if (mp->mnt_throttle_info == NULL)
1674                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1675         else
1676                 info = mp->mnt_throttle_info;
1677
1678         if (info->throttle_is_fusion_with_priority) {
1679                 uthread_t ut = get_bsdthread_info(current_thread());
1680                 if (ut->uu_lowpri_window == 0)
1681                         return (THROTTLE_DISENGAGED);
1682         }
1683
1684         if (info->throttle_disabled)
1685                 return (THROTTLE_DISENGAGED);
1686         else
1687                 return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1688 }
1689
1690 /*
1691  * Routine to increment I/O throttling counters maintained in the proc
1692  */
1693
1694 static void
1695 throttle_update_proc_stats(pid_t throttling_pid, int count)
1696 {
1697         proc_t throttling_proc;
1698         proc_t throttled_proc = current_proc();
1699
1700         /* The throttled_proc is always the current proc; so we are not concerned with refs */
1701         OSAddAtomic64(count, &(throttled_proc->was_throttled));
1702
1703         /* The throttling pid might have exited by now */
1704         throttling_proc = proc_find(throttling_pid);
1705         if (throttling_proc != PROC_NULL) {
1706                 OSAddAtomic64(count, &(throttling_proc->did_throttle));
1707                 proc_rele(throttling_proc);
1708         }
1709 }
1710
/*
 * Block until woken up by the throttle timer or by a rethrottle call.
 * As long as we hold the throttle_lock while querying the throttle tier, we're
 * safe against seeing an old throttle tier after a rethrottle.
 *
 * sleep_amount controls how long the calling thread is willing to wait:
 *   0     - never block; only tear down the thread's throttle state
 *   1     - if the thread is not marked uu_throttle_bc this is turned into 0
 *           for the wait loop, i.e. the loop is left as soon as the check
 *           reports THROTTLE_ENGAGED and the thread only blocks for other
 *           non-zero results
 *   other - while the check reports THROTTLE_ENGAGED, keep blocking until
 *           info->throttle_io_period_num has advanced by at least
 *           sleep_amount relative to the value sampled on entry (or has
 *           dropped below that value)
 *
 * Returns the number of times the thread blocked (sleep_cnt).
 *
 * Returns 0 right away if the thread has no lowpri window; if it has a
 * window but no throttle info, the window and uu_throttle_bc are cleared
 * and 0 is returned.  In every other case, on return the thread is off any
 * throttle list, uu_throttle_info / uu_throttle_bc / uu_lowpri_window are
 * reset and the thread's reference on the throttle info has been released.
 */
uint32_t
throttle_lowpri_io(int sleep_amount)
{
        uthread_t ut;
        struct _throttle_io_info_t *info;
        int     throttle_type = 0;
        int     mylevel = 0;
        int     throttling_level = THROTTLE_LEVEL_NONE;
        int     sleep_cnt = 0;
        uint32_t  throttle_io_period_num = 0;
        boolean_t insert_tail = TRUE;
        boolean_t s;

        ut = get_bsdthread_info(current_thread());

        /* no throttle window was opened for this thread: nothing to do */
        if (ut->uu_lowpri_window == 0)
                return (0);

        info = ut->uu_throttle_info;

        /* a window without an attached info: just clear the per-thread state */
        if (info == NULL) {
                ut->uu_throttle_bc = FALSE;
                ut->uu_lowpri_window = 0;
                return (0);
        }
        lck_mtx_lock(&info->throttle_lock);
        /* on entry the thread is expected not to be on any throttle list */
        assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);

        /* caller doesn't want to wait at all: go straight to the teardown */
        if (sleep_amount == 0)
                goto done;

        if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
                sleep_amount = 0;

        /* remember the I/O period we started in so progress can be measured below */
        throttle_io_period_num = info->throttle_io_period_num;

        ut->uu_was_rethrottled = FALSE;

        /* loop for as long as the throttle check returns a non-zero throttle type */
        while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) {

                if (throttle_type == THROTTLE_ENGAGED) {
                        /* merely inside a window: stop once the wait budget is used up */
                        if (sleep_amount == 0)
                                break;
                        /* the period counter is below our starting sample */
                        if (info->throttle_io_period_num < throttle_io_period_num)
                                break;
                        /* at least sleep_amount periods have gone by */
                        if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
                                break;
                }
                /*
                 * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
                 * then puts us back to the original level before we get a chance to run
                 */
                if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
                        /*
                         * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
                         * and we've changed our throttling level, so pull ourselves off of the appropriate list
                         * and make sure we get put on the tail of the new list since we're starting anew w/r to
                         * the throttling engine
                         */
                        TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
                        ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
                        insert_tail = TRUE;
                }
                if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
                        /*
                         * not queued yet: queue ourselves at our level.
                         * NOTE(review): throttle_add_to_list is defined elsewhere; a return of
                         * THROTTLE_LEVEL_END makes us give up waiting - confirm its exact meaning there.
                         */
                        if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END)
                                goto done;
                }
                assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);

                /* interrupts are disabled for as long as uu_rethrottle_lock is held */
                s = ml_set_interrupts_enabled(FALSE);
                lck_spin_lock(&ut->uu_rethrottle_lock);

                /*
                 * this is the critical section w/r to our interaction
                 * with "rethrottle_thread"
                 */
                if (ut->uu_was_rethrottled == TRUE) {

                        /*
                         * we were rethrottled before we got to sleep: don't block,
                         * drop the spin lock, yield throttle_lock and re-run the check
                         */
                        lck_spin_unlock(&ut->uu_rethrottle_lock);
                        ml_set_interrupts_enabled(s);
                        lck_mtx_yield(&info->throttle_lock);

                        KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0);

                        ut->uu_was_rethrottled = FALSE;
                        continue;
                }
                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
                                info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);

                /* only the first sleep of this call opens the trace interval and is counted */
                if (sleep_cnt == 0) {
                        KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
                                              throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
                        throttled_count[mylevel]++;
                }
                ut->uu_wmesg = "throttle_lowpri_io";

                /*
                 * set up the wait on &ut->uu_on_throttlelist while both locks are
                 * still held; the thread actually blocks in thread_block() below,
                 * after the locks have been dropped
                 */
                assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);

                ut->uu_is_throttled = TRUE;
                lck_spin_unlock(&ut->uu_rethrottle_lock);
                ml_set_interrupts_enabled(s);

                lck_mtx_unlock(&info->throttle_lock);

                thread_block(THREAD_CONTINUE_NULL);

                /* we're running again: clear the per-sleep state before re-taking the mutex */
                ut->uu_wmesg = NULL;

                ut->uu_is_throttled = FALSE;
                ut->uu_was_rethrottled = FALSE;

                lck_mtx_lock(&info->throttle_lock);

                sleep_cnt++;

                /*
                 * once we have slept with no budget left (or have just used it up),
                 * any further queueing is requested with insert_tail == FALSE
                 */
                if (sleep_amount == 0)
                        insert_tail = FALSE;
                else if (info->throttle_io_period_num < throttle_io_period_num ||
                         (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
                        insert_tail = FALSE;
                        sleep_amount = 0;
                }
        }
done:
        /* make sure we don't leave ourselves on a throttle list */
        if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
                TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
                ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
        }
        lck_mtx_unlock(&info->throttle_lock);

        if (sleep_cnt) {
                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
                                      throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
                /*
                 * We update the stats for the last pid which opened a throttle window for the throttled thread.
                 * This might not be completely accurate since the multiple throttles seen by the lower tier pid
                 * might have been caused by various higher prio pids. However, updating these stats accurately
                 * means doing a proc_find while holding the throttle lock which leads to deadlock.
                 */
                throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
        }

        /* the throttle window is over for this thread: reset its state ... */
        ut->uu_throttle_info = NULL;
        ut->uu_throttle_bc = FALSE;
        ut->uu_lowpri_window = 0;

        /* ... and release the info reference that uu_throttle_info was holding */
        throttle_info_rel(info);

        return (sleep_cnt);
}
1867
1868 /*
1869  * KPI routine
1870  *
1871  * set a kernel thread's IO policy.  policy can be:
1872  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
1873  *
1874  * explanations about these policies are in the man page of setiopolicy_np
1875  */
1876 void throttle_set_thread_io_policy(int policy)
1877 {
1878         proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
1879 }
1880
1881 void throttle_info_reset_window(uthread_t ut)
1882 {
1883         struct _throttle_io_info_t *info;
1884
1885         if (ut == NULL)
1886                 ut = get_bsdthread_info(current_thread());
1887
1888         if ( (info = ut->uu_throttle_info) ) {
1889                 throttle_info_rel(info);
1890
1891                 ut->uu_throttle_info = NULL;
1892                 ut->uu_lowpri_window = 0;
1893                 ut->uu_throttle_bc = FALSE;
1894         }
1895 }
1896
1897 static
1898 void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
1899 {
1900         if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
1901                 return;
1902
1903         if (info->throttle_io_periods == 0) {
1904                 throttle_init_throttle_period(info, isssd);
1905         }
1906         if (ut->uu_throttle_info == NULL) {
1907
1908                 ut->uu_throttle_info = info;
1909                 throttle_info_ref(info);
1910                 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1911
1912                 ut->uu_lowpri_window = 1;
1913                 ut->uu_throttle_bc = BC_throttle;
1914         }
1915 }
1916
1917 /*
1918  * Update inflight IO count and throttling window
1919  * Should be called when an IO is done
1920  *
1921  * Only affects IO that was sent through spec_strategy
1922  */
1923 void throttle_info_end_io(buf_t bp) {
1924         mount_t mp;
1925         struct bufattr *bap;
1926         struct _throttle_io_info_t *info;
1927         int io_tier;
1928
1929         bap = &bp->b_attr;
1930         if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
1931                 return;
1932         }
1933         CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
1934
1935         mp = buf_vnode(bp)->v_mount;
1936         if (mp != NULL) {
1937                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1938         } else {
1939                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1940         }
1941
1942         io_tier = GET_BUFATTR_IO_TIER(bap);
1943         if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
1944                 io_tier--;
1945         }
1946
1947         throttle_info_end_io_internal(info, io_tier);
1948 }
1949
1950 /*
1951  * Decrement inflight count initially incremented by throttle_info_update_internal
1952  */
1953 static
1954 void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level) {
1955         if (throttle_level == THROTTLE_LEVEL_NONE) {
1956                 return;
1957         }
1958
1959         microuptime(&info->throttle_window_start_timestamp[throttle_level]);
1960         OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
1961         assert(info->throttle_inflight_count[throttle_level] >= 0);
1962 }
1963
1964 /*
1965  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
1966  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
1967  */
1968 static
1969 int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
1970 {
1971         int     thread_throttle_level;
1972
1973         if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
1974                 return THROTTLE_LEVEL_NONE;
1975
1976         if (ut == NULL)
1977                 ut = get_bsdthread_info(current_thread());
1978
1979         if (bap && inflight && !ut->uu_throttle_bc) {
1980                 thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
1981                 if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
1982                         thread_throttle_level--;
1983                 }
1984         } else {
1985                 thread_throttle_level = throttle_get_thread_throttle_level(ut);
1986         }
1987
1988         if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
1989         if(!ISSET(flags, B_PASSIVE)) {
1990                         info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
1991                         if (inflight && !ut->uu_throttle_bc) {
1992                                 if (NULL != bap) {
1993                                         SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
1994                                 }
1995                                 OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
1996                         } else {
1997                                 microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
1998                         }
1999                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
2000                                         current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
2001                 }
2002                 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2003         }
2004
2005
2006         if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2007                 /*
2008                  * I'd really like to do the IOSleep here, but
2009                  * we may be holding all kinds of filesystem related locks
2010                  * and the pages for this I/O marked 'busy'...
2011                  * we don't want to cause a normal task to block on
2012                  * one of these locks while we're throttling a task marked
2013                  * for low priority I/O... we'll mark the uthread and
2014                  * do the delay just before we return from the system
2015                  * call that triggered this I/O or from vnode_pagein
2016                  */
2017                 OSAddAtomic(1, &info->throttle_io_count);
2018
2019                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2020         }
2021
2022         return thread_throttle_level;
2023 }
2024
2025 void *throttle_info_update_by_mount(mount_t mp)
2026 {
2027         struct _throttle_io_info_t *info;
2028         uthread_t ut;
2029         boolean_t isssd = FALSE;
2030
2031         ut = get_bsdthread_info(current_thread());
2032
2033         if (mp != NULL) {
2034                 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
2035                         isssd = TRUE;
2036                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2037         } else
2038                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2039
2040         if (!ut->uu_lowpri_window)
2041                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2042
2043         return info;
2044 }
2045
2046
2047 /*
2048  * KPI routine
2049  *
2050  * this is usually called before every I/O, used for throttled I/O
2051  * book keeping.  This routine has low overhead and does not sleep
2052  */
2053 void throttle_info_update(void *throttle_info, int flags)
2054 {
2055         if (throttle_info)
2056                 throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2057 }
2058
/*
 * KPI routine
 *
 * Usually called before every I/O to do the book keeping for throttled
 * I/O.  This routine has low overhead and does not sleep.
 */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
        /*
         * Only the lowest bit of the throttle mask is used for now, which
         * makes the handle identical to the throttle_info.  Should the
         * handle ever carry a set of throttle infos, this is the place to
         * loop over them and call throttle_info_update for each one.
         */
        throttle_info_update(throttle_info_handle, flags);
}
2077 /*
2078  * KPI routine
2079  *
2080  * This routine marks the throttle info as disabled. Used for mount points which
2081  * support I/O scheduling.
2082  */
2083
2084 void throttle_info_disable_throttle(int devno, boolean_t isfusion)
2085 {
2086         struct _throttle_io_info_t *info;
2087
2088         if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV)
2089                 panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2090
2091         info = &_throttle_io_info[devno];
2092         // don't disable software throttling on devices that are part of a fusion device
2093         // and override the software throttle periods to use HDD periods
2094         if (isfusion) {
2095                 info->throttle_is_fusion_with_priority = isfusion;
2096                 throttle_init_throttle_period(info, FALSE);
2097         }
2098         info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2099         return;
2100 }
2101
2102
2103 /*
2104  * KPI routine (private)
2105  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2106  */
2107 int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2108 {
2109         struct _throttle_io_info_t *info = throttle_info;
2110         struct timeval elapsed;
2111         uint64_t elapsed_msecs;
2112         int     throttle_level;
2113         int     thread_throttle_level;
2114
2115         switch (policy) {
2116
2117         case IOPOL_THROTTLE:
2118                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
2119                 break;
2120         case IOPOL_UTILITY:
2121                 thread_throttle_level = THROTTLE_LEVEL_TIER2;
2122                 break;
2123         case IOPOL_STANDARD:
2124                 thread_throttle_level = THROTTLE_LEVEL_TIER1;
2125                 break;
2126         default:
2127                 thread_throttle_level = THROTTLE_LEVEL_TIER0;
2128                 break;
2129         }
2130         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2131                 if (info->throttle_inflight_count[throttle_level]) {
2132                         break;
2133                 }
2134
2135                 microuptime(&elapsed);
2136                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2137                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2138
2139                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
2140                         break;
2141         }
2142         if (throttle_level >= thread_throttle_level) {
2143                 /*
2144                  * we're beyond all of the throttle windows
2145                  * so go ahead and treat as normal I/O
2146                  */
2147                 return (THROTTLE_DISENGAGED);
2148         }
2149         /*
2150          * we're in the throttle window
2151          */
2152         return (THROTTLE_ENGAGED);
2153 }
2154
2155 int throttle_lowpri_window(void)
2156 {
2157         struct uthread *ut = get_bsdthread_info(current_thread());
2158         return ut->uu_lowpri_window;
2159 }
2160
2161 int
2162 spec_strategy(struct vnop_strategy_args *ap)
2163 {
2164         buf_t   bp;
2165         int     bflags;
2166         int     io_tier;
2167         int     passive;
2168         dev_t   bdev;
2169         uthread_t ut;
2170         mount_t mp;
2171         struct  bufattr *bap;
2172         int     strategy_ret;
2173         struct _throttle_io_info_t *throttle_info;
2174         boolean_t isssd = FALSE;
2175         boolean_t inflight = FALSE;
2176         boolean_t upgrade = FALSE;
2177         int code = 0;
2178
2179         proc_t curproc = current_proc();
2180
2181         bp = ap->a_bp;
2182         bdev = buf_device(bp);
2183         mp = buf_vnode(bp)->v_mount;
2184         bap = &bp->b_attr;
2185
2186         io_tier = throttle_get_io_policy(&ut);
2187         passive = throttle_get_passive_io_policy(&ut);
2188
2189         /*
2190          * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
2191          * while preserving the original issued tier (throttle_get_io_policy
2192          * does not return upgraded tiers)
2193          */
2194         if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
2195 #if CONFIG_IOSCHED
2196                 if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2197                         upgrade = TRUE;
2198                 }
2199 #else /* CONFIG_IOSCHED */
2200                 upgrade = TRUE;
2201 #endif /* CONFIG_IOSCHED */
2202         }
2203
2204         if (bp->b_flags & B_META)
2205                 bap->ba_flags |= BA_META;
2206
2207 #if CONFIG_IOSCHED
2208         /*
2209          * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
2210          * To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
2211          * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
2212          * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
2213          */
2214         if (bap->ba_flags & BA_META) {
2215                 if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2216                         if (bp->b_flags & B_READ) {
2217                                 if (io_tier > IOSCHED_METADATA_TIER) {
2218                                         io_tier = IOSCHED_METADATA_TIER;
2219                                         passive = 1;
2220                                 }
2221                         } else {
2222                                 io_tier = IOSCHED_METADATA_TIER;
2223                                 passive = 1;
2224                         }
2225                 }
2226         }
2227 #endif /* CONFIG_IOSCHED */
2228
2229         SET_BUFATTR_IO_TIER(bap, io_tier);
2230
2231         if (passive) {
2232                 bp->b_flags |= B_PASSIVE;
2233                 bap->ba_flags |= BA_PASSIVE;
2234         }
2235
2236         if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
2237                 bap->ba_flags |= BA_DELAYIDLESLEEP;
2238
2239         bflags = bp->b_flags;
2240
2241         if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0))
2242                 bufattr_markquickcomplete(bap);
2243
2244         if (bflags & B_READ)
2245                 code |= DKIO_READ;
2246         if (bflags & B_ASYNC)
2247                 code |= DKIO_ASYNC;
2248
2249         if (bap->ba_flags & BA_META)
2250                 code |= DKIO_META;
2251         else if (bflags & B_PAGEIO)
2252                 code |= DKIO_PAGING;
2253
2254         if (io_tier != 0)
2255                 code |= DKIO_THROTTLE;
2256
2257         code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
2258
2259         if (bflags & B_PASSIVE)
2260                 code |= DKIO_PASSIVE;
2261
2262         if (bap->ba_flags & BA_NOCACHE)
2263                 code |= DKIO_NOCACHE;
2264
2265         if (upgrade) {
2266                 code |= DKIO_TIER_UPGRADE;
2267                 SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
2268         }
2269
2270         if (kdebug_enable) {
2271                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
2272                                           buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0);
2273         }
2274
2275         thread_update_io_stats(current_thread(), buf_count(bp), code);
2276
2277         if (mp != NULL) {
2278                 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
2279                         isssd = TRUE;
2280                 /*
2281                  * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
2282                  * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
2283                  * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
2284                  */
2285                 if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
2286                         inflight = TRUE;
2287                 }
2288                 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
2289
2290         } else
2291                 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2292
2293         throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);
2294
2295         if ((bflags & B_READ) == 0) {
2296                 microuptime(&throttle_info->throttle_last_write_timestamp);
2297
2298                 if (mp) {
2299                         mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
2300                         INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
2301                 }
2302         } else if (mp) {
2303                 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
2304         }
2305         /*
2306          * The BootCache may give us special information about
2307          * the IO, so it returns special values that we check
2308          * for here.
2309          *
2310          * IO_SATISFIED_BY_CACHE
2311          * The read has been satisfied by the boot cache. Don't
2312          * throttle the thread unnecessarily.
2313          *
2314          * IO_SHOULD_BE_THROTTLED
2315          * The boot cache is playing back a playlist and this IO
2316          * cut through. Throttle it so we're not cutting through
2317          * the boot cache too often.
2318          *
2319          * Note that typical strategy routines are defined with
2320          * a void return so we'll get garbage here. In the
2321          * unlikely case the garbage matches our special return
2322          * value, it's not a big deal since we're only adjusting
2323          * the throttling delay.
2324          */
2325 #define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
2326 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2327         typedef int strategy_fcn_ret_t(struct buf *bp);
2328
2329         strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
2330
2331         if (IO_SATISFIED_BY_CACHE == strategy_ret) {
2332                 /*
2333                  * If this was a throttled IO satisfied by the boot cache,
2334                  * don't delay the thread.
2335                  */
2336                 throttle_info_reset_window(ut);
2337
2338         } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
2339                 /*
2340                  * If the boot cache indicates this IO should be throttled,
2341                  * delay the thread.
2342                  */
2343                 throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
2344         }
2345         return (0);
2346 }
2347
2348
/*
 * Block mapping is not provided for special device vnodes.
 *
 * The argument structure is never examined; the call always fails
 * with ENOTSUP.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
        return ENOTSUP;
}
2357
2358
2359 /*
2360  * Device close routine
2361  */
2362 int
2363 spec_close(struct vnop_close_args *ap)
2364 {
2365         struct vnode *vp = ap->a_vp;
2366         dev_t dev = vp->v_rdev;
2367         int error = 0;
2368         int flags = ap->a_fflag;
2369         struct proc *p = vfs_context_proc(ap->a_context);
2370         struct session *sessp;
2371
2372         switch (vp->v_type) {
2373
2374         case VCHR:
2375                 /*
2376                  * Hack: a tty device that is a controlling terminal
2377                  * has a reference from the session structure.
2378                  * We cannot easily tell that a character device is
2379                  * a controlling terminal, unless it is the closing
2380                  * process' controlling terminal.  In that case,
2381                  * if the reference count is 1 (this is the very
2382                  * last close)
2383                  */
2384                 sessp = proc_session(p);
2385                 devsw_lock(dev, S_IFCHR);
2386                 if (sessp != SESSION_NULL) {
2387                         if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
2388                                 struct tty *tp = TTY_NULL;
2389
2390                                 devsw_unlock(dev, S_IFCHR);
2391                                 session_lock(sessp);
2392                                 if (vp == sessp->s_ttyvp) {
2393                                         tp = SESSION_TP(sessp);
2394                                         sessp->s_ttyvp = NULL;
2395                                         sessp->s_ttyvid = 0;
2396                                         sessp->s_ttyp = TTY_NULL;
2397                                         sessp->s_ttypgrpid = NO_PID;
2398                                 }
2399                                 session_unlock(sessp);
2400
2401                                 if (tp != TTY_NULL) {
2402                                         /*
2403                                          * We may have won a race with a proc_exit
2404                                          * of the session leader, the winner
2405                                          * clears the flag (even if not set)
2406                                          */
2407                                         tty_lock(tp);
2408                                         ttyclrpgrphup(tp);
2409                                         tty_unlock(tp);
2410
2411                                         ttyfree(tp);
2412                                 }
2413                                 devsw_lock(dev, S_IFCHR);
2414                         }
2415                         session_rele(sessp);
2416                 }
2417
2418                 if (--vp->v_specinfo->si_opencount < 0)
2419                         panic("negative open count (c, %u, %u)", major(dev), minor(dev));
2420
2421                 /*
2422                  * close on last reference or on vnode revoke call
2423                  */
2424                 if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
2425                         error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
2426
2427                 devsw_unlock(dev, S_IFCHR);
2428                 break;
2429
2430         case VBLK:
2431                 /*
2432                  * If there is more than one outstanding open, don't
2433                  * send the close to the device.
2434                  */
2435                 devsw_lock(dev, S_IFBLK);
2436                 if (vcount(vp) > 1) {
2437                         vp->v_specinfo->si_opencount--;
2438                         devsw_unlock(dev, S_IFBLK);
2439                         return (0);
2440                 }
2441                 devsw_unlock(dev, S_IFBLK);
2442
2443                 /*
2444                  * On last close of a block device (that isn't mounted)
2445                  * we must invalidate any in core blocks, so that
2446                  * we can, for instance, change floppy disks.
2447                  */
2448                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
2449                         return (error);
2450
2451                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
2452                 if (error)
2453                         return (error);
2454
2455                 devsw_lock(dev, S_IFBLK);
2456
2457                 if (--vp->v_specinfo->si_opencount < 0)
2458                         panic("negative open count (b, %u, %u)", major(dev), minor(dev));
2459
2460                 if (vcount(vp) == 0)
2461                         error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
2462
2463                 devsw_unlock(dev, S_IFBLK);
2464                 break;
2465
2466         default:
2467                 panic("spec_close: not special");
2468                 return(EBADF);
2469         }
2470
2471         return error;
2472 }
2473
2474 /*
2475  * Return POSIX pathconf information applicable to special devices.
2476  */
2477 int
2478 spec_pathconf(struct vnop_pathconf_args *ap)
2479 {
2480
2481         switch (ap->a_name) {
2482         case _PC_LINK_MAX:
2483                 *ap->a_retval = LINK_MAX;
2484                 return (0);
2485         case _PC_MAX_CANON:
2486                 *ap->a_retval = MAX_CANON;
2487                 return (0);
2488         case _PC_MAX_INPUT:
2489                 *ap->a_retval = MAX_INPUT;
2490                 return (0);
2491         case _PC_PIPE_BUF:
2492                 *ap->a_retval = PIPE_BUF;
2493                 return (0);
2494         case _PC_CHOWN_RESTRICTED:
2495                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2496                 return (0);
2497         case _PC_VDISABLE:
2498                 *ap->a_retval = _POSIX_VDISABLE;
2499                 return (0);
2500         default:
2501                 return (EINVAL);
2502         }
2503         /* NOTREACHED */
2504 }
2505
/*
 * Generic "operation failed" handler for special devices.
 *
 * The argument is ignored; the result is always EBADF.
 */
int
spec_ebadf(__unused void *dummy)
{
        return EBADF;
}
2515
2516 /* Blktooff derives file offset from logical block number */
2517 int
2518 spec_blktooff(struct vnop_blktooff_args *ap)
2519 {
2520         struct vnode *vp = ap->a_vp;
2521
2522         switch (vp->v_type) {
2523         case VCHR:
2524                 *ap->a_offset = (off_t)-1; /* failure */
2525                 return (ENOTSUP);
2526
2527         case VBLK:
2528                 printf("spec_blktooff: not implemented for VBLK\n");
2529                 *ap->a_offset = (off_t)-1; /* failure */
2530                 return (ENOTSUP);
2531
2532         default:
2533                 panic("spec_blktooff type");
2534         }
2535         /* NOTREACHED */
2536
2537         return (0);
2538 }
2539
2540 /* Offtoblk derives logical block number from file offset */
2541 int
2542 spec_offtoblk(struct vnop_offtoblk_args *ap)
2543 {
2544         struct vnode *vp = ap->a_vp;
2545
2546         switch (vp->v_type) {
2547         case VCHR:
2548                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2549                 return (ENOTSUP);
2550
2551         case VBLK:
2552                 printf("spec_offtoblk: not implemented for VBLK\n");
2553                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2554                 return (ENOTSUP);
2555
2556         default:
2557                 panic("spec_offtoblk type");
2558         }
2559         /* NOTREACHED */
2560
2561         return (0);
2562 }
2563
2564 static void filt_specdetach(struct knote *kn);
2565 static int filt_spec(struct knote *kn, long hint);
2566 static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev);
2567 static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
2568 static unsigned filt_specpeek(struct knote *kn);
2569
2570 struct filterops spec_filtops = {
2571         .f_isfd         = 1,
2572         .f_attach       = filt_specattach,
2573         .f_detach       = filt_specdetach,
2574         .f_event        = filt_spec,
2575         .f_touch        = filt_spectouch,
2576         .f_process      = filt_specprocess,
2577         .f_peek         = filt_specpeek
2578 };
2579
2580 static int
2581 filter_to_seltype(int16_t filter)
2582 {
2583         switch (filter) {
2584         case EVFILT_READ:
2585                 return FREAD;
2586         case EVFILT_WRITE:
2587                 return FWRITE;
2588         default:
2589                 panic("filt_to_seltype(): invalid filter %d\n", filter);
2590                 return 0;
2591         }
2592 }
2593
2594 static int
2595 filt_specattach(struct knote *kn)
2596 {
2597         vnode_t vp;
2598         dev_t dev;
2599
2600         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
2601
2602         assert(vnode_ischr(vp));
2603
2604         dev = vnode_specrdev(vp);
2605
2606         if (major(dev) > nchrdev) {
2607                 kn->kn_flags |= EV_ERROR;
2608                 kn->kn_data = ENXIO;
2609                 return 0;
2610         }
2611
2612         /*
2613          * For a few special kinds of devices, we can attach knotes with
2614          * no restrictions because their "select" vectors return the amount
2615          * of data available.  Others require an explicit NOTE_LOWAT with
2616          * data of 1, indicating that the caller doesn't care about actual
2617          * data counts, just an indication that the device has data.
2618          */
2619
2620         if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0 &&
2621             ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
2622                 kn->kn_flags |= EV_ERROR;
2623                 kn->kn_data = EINVAL;
2624                 return 0;
2625         }
2626
2627         kn->kn_hook_data = 0;
2628
2629         kn->kn_filtid = EVFILTID_SPEC;
2630         kn->kn_hookid = vnode_vid(vp);
2631
2632         knote_markstayactive(kn);
2633
2634         return 0;
2635 }
2636
2637 static void
2638 filt_specdetach(struct knote *kn)
2639 {
2640         knote_clearstayactive(kn);
2641
2642         /*
2643          * This is potentially tricky: the device's selinfo waitq that was
2644          * tricked into being part of this knote's waitq set may not be a part
2645          * of any other set, and the device itself may have revoked the memory
2646          * in which the waitq was held. We use the knote's kn_hook_data field
2647          * to keep the ID of the waitq's prepost table object. This
2648          * object keeps a pointer back to the waitq, and gives us a safe way
2649          * to decouple the dereferencing of driver allocated memory: if the
2650          * driver goes away (taking the waitq with it) then the prepost table
2651          * object will be invalidated. The waitq details are handled in the
2652          * waitq API invoked here.
2653          */
2654         if (kn->kn_hook_data) {
2655                 waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs));
2656                 kn->kn_hook_data = 0;
2657         }
2658 }
2659
2660 static int
2661 filt_spec(__unused struct knote *kn, __unused long hint)
2662 {
2663         panic("filt_spec()");
2664         return 0;
2665 }
2666
2667
2668
2669 static int
2670 filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
2671 {
2672         kn->kn_sdata = kev->data;
2673         kn->kn_sfflags = kev->fflags;
2674         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
2675                 kn->kn_udata = kev->udata;
2676
2677         /* stayqueued knotes don't need hints from touch */
2678         return 0;
2679 }
2680
2681 static int
2682 filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
2683 {
2684 #pragma unused(data)
2685         vnode_t vp;
2686         uthread_t uth;
2687         struct waitq_set *old_wqs;
2688         vfs_context_t ctx;
2689         int res;
2690         int selres;
2691         int error;
2692         int use_offset;
2693         dev_t dev;
2694         uint64_t flags;
2695         uint64_t rsvd, rsvd_arg;
2696         uint64_t *rlptr = NULL;
2697
2698         uth = get_bsdthread_info(current_thread());
2699         ctx = vfs_context_current();
2700         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2701
2702         /* JMM - locking against touches? */
2703
2704         error = vnode_getwithvid(vp, kn->kn_hookid);
2705         if (error != 0) {
2706                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2707                 *kev = kn->kn_kevent;
2708                 return 1;
2709         }
2710
2711         dev = vnode_specrdev(vp);
2712         flags = cdevsw_flags[major(dev)];
2713         use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
2714
2715         /*
2716          * This function may be called many times to link or re-link the
2717          * underlying vnode to the kqueue.  If we've already linked the two,
2718          * we will have a valid kn_hook_data which ties us to the underlying
2719          * device's waitq via a the waitq's prepost table object. However,
2720          * devices can abort any select action by calling selthreadclear().
2721          * This is OK because the table object will be invalidated by the
2722          * driver (through a call to selthreadclear), so any attempt to access
2723          * the associated waitq will fail because the table object is invalid.
2724          *
2725          * Even if we've already registered, we need to pass a pointer
2726          * to a reserved link structure. Otherwise, selrecord() will
2727          * infer that we're in the second pass of select() and won't
2728          * actually do anything!
2729          */
2730         rsvd = rsvd_arg = waitq_link_reserve(NULL);
2731         rlptr = (void *)&rsvd_arg;
2732
2733         /*
2734          * Trick selrecord() into hooking kqueue's wait queue set
2735          * set into device's selinfo wait queue
2736          */
2737         old_wqs = uth->uu_wqset;
2738         uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);
2739         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter),
2740                              0, rlptr, ctx);
2741         uth->uu_wqset = old_wqs;
2742
2743         /*
2744          * make sure to cleanup the reserved link - this guards against
2745          * drivers that may not actually call selrecord().
2746          */
2747         waitq_link_release(rsvd);
2748         if (rsvd != rsvd_arg) {
2749                 /* the driver / handler called selrecord() */
2750                 struct waitq *wq;
2751                 memcpy(&wq, rlptr, sizeof(void *));
2752
2753                 /*
2754                  * The waitq_get_prepost_id() function will (potentially)
2755                  * allocate a prepost table object for the waitq and return
2756                  * the table object's ID to us.  It will also set the
2757                  * waitq_prepost_id field within the waitq structure.
2758                  *
2759                  * We can just overwrite kn_hook_data because it's simply a
2760                  * table ID used to grab a reference when needed.
2761                  *
2762                  * We have a reference on the vnode, so we know that the
2763                  * device won't go away while we get this ID.
2764                  */
2765                 kn->kn_hook_data = waitq_get_prepost_id(wq);
2766         }
2767
2768         if (use_offset) {
2769                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
2770                         kn->kn_data = 0;
2771                 } else {
2772                         kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
2773                 }
2774         } else {
2775                 kn->kn_data = selres;
2776         }
2777
2778         vnode_put(vp);
2779
2780         res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ?
2781                 (kn->kn_data >= kn->kn_sdata) : kn->kn_data;
2782
2783         if (res) {
2784                 *kev = kn->kn_kevent;
2785                 if (kn->kn_flags & EV_CLEAR) {
2786                         kn->kn_fflags = 0;
2787                         kn->kn_data = 0;
2788                 }
2789         }
2790
2791         return res;
2792 }
2793
2794 static unsigned
2795 filt_specpeek(struct knote *kn)
2796 {
2797         vnode_t vp;
2798         uthread_t uth;
2799         struct waitq_set *old_wqs;
2800         vfs_context_t ctx;
2801         int error, selres;
2802         uint64_t rsvd, rsvd_arg;
2803         uint64_t *rlptr = NULL;
2804
2805         uth = get_bsdthread_info(current_thread());
2806         ctx = vfs_context_current();
2807         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2808
2809         error = vnode_getwithvid(vp, kn->kn_hookid);
2810         if (error != 0) {
2811                 return 1; /* Just like VNOP_SELECT() on recycled vnode */
2812         }
2813
2814         /*
2815          * Even if we've already registered, we need to pass a pointer
2816          * to a reserved link structure. Otherwise, selrecord() will
2817          * infer that we're in the second pass of select() and won't
2818          * actually do anything!
2819          */
2820         rsvd = rsvd_arg = waitq_link_reserve(NULL);
2821         rlptr = (void *)&rsvd_arg;
2822
2823         old_wqs = uth->uu_wqset;
2824         uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);
2825         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter),
2826                              0, (void *)rlptr, ctx);
2827         uth->uu_wqset = old_wqs;
2828
2829         /*
2830          * make sure to cleanup the reserved link - this guards against
2831          * drivers that may not actually call selrecord()
2832          */
2833         waitq_link_release(rsvd);
2834         if (rsvd != rsvd_arg) {
2835                 /* the driver / handler called selrecord() */
2836                 struct waitq *wq;
2837                 memcpy(&wq, rlptr, sizeof(void *));
2838
2839                 /*
2840                  * The waitq_get_prepost_id() function will (potentially)
2841                  * allocate a prepost table object for the waitq and return
2842                  * the table object's ID to us.  It will also set the
2843                  * waitq_prepost_id field within the waitq structure.
2844                  *
2845                  * We can just overwrite kn_hook_data because it's simply a
2846                  * table ID used to grab a reference when needed.
2847                  *
2848                  * We have a reference on the vnode, so we know that the
2849                  * device won't go away while we get this ID.
2850                  */
2851                 kn->kn_hook_data = waitq_get_prepost_id(wq);
2852         }
2853
2854         vnode_put(vp);
2855         return selres;
2856 }
2857