bsd/miscfs/specfs/spec_vnops.c

   1 /*
   2  * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993, 1995
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/kauth.h>
  67 #include <sys/systm.h>
  68 #include <sys/kernel.h>
  69 #include <sys/conf.h>
  70 #include <sys/buf_internal.h>
  71 #include <sys/mount_internal.h>
  72 #include <sys/vnode_internal.h>
  73 #include <sys/file_internal.h>
  74 #include <sys/namei.h>
  75 #include <sys/stat.h>
  76 #include <sys/errno.h>
  77 #include <sys/ioctl.h>
  78 #include <sys/file.h>
  79 #include <sys/user.h>
  80 #include <sys/malloc.h>
  81 #include <sys/disk.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/resource.h>
  84 #include <machine/machine_routines.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <vfs/vfs_support.h>
  87 #include <vfs/vfs_disk_conditioner.h>
  88
  89 #include <kern/assert.h>
  90 #include <kern/task.h>
  91 #include <kern/sched_prim.h>
  92 #include <kern/thread.h>
  93 #include <kern/policy_internal.h>
  94 #include <kern/timer_call.h>
  95 #include <kern/waitq.h>
  96
  97 #include <pexpert/pexpert.h>
  98
  99 #include <sys/kdebug.h>
 100 #include <libkern/section_keywords.h>
 101
/*
 * XXX the following prototypes should be in a header file somewhere.
 * chrtoblk() and iskmemdev() are used by spec_open(); the *kqfilter()
 * routines are used by spec_kqfilter().
 */
extern dev_t    chrtoblk(dev_t dev);
extern boolean_t        iskmemdev(dev_t dev);
extern int      bpfkqfilter(dev_t dev, struct knote *kn);
extern int ptsd_kqfilter(dev_t, struct knote *);
extern int ptmx_kqfilter(dev_t, struct knote *);
 108
/*
 * Table of SPECHSZ vnode pointers.
 * NOTE(review): presumably the hash chain heads for special-device vnodes;
 * it is not referenced in this part of the file -- confirm against its users.
 */
struct vnode *speclisth[SPECHSZ];
 110
/*
 * Symbolic sleep message strings for devices.
 * Each array holds its own name as a NUL-terminated string.
 */
char    devopn[] = "devopn";
char    devio[] = "devio";
char    devwait[] = "devwait";
char    devin[] = "devin";
char    devout[] = "devout";
char    devioc[] = "devioc";
char    devcls[] = "devcls";
 119
 120 #define VOPFUNC int (*)(void *)
 121
 122 int(**spec_vnodeop_p)(void *);
 123 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 124         { &vnop_default_desc, (VOPFUNC)vn_default_error },
 125         { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
 126         { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
 127         { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
 128         { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
 129         { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
 130         { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
 131         { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
 132         { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
 133         { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
 134         { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
 135         { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
 136         { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
 137         { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
 138         { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
 139         { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
 140         { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
 141         { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
 142         { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
 143         { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
 144         { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
 145         { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
 146         { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
 147         { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
 148         { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
 149         { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
 150         { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
 151         { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
 152         { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
 153         { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
 154         { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
 155         { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
 156         { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
 157         { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
 158         { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
 159         { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
 160         { (struct vnodeop_desc*)NULL, (int (*)(void *))NULL }
 161 };
 162 struct vnodeopv_desc spec_vnodeop_opv_desc =
 163 { &spec_vnodeop_p, spec_vnodeop_entries };
 164
 165
/* Forward declaration; defined below, used by set_fsblocksize() and spec_open(). */
static void set_blocksize(vnode_t, dev_t);
 167
/*
 * Default values for the three low-priority I/O throttle tiers.  They
 * populate throttle_windows_msecs[], throttle_io_period_msecs[] and
 * throttle_io_period_ssd_msecs[] below.  Going by the names, the values
 * are in milliseconds.
 */

/* Throttle window per tier. */
#define LOWPRI_TIER1_WINDOW_MSECS         25
#define LOWPRI_TIER2_WINDOW_MSECS         100
#define LOWPRI_TIER3_WINDOW_MSECS         500

/* I/O period per tier (default table). */
#define LOWPRI_TIER1_IO_PERIOD_MSECS      40
#define LOWPRI_TIER2_IO_PERIOD_MSECS      85
#define LOWPRI_TIER3_IO_PERIOD_MSECS      200

/* I/O period per tier (SSD table). */
#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25
 179
 180
 181 int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
 182         0,
 183         LOWPRI_TIER1_WINDOW_MSECS,
 184         LOWPRI_TIER2_WINDOW_MSECS,
 185         LOWPRI_TIER3_WINDOW_MSECS,
 186 };
 187
 188 int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
 189         0,
 190         LOWPRI_TIER1_IO_PERIOD_MSECS,
 191         LOWPRI_TIER2_IO_PERIOD_MSECS,
 192         LOWPRI_TIER3_IO_PERIOD_MSECS,
 193 };
 194
 195 int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
 196         0,
 197         LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
 198         LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
 199         LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
 200 };
 201
 202
/*
 * One counter per throttle level (THROTTLE_LEVEL_END + 1 slots).
 * NOTE(review): presumably counts throttled I/Os per level; the code
 * that updates it is not in this part of the file -- confirm.
 */
int     throttled_count[THROTTLE_LEVEL_END + 1];
 204
/*
 * Per-device I/O throttling state.  Instances live in the
 * _throttle_io_info[] array, which spec_read()/spec_write() index by the
 * vnode's si_devbsdunit.  Arrays sized THROTTLE_LEVEL_END + 1 hold one
 * slot per throttle level.
 */
struct _throttle_io_info_t {
        lck_mtx_t       throttle_lock;

        /* stamped with microuptime() by spec_write() for throttleable character disk devices */
        struct timeval  throttle_last_write_timestamp;
        struct timeval  throttle_min_timer_deadline;
        struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
        struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
        pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
        struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
        int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];

        TAILQ_HEAD(, uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];         /* Lists of throttled uthreads */
        int             throttle_next_wake_level;

        /* timer state; NOTE(review): managed by code outside this part of the file */
        thread_call_t   throttle_timer_call;
        int32_t throttle_timer_ref;
        int32_t throttle_timer_active;

        int32_t throttle_io_count;
        int32_t throttle_io_count_begin;
        /* presumably points at one of the throttle_io_period*_msecs tables -- TODO confirm */
        int    *throttle_io_periods;
        uint32_t throttle_io_period_num;

        int32_t throttle_refcnt;
        int32_t throttle_alloc;
        int32_t throttle_disabled;
        int32_t throttle_is_fusion_with_priority;
};
 233
/*
 * Throttle state table with LOWPRI_MAX_NUM_DEV entries.  spec_read() and
 * spec_write() select an entry with the vnode's si_devbsdunit.
 */
struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];


/*
 * Initialized to 1.
 * NOTE(review): presumably the global on/off switch for low-priority I/O
 * throttling; its readers are not in this part of the file -- confirm.
 */
int     lowpri_throttle_enabled = 1;
 238
 239
/*
 * Forward declarations for the throttling helpers defined later in this
 * file.  spec_read() and spec_write() call throttle_info_update_internal()
 * before, and throttle_info_end_io_internal() after, the character-device
 * driver I/O on throttleable disks.
 */
static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
static int throttle_get_thread_throttle_level(uthread_t ut);
static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
void throttle_info_mount_reset_period(mount_t mp, int isssd);
 245
 246 /*
 247  * Trivial lookup routine that always fails.
 248  */
 249 int
 250 spec_lookup(struct vnop_lookup_args *ap)
 251 {
 252         *ap->a_vpp = NULL;
 253         return ENOTDIR;
 254 }
 255
 256 static void
 257 set_blocksize(struct vnode *vp, dev_t dev)
 258 {
 259         int (*size)(dev_t);
 260         int rsize;
 261
 262         if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
 263                 rsize = (*size)(dev);
 264                 if (rsize <= 0) { /* did size fail? */
 265                         vp->v_specsize = DEV_BSIZE;
 266                 } else {
 267                         vp->v_specsize = rsize;
 268                 }
 269         } else {
 270                 vp->v_specsize = DEV_BSIZE;
 271         }
 272 }
 273
 274 void
 275 set_fsblocksize(struct vnode *vp)
 276 {
 277         if (vp->v_type == VBLK) {
 278                 dev_t dev = (dev_t)vp->v_rdev;
 279                 int maj = major(dev);
 280
 281                 if ((u_int)maj >= (u_int)nblkdev) {
 282                         return;
 283                 }
 284
 285                 vnode_lock(vp);
 286                 set_blocksize(vp, dev);
 287                 vnode_unlock(vp);
 288         }
 289 }
 290
 291
/*
 * Open a special file.
 *
 * Returns ENXIO when the vnode's mount has MNT_NODEV set or the device
 * major number is out of range, EPERM when a securelevel rule forbids
 * the open, the error from check_mountedon()/vfs_mountedon() when the
 * device is mounted, and otherwise the result of the driver's d_open
 * routine.  (For block devices the result can additionally become ENXIO;
 * see below.)
 *
 * On a successful driver open the specinfo open count is incremented
 * while the devsw lock is held.
 *
 * Character disks (d_type == D_DISK): the first successful open queries
 * the throttle mask and solid-state flag and caches them in the specinfo.
 *
 * Block devices: the block size is cached in v_specsize and the device
 * size in bytes in v_specdevsize (0 when it cannot be determined).
 *
 * Panics if the vnode is neither VCHR nor VBLK.
 */
int
spec_open(struct vnop_open_args *ap)
{
        struct proc *p = vfs_context_proc(ap->a_context);
        kauth_cred_t cred = vfs_context_ucred(ap->a_context);
        struct vnode *vp = ap->a_vp;
        dev_t bdev, dev = (dev_t)vp->v_rdev;
        int maj = major(dev);
        int error;

        /*
         * Don't allow open if fs is mounted -nodev.
         */
        if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) {
                return ENXIO;
        }

        switch (vp->v_type) {
        case VCHR:
                if ((u_int)maj >= (u_int)nchrdev) {
                        return ENXIO;
                }
                /* The write restrictions below are skipped for FSCRED. */
                if (cred != FSCRED && (ap->a_mode & FWRITE)) {
                        /*
                         * When running in very secure mode, do not allow
                         * opens for writing of any disk character devices.
                         */
                        if (securelevel >= 2 && isdisk(dev, VCHR)) {
                                return EPERM;
                        }

                        /* Never allow writing to /dev/mem or /dev/kmem */
                        if (iskmemdev(dev)) {
                                return EPERM;
                        }
                        /*
                         * When running in secure mode, do not allow opens for
                         * writing of character devices whose corresponding block
                         * devices are currently mounted.
                         */
                        if (securelevel >= 1) {
                                if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) {
                                        return error;
                                }
                        }
                }

                /* The driver open and the open-count update happen under the devsw lock. */
                devsw_lock(dev, S_IFCHR);
                error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

                if (error == 0) {
                        vp->v_specinfo->si_opencount++;
                }

                devsw_unlock(dev, S_IFCHR);

                /* One-time setup of throttling info for character disk devices. */
                if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
                        int     isssd = 0;
                        uint64_t throttle_mask = 0;
                        uint32_t devbsdunit = 0;

                        /* Note: this first ioctl is issued with a NULL context, unlike the ones that follow. */
                        if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
                                if (throttle_mask != 0 &&
                                    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
                                        /*
                                         * as a reasonable approximation, only use the lowest bit of the mask
                                         * to generate a disk unit number
                                         */
                                        devbsdunit = num_trailing_0(throttle_mask);

                                        vnode_lock(vp);

                                        vp->v_un.vu_specinfo->si_isssd = isssd;
                                        vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
                                        vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
                                        vp->v_un.vu_specinfo->si_throttleable = 1;
                                        vp->v_un.vu_specinfo->si_initted = 1;

                                        vnode_unlock(vp);
                                }
                        }
                        /*
                         * Either query failed or the mask was zero: still mark the
                         * specinfo as initialized so the queries are not repeated,
                         * leaving si_throttleable unset.
                         */
                        if (vp->v_un.vu_specinfo->si_initted == 0) {
                                vnode_lock(vp);
                                vp->v_un.vu_specinfo->si_initted = 1;
                                vnode_unlock(vp);
                        }
                }
                return error;

        case VBLK:
                if ((u_int)maj >= (u_int)nblkdev) {
                        return ENXIO;
                }
                /*
                 * When running in very secure mode, do not allow
                 * opens for writing of any disk block devices.
                 */
                if (securelevel >= 2 && cred != FSCRED &&
                    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) {
                        return EPERM;
                }
                /*
                 * Do not allow opens of block devices that are
                 * currently mounted.
                 */
                if ((error = vfs_mountedon(vp))) {
                        return error;
                }

                /* The driver open and the open-count update happen under the devsw lock. */
                devsw_lock(dev, S_IFBLK);
                error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
                if (!error) {
                        vp->v_specinfo->si_opencount++;
                }
                devsw_unlock(dev, S_IFBLK);

                if (!error) {
                        u_int64_t blkcnt;
                        u_int32_t blksize;
                        int setsize = 0;
                        u_int32_t size512 = 512;


                        /*
                         * Determine the device size in 512-byte units: remember the
                         * current block size, switch to 512, read the block count,
                         * then restore the original block size.
                         */
                        if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
                                /* Switch to 512 byte sectors (temporarily) */

                                if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
                                        /* Get the number of 512 byte physical blocks. */
                                        if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
                                                setsize = 1;
                                        }
                                }
                                /* If it doesn't set back, we can't recover */
                                if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) {
                                        /*
                                         * The open is reported as failed, but the driver open
                                         * and the si_opencount increment above are not undone here.
                                         */
                                        error = ENXIO;
                                }
                        }


                        vnode_lock(vp);
                        set_blocksize(vp, dev);

                        /*
                         * Cache the size in bytes of the block device for later
                         * use by spec_write().
                         */
                        if (setsize) {
                                vp->v_specdevsize = blkcnt * (u_int64_t)size512;
                        } else {
                                vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
                        }
                        vnode_unlock(vp);
                }
                return error;
        default:
                panic("spec_open type");
        }
        return 0;
}
 454
 455 /*
 456  * Vnode op for read
 457  */
 458 int
 459 spec_read(struct vnop_read_args *ap)
 460 {
 461         struct vnode *vp = ap->a_vp;
 462         struct uio *uio = ap->a_uio;
 463         struct buf *bp;
 464         daddr64_t bn, nextbn;
 465         long bsize, bscale;
 466         int devBlockSize = 0;
 467         int n, on;
 468         int error = 0;
 469         dev_t dev;
 470
 471 #if DIAGNOSTIC
 472         if (uio->uio_rw != UIO_READ) {
 473                 panic("spec_read mode");
 474         }
 475         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
 476                 panic("spec_read proc");
 477         }
 478 #endif
 479         if (uio_resid(uio) == 0) {
 480                 return 0;
 481         }
 482
 483         switch (vp->v_type) {
 484         case VCHR:
 485         {
 486                 struct _throttle_io_info_t *throttle_info = NULL;
 487                 int thread_throttle_level;
 488                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 489                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 490                         thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 491                 }
 492                 error = (*cdevsw[major(vp->v_rdev)].d_read)
 493                     (vp->v_rdev, uio, ap->a_ioflag);
 494
 495                 if (throttle_info) {
 496                         throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 497                 }
 498
 499                 return error;
 500         }
 501
 502         case VBLK:
 503                 if (uio->uio_offset < 0) {
 504                         return EINVAL;
 505                 }
 506
 507                 dev = vp->v_rdev;
 508
 509                 devBlockSize = vp->v_specsize;
 510
 511                 if (devBlockSize > PAGE_SIZE) {
 512                         return EINVAL;
 513                 }
 514
 515                 bscale = PAGE_SIZE / devBlockSize;
 516                 bsize = bscale * devBlockSize;
 517
 518                 do {
 519                         on = uio->uio_offset % bsize;
 520
 521                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));
 522
 523                         if (vp->v_speclastr + bscale == bn) {
 524                                 nextbn = bn + bscale;
 525                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
 526                                     (int *)&bsize, 1, NOCRED, &bp);
 527                         } else {
 528                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
 529                         }
 530
 531                         vnode_lock(vp);
 532                         vp->v_speclastr = bn;
 533                         vnode_unlock(vp);
 534
 535                         n = bsize - buf_resid(bp);
 536                         if ((on > n) || error) {
 537                                 if (!error) {
 538                                         error = EINVAL;
 539                                 }
 540                                 buf_brelse(bp);
 541                                 return error;
 542                         }
 543                         n = min((unsigned)(n  - on), uio_resid(uio));
 544
 545                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 546                         if (n + on == bsize) {
 547                                 buf_markaged(bp);
 548                         }
 549                         buf_brelse(bp);
 550                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 551                 return error;
 552
 553         default:
 554                 panic("spec_read type");
 555         }
 556         /* NOTREACHED */
 557
 558         return 0;
 559 }
 560
 561 /*
 562  * Vnode op for write
 563  */
 564 int
 565 spec_write(struct vnop_write_args *ap)
 566 {
 567         struct vnode *vp = ap->a_vp;
 568         struct uio *uio = ap->a_uio;
 569         struct buf *bp;
 570         daddr64_t bn;
 571         int bsize, blkmask, bscale;
 572         int io_sync;
 573         int devBlockSize = 0;
 574         int n, on;
 575         int error = 0;
 576         dev_t dev;
 577
 578 #if DIAGNOSTIC
 579         if (uio->uio_rw != UIO_WRITE) {
 580                 panic("spec_write mode");
 581         }
 582         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
 583                 panic("spec_write proc");
 584         }
 585 #endif
 586
 587         switch (vp->v_type) {
 588         case VCHR:
 589         {
 590                 struct _throttle_io_info_t *throttle_info = NULL;
 591                 int thread_throttle_level;
 592                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 593                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 594
 595                         thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 596
 597                         microuptime(&throttle_info->throttle_last_write_timestamp);
 598                 }
 599                 error = (*cdevsw[major(vp->v_rdev)].d_write)
 600                     (vp->v_rdev, uio, ap->a_ioflag);
 601
 602                 if (throttle_info) {
 603                         throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 604                 }
 605
 606                 return error;
 607         }
 608
 609         case VBLK:
 610                 if (uio_resid(uio) == 0) {
 611                         return 0;
 612                 }
 613                 if (uio->uio_offset < 0) {
 614                         return EINVAL;
 615                 }
 616
 617                 io_sync = (ap->a_ioflag & IO_SYNC);
 618
 619                 dev = (vp->v_rdev);
 620
 621                 devBlockSize = vp->v_specsize;
 622                 if (devBlockSize > PAGE_SIZE) {
 623                         return EINVAL;
 624                 }
 625
 626                 bscale = PAGE_SIZE / devBlockSize;
 627                 blkmask = bscale - 1;
 628                 bsize = bscale * devBlockSize;
 629
 630
 631                 do {
 632                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
 633                         on = uio->uio_offset % bsize;
 634
 635                         n = min((unsigned)(bsize - on), uio_resid(uio));
 636
 637                         /*
 638                          * Use buf_getblk() as an optimization IFF:
 639                          *
 640                          * 1)   We are reading exactly a block on a block
 641                          *      aligned boundary
 642                          * 2)   We know the size of the device from spec_open
 643                          * 3)   The read doesn't span the end of the device
 644                          *
 645                          * Otherwise, we fall back on buf_bread().
 646                          */
 647                         if (n == bsize &&
 648                             vp->v_specdevsize != (u_int64_t)0 &&
 649                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
 650                                 /* reduce the size of the read to what is there */
 651                                 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
 652                         }
 653
 654                         if (n == bsize) {
 655                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
 656                         } else {
 657                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
 658                         }
 659
 660                         /* Translate downstream error for upstream, if needed */
 661                         if (!error) {
 662                                 error = (int)buf_error(bp);
 663                         }
 664                         if (error) {
 665                                 buf_brelse(bp);
 666                                 return error;
 667                         }
 668                         n = min(n, bsize - buf_resid(bp));
 669
 670                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 671                         if (error) {
 672                                 buf_brelse(bp);
 673                                 return error;
 674                         }
 675                         buf_markaged(bp);
 676
 677                         if (io_sync) {
 678                                 error = buf_bwrite(bp);
 679                         } else {
 680                                 if ((n + on) == bsize) {
 681                                         error = buf_bawrite(bp);
 682                                 } else {
 683                                         error = buf_bdwrite(bp);
 684                                 }
 685                         }
 686                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 687                 return error;
 688
 689         default:
 690                 panic("spec_write type");
 691         }
 692         /* NOTREACHED */
 693
 694         return 0;
 695 }
 696
 697 /*
 698  * Device ioctl operation.
 699  */
 700 int
 701 spec_ioctl(struct vnop_ioctl_args *ap)
 702 {
 703         proc_t p = vfs_context_proc(ap->a_context);
 704         dev_t dev = ap->a_vp->v_rdev;
 705         int     retval = 0;
 706
 707         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
 708             dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
 709
 710         switch (ap->a_vp->v_type) {
 711         case VCHR:
 712                 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 713                     ap->a_fflag, p);
 714                 break;
 715
 716         case VBLK:
 717                 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
 718                 if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) {
 719                         ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
 720                 }
 721                 break;
 722
 723         default:
 724                 panic("spec_ioctl");
 725                 /* NOTREACHED */
 726         }
 727         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
 728             dev, ap->a_command, ap->a_fflag, retval, 0);
 729
 730         return retval;
 731 }
 732
 733 int
 734 spec_select(struct vnop_select_args *ap)
 735 {
 736         proc_t p = vfs_context_proc(ap->a_context);
 737         dev_t dev;
 738
 739         switch (ap->a_vp->v_type) {
 740         default:
 741                 return 1;             /* XXX */
 742
 743         case VCHR:
 744                 dev = ap->a_vp->v_rdev;
 745                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
 746         }
 747 }
 748
 749 static int filt_specattach(struct knote *kn, struct kevent_internal_s *kev);
 750
 751 int
 752 spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev)
 753 {
 754         dev_t dev;
 755
 756         assert(vnode_ischr(vp));
 757
 758         dev = vnode_specrdev(vp);
 759
 760 #if NETWORKING
 761         /*
 762          * Try a bpf device, as defined in bsd/net/bpf.c
 763          * If it doesn't error out the attach, then it
 764          * claimed it. Otherwise, fall through and try
 765          * other attaches.
 766          */
 767         int32_t tmp_flags = kn->kn_flags;
 768         int64_t tmp_data = kn->kn_data;
 769         int res;
 770
 771         res = bpfkqfilter(dev, kn);
 772         if ((kn->kn_flags & EV_ERROR) == 0) {
 773                 return res;
 774         }
 775         kn->kn_flags = tmp_flags;
 776         kn->kn_data = tmp_data;
 777 #endif
 778
 779         if (major(dev) > nchrdev) {
 780                 knote_set_error(kn, ENXIO);
 781                 return 0;
 782         }
 783
 784         kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
 785         kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);
 786
 787         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
 788                 kn->kn_filtid = EVFILTID_PTSD;
 789                 return ptsd_kqfilter(dev, kn);
 790         } else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
 791                 kn->kn_filtid = EVFILTID_PTMX;
 792                 return ptmx_kqfilter(dev, kn);
 793         } else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
 794                 /*
 795                  * TTYs from drivers that use struct ttys use their own filter
 796                  * routines.  The PTC driver doesn't use the tty for character
 797                  * counts, so it must go through the select fallback.
 798                  */
 799                 kn->kn_filtid = EVFILTID_TTY;
 800                 return knote_fops(kn)->f_attach(kn, kev);
 801         }
 802
 803         /* Try to attach to other char special devices */
 804         return filt_specattach(kn, kev);
 805 }
 806
 807 /*
 808  * Synch buffers associated with a block device
 809  */
 810 int
 811 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
 812 {
 813         if (vp->v_type == VCHR) {
 814                 return 0;
 815         }
 816         /*
 817          * Flush all dirty buffers associated with a block device.
 818          */
 819         buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
 820
 821         return 0;
 822 }
 823
 824 int
 825 spec_fsync(struct vnop_fsync_args *ap)
 826 {
 827         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
 828 }
 829
 830
 831 /*
 832  * Just call the device strategy routine
 833  */
 834 void throttle_init(void);
 835
 836
 837 #if 0
 838 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
 839         do {                                                    \
 840                if ((debug_info)->alloc)                           \
 841                printf("%s: "format, __FUNCTION__, ## args);     \
 842        } while(0)
 843
 844 #else
 845 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
 846 #endif
 847
 848
/*
 * Read/write integer sysctls under the "debug" node exposing the
 * low-priority I/O throttling tunables:
 *  - the per-tier throttle windows (throttle_windows_msecs[])
 *  - the per-tier I/O periods (throttle_io_period_msecs[])
 *  - the per-tier I/O periods kept in throttle_io_period_ssd_msecs[]
 *  - the global lowpri_throttle_enabled switch
 * NOTE(review): going by the "_msecs" names the window/period values are
 * in milliseconds - confirm against the array definitions.
 */
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
 862
 863
/*
 * Lock group, lock attribute and lock group attribute used for every
 * throttle info mutex (throttle_lock); all three are allocated once in
 * throttle_init().
 */
static lck_grp_t        *throttle_lock_grp;
static lck_attr_t       *throttle_lock_attr;
static lck_grp_attr_t   *throttle_lock_grp_attr;
 867
 868
 869 /*
 870  * throttled I/O helper function
 871  * convert the index of the lowest set bit to a device index
 872  */
 873 int
 874 num_trailing_0(uint64_t n)
 875 {
 876         /*
 877          * since in most cases the number of trailing 0s is very small,
 878          * we simply counting sequentially from the lowest bit
 879          */
 880         if (n == 0) {
 881                 return sizeof(n) * 8;
 882         }
 883         int count = 0;
 884         while (!ISSET(n, 1)) {
 885                 n >>= 1;
 886                 ++count;
 887         }
 888         return count;
 889 }
 890
 891
 892 /*
 893  * Release the reference and if the item was allocated and this is the last
 894  * reference then free it.
 895  *
 896  * This routine always returns the old value.
 897  */
 898 static int
 899 throttle_info_rel(struct _throttle_io_info_t *info)
 900 {
 901         SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
 902
 903         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 904             info, (int)(oldValue - 1), info );
 905
 906         /* The reference count just went negative, very bad */
 907         if (oldValue == 0) {
 908                 panic("throttle info ref cnt went negative!");
 909         }
 910
 911         /*
 912          * Once reference count is zero, no one else should be able to take a
 913          * reference
 914          */
 915         if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
 916                 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
 917
 918                 lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
 919                 FREE(info, M_TEMP);
 920         }
 921         return oldValue;
 922 }
 923
 924
 925 /*
 926  * Just take a reference on the throttle info structure.
 927  *
 928  * This routine always returns the old value.
 929  */
 930 static SInt32
 931 throttle_info_ref(struct _throttle_io_info_t *info)
 932 {
 933         SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
 934
 935         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 936             info, (int)(oldValue - 1), info );
 937         /* Allocated items should never have a reference of zero */
 938         if (info->throttle_alloc && (oldValue == 0)) {
 939                 panic("Taking a reference without calling create throttle info!\n");
 940         }
 941
 942         return oldValue;
 943 }
 944
/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 * to free it
 *
 * info            - throttle state whose timer is being (re)evaluated
 * update_io_count - when TRUE a new I/O period is opened: the I/O count
 *                   snapshot, the period number, the period start
 *                   timestamps for levels 'wakelevel' down to
 *                   THROTTLE_LEVEL_THROTTLED and the minimum timer
 *                   deadline are all reset relative to the current uptime
 * wakelevel       - highest level whose period start timestamp is reset;
 *                   only looked at when update_io_count is TRUE
 *
 * Returns the first level (scanning upwards from THROTTLE_LEVEL_START)
 * found to be holding back waiters queued on a higher-numbered level, or
 * THROTTLE_LEVEL_END when no level is currently throttling anyone.
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
{
        struct timeval  elapsed;
        struct timeval  now;
        struct timeval  period;
        uint64_t        elapsed_msecs;
        int             throttle_level;
        int             level;
        int             msecs;
        boolean_t       throttled = FALSE;
        boolean_t       need_timer = FALSE;

        microuptime(&now);

        if (update_io_count == TRUE) {
                /* start a fresh I/O period */
                info->throttle_io_count_begin = info->throttle_io_count;
                info->throttle_io_period_num++;

                while (wakelevel >= THROTTLE_LEVEL_THROTTLED) {
                        info->throttle_start_IO_period_timestamp[wakelevel--] = now;
                }

                /*
                 * the timer must not fire again before one
                 * THROTTLE_LEVEL_THROTTLED period has gone by
                 */
                info->throttle_min_timer_deadline = now;

                msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
                period.tv_sec = msecs / 1000;
                period.tv_usec = (msecs % 1000) * 1000;

                timevaladd(&info->throttle_min_timer_deadline, &period);
        }
        /*
         * for each level, work out how long ago its window started and
         * check whether the nearest higher-numbered level with waiters is
         * still being held back by it
         */
        for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
                elapsed = now;
                timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
                elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

                for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
                        if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
                                if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
                                        /*
                                         * we had an I/O occur at a higher priority tier within
                                         * this tier's throttle window
                                         */
                                        throttled = TRUE;
                                }
                                /*
                                 * we assume that the windows are the same or longer
                                 * as we drop through the throttling tiers...  thus
                                 * we can stop looking once we run into a tier with
                                 * threads to schedule regardless of whether it's
                                 * still in its throttling window or not
                                 */
                                break;
                        }
                }
                if (throttled == TRUE) {
                        break;
                }
        }
        if (throttled == TRUE) {
                uint64_t        deadline = 0;
                struct timeval  target;
                struct timeval  min_target;

                /*
                 * we've got at least one tier still in a throttled window
                 * so we need a timer running... compute the next deadline
                 * and schedule it
                 *
                 * (throttled is only set after a non-empty list above
                 * throttle_level was found, so the loop below assigns
                 * min_target at least once before it is read)
                 */
                for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
                        if (TAILQ_EMPTY(&info->throttle_uthlist[level])) {
                                continue;
                        }

                        /* this level's period ends at its start stamp plus its period */
                        target = info->throttle_start_IO_period_timestamp[level];

                        msecs = info->throttle_io_periods[level];
                        period.tv_sec = msecs / 1000;
                        period.tv_usec = (msecs % 1000) * 1000;

                        timevaladd(&target, &period);

                        /* keep the earliest period end seen so far */
                        if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
                                min_target = target;
                                need_timer = TRUE;
                        }
                }
                /*
                 * if the minimum timer deadline is still in the future and
                 * later than the target just computed, push the target out
                 * to that deadline
                 */
                if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
                        if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) {
                                min_target = info->throttle_min_timer_deadline;
                        }
                }

                if (info->throttle_timer_active) {
                        if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
                                /*
                                 * couldn't kill the timer because it's already
                                 * been dispatched, so don't try to start a new
                                 * one... once we drop the lock, the timer will
                                 * proceed and eventually re-run this function
                                 */
                                need_timer = FALSE;
                        } else {
                                info->throttle_timer_active = 0;
                        }
                }
                if (need_timer == TRUE) {
                        /*
                         * This is defined as an int (32-bit) rather than a 64-bit
                         * value because it would need a really big period in the
                         * order of ~500 days to overflow this. So, we let this be
                         * 32-bit which allows us to use the clock_interval_to_deadline()
                         * routine.
                         */
                        int     target_msecs;

                        if (info->throttle_timer_ref == 0) {
                                /*
                                 * take a reference for the timer
                                 */
                                throttle_info_ref(info);

                                info->throttle_timer_ref = 1;
                        }
                        /* milliseconds from now until the chosen target */
                        elapsed = min_target;
                        timevalsub(&elapsed, &now);
                        target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

                        if (target_msecs <= 0) {
                                /*
                                 * we may have computed a deadline slightly in the past
                                 * due to various factors... if so, just set the timer
                                 * to go off in the near future (we don't need to be precise)
                                 */
                                target_msecs = 1;
                        }
                        clock_interval_to_deadline(target_msecs, 1000000, &deadline);

                        thread_call_enter_delayed(info->throttle_timer_call, deadline);
                        info->throttle_timer_active = 1;
                }
        }
        return throttle_level;
}
1099
1100
/*
 * Timer callback for a throttle info structure; it is the function handed
 * to thread_call_allocate() together with 'info' as its parameter.
 *
 * With the throttle_lock held it:
 *  - marks the timer as no longer active
 *  - if the THROTTLE_LEVEL_THROTTLED I/O period has run out, walks the
 *    levels downwards starting at throttle_next_wake_level (wrapping from
 *    THROTTLE_LEVEL_START back to THROTTLE_LEVEL_END) looking for one
 *    whose own I/O period has expired and that has a thread queued
 *  - takes the first queued thread off that level, if one was found
 *  - calls throttle_timer_start() to re-evaluate and possibly rearm the
 *    timer
 *  - wakes the thread it dequeued, then dequeues and wakes every thread
 *    queued on levels THROTTLE_LEVEL_THROTTLED up to the level returned
 *    by throttle_timer_start()
 *
 * If the timer was not rearmed and a timer reference is outstanding, that
 * reference is dropped after the lock has been released.
 */
static void
throttle_timer(struct _throttle_io_info_t *info)
{
        uthread_t       ut, utlist;
        struct timeval  elapsed;
        struct timeval  now;
        uint64_t        elapsed_msecs;
        int             throttle_level;
        int             level;
        int             wake_level;
        caddr_t         wake_address = NULL;
        boolean_t       update_io_count = FALSE;
        boolean_t       need_wakeup = FALSE;
        boolean_t       need_release = FALSE;

        ut = NULL;
        lck_mtx_lock(&info->throttle_lock);

        info->throttle_timer_active = 0;
        microuptime(&now);

        /* time spent so far in the current THROTTLE_LEVEL_THROTTLED I/O period */
        elapsed = now;
        timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
        elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

        if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
                wake_level = info->throttle_next_wake_level;

                /*
                 * 'level' only bounds the number of candidates examined;
                 * the level actually inspected on each pass is wake_level
                 */
                for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
                        elapsed = now;
                        timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
                        elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

                        if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
                                /*
                                 * we're closing out the current IO period...
                                 * if we have a waiting thread, wake it up
                                 * after we have reset the I/O window info
                                 */
                                need_wakeup = TRUE;
                                update_io_count = TRUE;

                                /* the next expiry starts its search one level down */
                                info->throttle_next_wake_level = wake_level - 1;

                                if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) {
                                        info->throttle_next_wake_level = THROTTLE_LEVEL_END;
                                }

                                break;
                        }
                        /* try the next level down, wrapping back to the top */
                        wake_level--;

                        if (wake_level == THROTTLE_LEVEL_START) {
                                wake_level = THROTTLE_LEVEL_END;
                        }
                }
        }
        if (need_wakeup == TRUE) {
                if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
                        /* dequeue the first waiter now; it is woken further down */
                        ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
                        TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
                        ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
                        ut->uu_is_throttled = false;

                        wake_address = (caddr_t)&ut->uu_on_throttlelist;
                }
        } else {
                /* no level picked: give throttle_timer_start() a defined wake level */
                wake_level = THROTTLE_LEVEL_START;
        }

        throttle_level = throttle_timer_start(info, update_io_count, wake_level);

        if (wake_address != NULL) {
                wakeup(wake_address);
        }

        /* release every thread queued on a level up to and including throttle_level */
        for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
                TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
                        TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
                        ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
                        ut->uu_is_throttled = false;

                        wakeup(&ut->uu_on_throttlelist);
                }
        }
        /* the timer was not rearmed, so its reference is no longer needed */
        if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
                info->throttle_timer_ref = 0;
                need_release = TRUE;
        }
        lck_mtx_unlock(&info->throttle_lock);

        /* drop the reference only after the lock has been released */
        if (need_release == TRUE) {
                throttle_info_rel(info);
        }
}
1196
1197
1198 static int
1199 throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
1200 {
1201         boolean_t start_timer = FALSE;
1202         int level = THROTTLE_LEVEL_START;
1203
1204         if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1205                 info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1206                 start_timer = TRUE;
1207         }
1208
1209         if (insert_tail == TRUE) {
1210                 TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1211         } else {
1212                 TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1213         }
1214
1215         ut->uu_on_throttlelist = mylevel;
1216
1217         if (start_timer == TRUE) {
1218                 /* we may need to start or rearm the timer */
1219                 level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1220
1221                 if (level == THROTTLE_LEVEL_END) {
1222                         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1223                                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1224
1225                                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1226                         }
1227                 }
1228         }
1229         return level;
1230 }
1231
1232 static void
1233 throttle_init_throttle_window(void)
1234 {
1235         int throttle_window_size;
1236
1237         /*
1238          * The hierarchy of throttle window values is as follows:
1239          * - Global defaults
1240          * - Device tree properties
1241          * - Boot-args
1242          * All values are specified in msecs.
1243          */
1244
1245         /* Override global values with device-tree properties */
1246         if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1247                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1248         }
1249
1250         if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1251                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1252         }
1253
1254         if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1255                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1256         }
1257
1258         /* Override with boot-args */
1259         if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1260                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1261         }
1262
1263         if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1264                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1265         }
1266
1267         if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1268                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1269         }
1270 }
1271
1272 static void
1273 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1274 {
1275         int throttle_period_size;
1276
1277         /*
1278          * The hierarchy of throttle period values is as follows:
1279          * - Global defaults
1280          * - Device tree properties
1281          * - Boot-args
1282          * All values are specified in msecs.
1283          */
1284
1285         /* Assign global defaults */
1286         if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0)) {
1287                 info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
1288         } else {
1289                 info->throttle_io_periods = &throttle_io_period_msecs[0];
1290         }
1291
1292         /* Override global values with device-tree properties */
1293         if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1294                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1295         }
1296
1297         if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1298                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1299         }
1300
1301         if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1302                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1303         }
1304
1305         /* Override with boot-args */
1306         if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1307                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1308         }
1309
1310         if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1311                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1312         }
1313
1314         if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1315                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1316         }
1317 }
1318
#if CONFIG_IOSCHED
/* I/O reprioritization setup routine, defined outside this file */
extern  void vm_io_reprioritize_init(void);
/*
 * Non-zero when I/O scheduling is enabled; on by default and replaced by
 * the value of the "iosched" boot-arg in throttle_init() when present
 */
int     iosched_enabled = 1;
#endif
1323
1324 void
1325 throttle_init(void)
1326 {
1327         struct _throttle_io_info_t *info;
1328         int     i;
1329         int     level;
1330 #if CONFIG_IOSCHED
1331         int     iosched;
1332 #endif
1333         /*
1334          * allocate lock group attribute and group
1335          */
1336         throttle_lock_grp_attr = lck_grp_attr_alloc_init();
1337         throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
1338
1339         /* Update throttle parameters based on device tree configuration */
1340         throttle_init_throttle_window();
1341
1342         /*
1343          * allocate the lock attribute
1344          */
1345         throttle_lock_attr = lck_attr_alloc_init();
1346
1347         for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1348                 info = &_throttle_io_info[i];
1349
1350                 lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1351                 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1352
1353                 for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1354                         TAILQ_INIT(&info->throttle_uthlist[level]);
1355                         info->throttle_last_IO_pid[level] = 0;
1356                         info->throttle_inflight_count[level] = 0;
1357                 }
1358                 info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1359                 info->throttle_disabled = 0;
1360                 info->throttle_is_fusion_with_priority = 0;
1361         }
1362 #if CONFIG_IOSCHED
1363         if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1364                 iosched_enabled = iosched;
1365         }
1366         if (iosched_enabled) {
1367                 /* Initialize I/O Reprioritization mechanism */
1368                 vm_io_reprioritize_init();
1369         }
1370 #endif
1371 }
1372
1373 void
1374 sys_override_io_throttle(boolean_t enable_override)
1375 {
1376         if (enable_override) {
1377                 lowpri_throttle_enabled = 0;
1378         } else {
1379                 lowpri_throttle_enabled = 1;
1380         }
1381 }
1382
1383 int rethrottle_wakeups = 0;
1384
1385 /*
1386  * the uu_rethrottle_lock is used to synchronize this function
1387  * with "throttle_lowpri_io" which is where a throttled thread
1388  * will block... that function will grab this lock before beginning
1389  * it's decision making process concerning the need to block, and
1390  * hold it through the assert_wait.  When that thread is awakened
1391  * for any reason (timer or rethrottle), it will reacquire the
1392  * uu_rethrottle_lock before determining if it really is ok for
1393  * it to now run.  This is the point at which the thread could
1394  * enter a different throttling queue and reblock or return from
1395  * the throttle w/o having waited out it's entire throttle if
1396  * the rethrottle has now moved it out of any currently
1397  * active throttle window.
1398  *
1399  *
1400  * NOTES:
1401  * 1 - This may be called with the task lock held.
1402  * 2 - This may be called with preemption and interrupts disabled
1403  *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1404  * 3 - This cannot safely dereference uu_throttle_info, as it may
1405  *     get deallocated out from under us
1406  */
1407
1408 void
1409 rethrottle_thread(uthread_t ut)
1410 {
1411         /*
1412          * If uthread doesn't have throttle state, then there's no chance
1413          * of it needing a rethrottle.
1414          */
1415         if (ut->uu_throttle_info == NULL) {
1416                 return;
1417         }
1418
1419         boolean_t s = ml_set_interrupts_enabled(FALSE);
1420         lck_spin_lock(&ut->uu_rethrottle_lock);
1421
1422         if (!ut->uu_is_throttled) {
1423                 ut->uu_was_rethrottled = true;
1424         } else {
1425                 int my_new_level = throttle_get_thread_throttle_level(ut);
1426
1427                 if (my_new_level != ut->uu_on_throttlelist) {
1428                         /*
1429                          * ut is currently blocked (as indicated by
1430                          * ut->uu_is_throttled == true)
1431                          * and we're changing it's throttle level, so
1432                          * we need to wake it up.
1433                          */
1434                         ut->uu_is_throttled = false;
1435                         wakeup(&ut->uu_on_throttlelist);
1436
1437                         rethrottle_wakeups++;
1438                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0);
1439                 }
1440         }
1441         lck_spin_unlock(&ut->uu_rethrottle_lock);
1442         ml_set_interrupts_enabled(s);
1443 }
1444
1445
1446 /*
1447  * KPI routine
1448  *
1449  * Create and take a reference on a throttle info structure and return a
1450  * pointer for the file system to use when calling throttle_info_update.
1451  * Calling file system must have a matching release for every create.
1452  */
1453 void *
1454 throttle_info_create(void)
1455 {
1456         struct _throttle_io_info_t *info;
1457         int     level;
1458
1459         MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
1460         /* Should never happen but just in case */
1461         if (info == NULL) {
1462                 return NULL;
1463         }
1464         /* Mark that this one was allocated and needs to be freed */
1465         DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1466         info->throttle_alloc = TRUE;
1467
1468         lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1469         info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1470
1471         for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1472                 TAILQ_INIT(&info->throttle_uthlist[level]);
1473         }
1474         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1475
1476         /* Take a reference */
1477         OSIncrementAtomic(&info->throttle_refcnt);
1478         return info;
1479 }
1480
1481 /*
1482  * KPI routine
1483  *
1484  * Release the throttle info pointer if all the reference are gone. Should be
1485  * called to release reference taken by throttle_info_create
1486  */
1487 void
1488 throttle_info_release(void *throttle_info)
1489 {
1490         DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1491             (struct _throttle_io_info_t *)throttle_info,
1492             (struct _throttle_io_info_t *)throttle_info);
1493         if (throttle_info) { /* Just to be careful */
1494                 throttle_info_rel(throttle_info);
1495         }
1496 }
1497
1498 /*
1499  * KPI routine
1500  *
1501  * File Systems that create an info structure, need to call this routine in
1502  * their mount routine (used by cluster code). File Systems that call this in
1503  * their mount routines must call throttle_info_mount_rel in their unmount
1504  * routines.
1505  */
1506 void
1507 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1508 {
1509         if ((throttle_info == NULL) || (mp == NULL)) {
1510                 return;
1511         }
1512         throttle_info_ref(throttle_info);
1513
1514         /*
1515          * We already have a reference release it before adding the new one
1516          */
1517         if (mp->mnt_throttle_info) {
1518                 throttle_info_rel(mp->mnt_throttle_info);
1519         }
1520         mp->mnt_throttle_info = throttle_info;
1521 }
1522
1523 /*
1524  * Private KPI routine
1525  *
1526  * return a handle for accessing throttle_info given a throttle_mask.  The
1527  * handle must be released by throttle_info_rel_by_mask
1528  */
1529 int
1530 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1531 {
1532         int     dev_index;
1533         struct _throttle_io_info_t *info;
1534
1535         if (throttle_info_handle == NULL) {
1536                 return EINVAL;
1537         }
1538
1539         dev_index = num_trailing_0(throttle_mask);
1540         info = &_throttle_io_info[dev_index];
1541         throttle_info_ref(info);
1542         *(struct _throttle_io_info_t**)throttle_info_handle = info;
1543
1544         return 0;
1545 }
1546
1547 /*
1548  * Private KPI routine
1549  *
1550  * release the handle obtained by throttle_info_ref_by_mask
1551  */
1552 void
1553 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1554 {
1555         /*
1556          * for now the handle is just a pointer to _throttle_io_info_t
1557          */
1558         throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1559 }
1560
1561 /*
1562  * KPI routine
1563  *
1564  * File Systems that throttle_info_mount_ref, must call this routine in their
1565  * umount routine.
1566  */
1567 void
1568 throttle_info_mount_rel(mount_t mp)
1569 {
1570         if (mp->mnt_throttle_info) {
1571                 throttle_info_rel(mp->mnt_throttle_info);
1572         }
1573         mp->mnt_throttle_info = NULL;
1574 }
1575
1576 /*
1577  * Reset throttling periods for the given mount point
1578  *
1579  * private interface used by disk conditioner to reset
1580  * throttling periods when 'is_ssd' status changes
1581  */
1582 void
1583 throttle_info_mount_reset_period(mount_t mp, int isssd)
1584 {
1585         struct _throttle_io_info_t *info;
1586
1587         if (mp == NULL) {
1588                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1589         } else if (mp->mnt_throttle_info == NULL) {
1590                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1591         } else {
1592                 info = mp->mnt_throttle_info;
1593         }
1594
1595         throttle_init_throttle_period(info, isssd);
1596 }
1597
1598 void
1599 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1600 {
1601         struct _throttle_io_info_t *info;
1602
1603         if (mp == NULL) {
1604                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1605         } else if (mp->mnt_throttle_info == NULL) {
1606                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1607         } else {
1608                 info = mp->mnt_throttle_info;
1609         }
1610
1611         *tv = info->throttle_last_write_timestamp;
1612 }
1613
1614 void
1615 update_last_io_time(mount_t mp)
1616 {
1617         struct _throttle_io_info_t *info;
1618
1619         if (mp == NULL) {
1620                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1621         } else if (mp->mnt_throttle_info == NULL) {
1622                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1623         } else {
1624                 info = mp->mnt_throttle_info;
1625         }
1626
1627         microuptime(&info->throttle_last_write_timestamp);
1628         if (mp != NULL) {
1629                 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1630         }
1631 }
1632
1633 int
1634 throttle_get_io_policy(uthread_t *ut)
1635 {
1636         if (ut != NULL) {
1637                 *ut = get_bsdthread_info(current_thread());
1638         }
1639
1640         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1641 }
1642
1643 int
1644 throttle_get_passive_io_policy(uthread_t *ut)
1645 {
1646         if (ut != NULL) {
1647                 *ut = get_bsdthread_info(current_thread());
1648         }
1649
1650         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO);
1651 }
1652
1653
1654 static int
1655 throttle_get_thread_throttle_level(uthread_t ut)
1656 {
1657         uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1658         int io_tier = throttle_get_io_policy(ut_p);
1659
1660         return throttle_get_thread_throttle_level_internal(ut, io_tier);
1661 }
1662
1663 /*
1664  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1665  */
1666 static int
1667 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier)
1668 {
1669         int thread_throttle_level = io_tier;
1670         int user_idle_level;
1671
1672         assert(ut != NULL);
1673
1674         /* Bootcache misses should always be throttled */
1675         if (ut->uu_throttle_bc) {
1676                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
1677         }
1678
1679         /*
1680          * Issue tier3 I/O as tier2 when the user is idle
1681          * to allow maintenance tasks to make more progress.
1682          *
1683          * Assume any positive idle level is enough... for now it's
1684          * only ever 0 or 128 but this is not defined anywhere.
1685          */
1686         if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1687                 user_idle_level = timer_get_user_idle_level();
1688                 if (user_idle_level > 0) {
1689                         thread_throttle_level--;
1690                 }
1691         }
1692
1693         return thread_throttle_level;
1694 }
1695
1696 /*
1697  * I/O will be throttled if either of the following are true:
1698  *   - Higher tiers have in-flight I/O
1699  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1700  *
1701  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1702  */
1703 static int
1704 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1705 {
1706         struct _throttle_io_info_t *info = throttle_info;
1707         struct timeval elapsed;
1708         struct timeval now;
1709         uint64_t elapsed_msecs;
1710         int     thread_throttle_level;
1711         int     throttle_level;
1712
1713         if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) {
1714                 return THROTTLE_DISENGAGED;
1715         }
1716
1717         microuptime(&now);
1718
1719         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1720                 if (info->throttle_inflight_count[throttle_level]) {
1721                         break;
1722                 }
1723                 elapsed = now;
1724                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1725                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1726
1727                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
1728                         break;
1729                 }
1730         }
1731         if (throttle_level >= thread_throttle_level) {
1732                 /*
1733                  * we're beyond all of the throttle windows
1734                  * that affect the throttle level of this thread,
1735                  * so go ahead and treat as normal I/O
1736                  */
1737                 return THROTTLE_DISENGAGED;
1738         }
1739         if (mylevel) {
1740                 *mylevel = thread_throttle_level;
1741         }
1742         if (throttling_level) {
1743                 *throttling_level = throttle_level;
1744         }
1745
1746         if (info->throttle_io_count != info->throttle_io_count_begin) {
1747                 /*
1748                  * we've already issued at least one throttleable I/O
1749                  * in the current I/O window, so avoid issuing another one
1750                  */
1751                 return THROTTLE_NOW;
1752         }
1753         /*
1754          * we're in the throttle window, so
1755          * cut the I/O size back
1756          */
1757         return THROTTLE_ENGAGED;
1758 }
1759
1760 /*
1761  * If we have a mount point and it has a throttle info pointer then
1762  * use it to do the check, otherwise use the device unit number to find
1763  * the correct throttle info array element.
1764  */
1765 int
1766 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1767 {
1768         struct _throttle_io_info_t      *info;
1769
1770         /*
1771          * Should we just return zero if no mount point
1772          */
1773         if (mp == NULL) {
1774                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1775         } else if (mp->mnt_throttle_info == NULL) {
1776                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1777         } else {
1778                 info = mp->mnt_throttle_info;
1779         }
1780
1781         if (info->throttle_is_fusion_with_priority) {
1782                 uthread_t ut = get_bsdthread_info(current_thread());
1783                 if (ut->uu_lowpri_window == 0) {
1784                         return THROTTLE_DISENGAGED;
1785                 }
1786         }
1787
1788         if (info->throttle_disabled) {
1789                 return THROTTLE_DISENGAGED;
1790         } else {
1791                 return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1792         }
1793 }
1794
1795 /*
1796  * Routine to increment I/O throttling counters maintained in the proc
1797  */
1798
1799 static void
1800 throttle_update_proc_stats(pid_t throttling_pid, int count)
1801 {
1802         proc_t throttling_proc;
1803         proc_t throttled_proc = current_proc();
1804
1805         /* The throttled_proc is always the current proc; so we are not concerned with refs */
1806         OSAddAtomic64(count, &(throttled_proc->was_throttled));
1807
1808         /* The throttling pid might have exited by now */
1809         throttling_proc = proc_find(throttling_pid);
1810         if (throttling_proc != PROC_NULL) {
1811                 OSAddAtomic64(count, &(throttling_proc->did_throttle));
1812                 proc_rele(throttling_proc);
1813         }
1814 }
1815
1816 /*
1817  * Block until woken up by the throttle timer or by a rethrottle call.
1818  * As long as we hold the throttle_lock while querying the throttle tier, we're
1819  * safe against seeing an old throttle tier after a rethrottle.
1820  */
1821 uint32_t
1822 throttle_lowpri_io(int sleep_amount)
1823 {
1824         uthread_t ut;
1825         struct _throttle_io_info_t *info;
1826         int     throttle_type = 0;
1827         int     mylevel = 0;
1828         int     throttling_level = THROTTLE_LEVEL_NONE;
1829         int     sleep_cnt = 0;
1830         uint32_t  throttle_io_period_num = 0;
1831         boolean_t insert_tail = TRUE;
1832         boolean_t s;
1833
1834         ut = get_bsdthread_info(current_thread());
1835
1836         if (ut->uu_lowpri_window == 0) {
1837                 return 0;
1838         }
1839
1840         info = ut->uu_throttle_info;
1841
1842         if (info == NULL) {
1843                 ut->uu_throttle_bc = false;
1844                 ut->uu_lowpri_window = 0;
1845                 return 0;
1846         }
1847         lck_mtx_lock(&info->throttle_lock);
1848         assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
1849
1850         if (sleep_amount == 0) {
1851                 goto done;
1852         }
1853
1854         if (sleep_amount == 1 && !ut->uu_throttle_bc) {
1855                 sleep_amount = 0;
1856         }
1857
1858         throttle_io_period_num = info->throttle_io_period_num;
1859
1860         ut->uu_was_rethrottled = false;
1861
1862         while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {
1863                 if (throttle_type == THROTTLE_ENGAGED) {
1864                         if (sleep_amount == 0) {
1865                                 break;
1866                         }
1867                         if (info->throttle_io_period_num < throttle_io_period_num) {
1868                                 break;
1869                         }
1870                         if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
1871                                 break;
1872                         }
1873                 }
1874                 /*
1875                  * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
1876                  * then puts us back to the original level before we get a chance to run
1877                  */
1878                 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
1879                         /*
1880                          * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
1881                          * and we've changed our throttling level, so pull ourselves off of the appropriate list
1882                          * and make sure we get put on the tail of the new list since we're starting anew w/r to
1883                          * the throttling engine
1884                          */
1885                         TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1886                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1887                         insert_tail = TRUE;
1888                 }
1889                 if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
1890                         if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) {
1891                                 goto done;
1892                         }
1893                 }
1894                 assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
1895
1896                 s = ml_set_interrupts_enabled(FALSE);
1897                 lck_spin_lock(&ut->uu_rethrottle_lock);
1898
1899                 /*
1900                  * this is the critical section w/r to our interaction
1901                  * with "rethrottle_thread"
1902                  */
1903                 if (ut->uu_was_rethrottled) {
1904                         lck_spin_unlock(&ut->uu_rethrottle_lock);
1905                         ml_set_interrupts_enabled(s);
1906                         lck_mtx_yield(&info->throttle_lock);
1907
1908                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0);
1909
1910                         ut->uu_was_rethrottled = false;
1911                         continue;
1912                 }
1913                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
1914                     info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);
1915
1916                 if (sleep_cnt == 0) {
1917                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
1918                             throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
1919                         throttled_count[mylevel]++;
1920                 }
1921                 ut->uu_wmesg = "throttle_lowpri_io";
1922
1923                 assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
1924
1925                 ut->uu_is_throttled = true;
1926                 lck_spin_unlock(&ut->uu_rethrottle_lock);
1927                 ml_set_interrupts_enabled(s);
1928
1929                 lck_mtx_unlock(&info->throttle_lock);
1930
1931                 thread_block(THREAD_CONTINUE_NULL);
1932
1933                 ut->uu_wmesg = NULL;
1934
1935                 ut->uu_is_throttled = false;
1936                 ut->uu_was_rethrottled = false;
1937
1938                 lck_mtx_lock(&info->throttle_lock);
1939
1940                 sleep_cnt++;
1941
1942                 if (sleep_amount == 0) {
1943                         insert_tail = FALSE;
1944                 } else if (info->throttle_io_period_num < throttle_io_period_num ||
1945                     (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
1946                         insert_tail = FALSE;
1947                         sleep_amount = 0;
1948                 }
1949         }
1950 done:
1951         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1952                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1953                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1954         }
1955         lck_mtx_unlock(&info->throttle_lock);
1956
1957         if (sleep_cnt) {
1958                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1959                     throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
1960                 /*
1961                  * We update the stats for the last pid which opened a throttle window for the throttled thread.
1962                  * This might not be completely accurate since the multiple throttles seen by the lower tier pid
1963                  * might have been caused by various higher prio pids. However, updating these stats accurately
1964                  * means doing a proc_find while holding the throttle lock which leads to deadlock.
1965                  */
1966                 throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
1967         }
1968
1969         ut->uu_throttle_info = NULL;
1970         ut->uu_throttle_bc = false;
1971         ut->uu_lowpri_window = 0;
1972
1973         throttle_info_rel(info);
1974
1975         return sleep_cnt;
1976 }
1977
1978 /*
1979  * KPI routine
1980  *
1981  * set a kernel thread's IO policy.  policy can be:
1982  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
1983  *
1984  * explanations about these policies are in the man page of setiopolicy_np
1985  */
1986 void
1987 throttle_set_thread_io_policy(int policy)
1988 {
1989         proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
1990 }
1991
1992 int
1993 throttle_get_thread_effective_io_policy()
1994 {
1995         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1996 }
1997
1998 void
1999 throttle_info_reset_window(uthread_t ut)
2000 {
2001         struct _throttle_io_info_t *info;
2002
2003         if (ut == NULL) {
2004                 ut = get_bsdthread_info(current_thread());
2005         }
2006
2007         if ((info = ut->uu_throttle_info)) {
2008                 throttle_info_rel(info);
2009
2010                 ut->uu_throttle_info = NULL;
2011                 ut->uu_lowpri_window = 0;
2012                 ut->uu_throttle_bc = false;
2013         }
2014 }
2015
2016 static
2017 void
2018 throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
2019 {
2020         if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2021                 return;
2022         }
2023
2024         if (info->throttle_io_periods == 0) {
2025                 throttle_init_throttle_period(info, isssd);
2026         }
2027         if (ut->uu_throttle_info == NULL) {
2028                 ut->uu_throttle_info = info;
2029                 throttle_info_ref(info);
2030                 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
2031
2032                 ut->uu_lowpri_window = 1;
2033                 ut->uu_throttle_bc = BC_throttle;
2034         }
2035 }
2036
2037 /*
2038  * Update inflight IO count and throttling window
2039  * Should be called when an IO is done
2040  *
2041  * Only affects IO that was sent through spec_strategy
2042  */
2043 void
2044 throttle_info_end_io(buf_t bp)
2045 {
2046         mount_t mp;
2047         struct bufattr *bap;
2048         struct _throttle_io_info_t *info;
2049         int io_tier;
2050
2051         bap = &bp->b_attr;
2052         if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
2053                 return;
2054         }
2055         CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2056
2057         mp = buf_vnode(bp)->v_mount;
2058         if (mp != NULL) {
2059                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2060         } else {
2061                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2062         }
2063
2064         io_tier = GET_BUFATTR_IO_TIER(bap);
2065         if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2066                 io_tier--;
2067         }
2068
2069         throttle_info_end_io_internal(info, io_tier);
2070 }
2071
2072 /*
2073  * Decrement inflight count initially incremented by throttle_info_update_internal
2074  */
2075 static
2076 void
2077 throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level)
2078 {
2079         if (throttle_level == THROTTLE_LEVEL_NONE) {
2080                 return;
2081         }
2082
2083         microuptime(&info->throttle_window_start_timestamp[throttle_level]);
2084         OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
2085         assert(info->throttle_inflight_count[throttle_level] >= 0);
2086 }
2087
2088 /*
2089  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
2090  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
2091  */
2092 static
2093 int
2094 throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
2095 {
2096         int     thread_throttle_level;
2097
2098         if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2099                 return THROTTLE_LEVEL_NONE;
2100         }
2101
2102         if (ut == NULL) {
2103                 ut = get_bsdthread_info(current_thread());
2104         }
2105
2106         if (bap && inflight && !ut->uu_throttle_bc) {
2107                 thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
2108                 if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2109                         thread_throttle_level--;
2110                 }
2111         } else {
2112                 thread_throttle_level = throttle_get_thread_throttle_level(ut);
2113         }
2114
2115         if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
2116                 if (!ISSET(flags, B_PASSIVE)) {
2117                         info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
2118                         if (inflight && !ut->uu_throttle_bc) {
2119                                 if (NULL != bap) {
2120                                         SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2121                                 }
2122                                 OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
2123                         } else {
2124                                 microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
2125                         }
2126                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
2127                             current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
2128                 }
2129                 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2130         }
2131
2132
2133         if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2134                 /*
2135                  * I'd really like to do the IOSleep here, but
2136                  * we may be holding all kinds of filesystem related locks
2137                  * and the pages for this I/O marked 'busy'...
2138                  * we don't want to cause a normal task to block on
2139                  * one of these locks while we're throttling a task marked
2140                  * for low priority I/O... we'll mark the uthread and
2141                  * do the delay just before we return from the system
2142                  * call that triggered this I/O or from vnode_pagein
2143                  */
2144                 OSAddAtomic(1, &info->throttle_io_count);
2145
2146                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2147         }
2148
2149         return thread_throttle_level;
2150 }
2151
2152 void *
2153 throttle_info_update_by_mount(mount_t mp)
2154 {
2155         struct _throttle_io_info_t *info;
2156         uthread_t ut;
2157         boolean_t isssd = FALSE;
2158
2159         ut = get_bsdthread_info(current_thread());
2160
2161         if (mp != NULL) {
2162                 if (disk_conditioner_mount_is_ssd(mp)) {
2163                         isssd = TRUE;
2164                 }
2165                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2166         } else {
2167                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2168         }
2169
2170         if (!ut->uu_lowpri_window) {
2171                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2172         }
2173
2174         return info;
2175 }
2176
2177
2178 /*
2179  * KPI routine
2180  *
2181  * this is usually called before every I/O, used for throttled I/O
2182  * book keeping.  This routine has low overhead and does not sleep
2183  */
2184 void
2185 throttle_info_update(void *throttle_info, int flags)
2186 {
2187         if (throttle_info) {
2188                 throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2189         }
2190 }
2191
/*
 * KPI routine
 *
 * Throttled I/O bookkeeping through a mask-derived handle, usually called
 * before every I/O.  This routine has low overhead and does not sleep.
 */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
        /*
         * Only the lowest bit of the throttle mask is used for now, which
         * makes the handle identical to the throttle_info.  Should the
         * handle ever carry a set of throttle infos, this is the place to
         * loop over them and call throttle_info_update for each.
         */
        throttle_info_update(throttle_info_handle, flags);
}
2211 /*
2212  * KPI routine
2213  *
2214  * This routine marks the throttle info as disabled. Used for mount points which
2215  * support I/O scheduling.
2216  */
2217
2218 void
2219 throttle_info_disable_throttle(int devno, boolean_t isfusion)
2220 {
2221         struct _throttle_io_info_t *info;
2222
2223         if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV) {
2224                 panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2225         }
2226
2227         info = &_throttle_io_info[devno];
2228         // don't disable software throttling on devices that are part of a fusion device
2229         // and override the software throttle periods to use HDD periods
2230         if (isfusion) {
2231                 info->throttle_is_fusion_with_priority = isfusion;
2232                 throttle_init_throttle_period(info, FALSE);
2233         }
2234         info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2235         return;
2236 }
2237
2238
2239 /*
2240  * KPI routine (private)
2241  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2242  */
2243 int
2244 throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2245 {
2246         struct _throttle_io_info_t *info = throttle_info;
2247         struct timeval elapsed;
2248         uint64_t elapsed_msecs;
2249         int     throttle_level;
2250         int     thread_throttle_level;
2251
2252         switch (policy) {
2253         case IOPOL_THROTTLE:
2254                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
2255                 break;
2256         case IOPOL_UTILITY:
2257                 thread_throttle_level = THROTTLE_LEVEL_TIER2;
2258                 break;
2259         case IOPOL_STANDARD:
2260                 thread_throttle_level = THROTTLE_LEVEL_TIER1;
2261                 break;
2262         default:
2263                 thread_throttle_level = THROTTLE_LEVEL_TIER0;
2264                 break;
2265         }
2266         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2267                 if (info->throttle_inflight_count[throttle_level]) {
2268                         break;
2269                 }
2270
2271                 microuptime(&elapsed);
2272                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2273                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2274
2275                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
2276                         break;
2277                 }
2278         }
2279         if (throttle_level >= thread_throttle_level) {
2280                 /*
2281                  * we're beyond all of the throttle windows
2282                  * so go ahead and treat as normal I/O
2283                  */
2284                 return THROTTLE_DISENGAGED;
2285         }
2286         /*
2287          * we're in the throttle window
2288          */
2289         return THROTTLE_ENGAGED;
2290 }
2291
2292 int
2293 throttle_lowpri_window(void)
2294 {
2295         struct uthread *ut = get_bsdthread_info(current_thread());
2296         return ut->uu_lowpri_window;
2297 }
2298
2299
2300 #if CONFIG_IOSCHED
2301 int upl_get_cached_tier(void *);
2302 #endif
2303
/*
 * spec_strategy
 *
 * Strategy entry point for special-device vnodes.  Tags the buffer with its
 * I/O tier and passive/metadata attributes, emits the DKRW trace point,
 * updates thread I/O statistics plus throttle and per-mount pending-I/O
 * accounting, and finally hands the buffer to the block driver's d_strategy
 * routine looked up through bdevsw[].
 *
 * Parameters:
 *	ap->a_bp	buffer describing the I/O to issue
 *
 * Returns:	always 0.  The value returned by the driver's strategy
 *		routine is only compared against the two BootCache sentinel
 *		values described near the end of the function.
 */
int
spec_strategy(struct vnop_strategy_args *ap)
{
        buf_t   bp;
        int     bflags;         /* snapshot of bp->b_flags taken after B_PASSIVE is applied */
        int     io_tier;
        int     passive;
        dev_t   bdev;
        uthread_t ut;           /* not initialized here; passed by address to the policy helpers before first use */
        mount_t mp;             /* may be NULL; every use below is guarded */
        struct  bufattr *bap;
        int     strategy_ret;
        struct _throttle_io_info_t *throttle_info;
        boolean_t isssd = FALSE;
        boolean_t inflight = FALSE;
        boolean_t upgrade = FALSE;
        int code = 0;           /* DKIO_* bits accumulated for the trace point and I/O stats */

#if !CONFIG_EMBEDDED
        proc_t curproc = current_proc();
#endif /* !CONFIG_EMBEDDED */

        bp = ap->a_bp;
        bdev = buf_device(bp);
        mp = buf_vnode(bp)->v_mount;
        bap = &bp->b_attr;

#if CONFIG_IOSCHED
        /*
         * Cluster I/O takes its tier from the UPL when one is cached there;
         * a result of -1 falls back to the issuing thread's policy.
         */
        if (bp->b_flags & B_CLUSTER) {
                io_tier = upl_get_cached_tier(bp->b_upl);

                if (io_tier == -1) {
                        io_tier = throttle_get_io_policy(&ut);
                }
#if DEVELOPMENT || DEBUG
                else {
                        /* Trace a mismatch between the UPL's tier and the thread's own policy. */
                        int my_io_tier = throttle_get_io_policy(&ut);

                        if (io_tier != my_io_tier) {
                                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
                        }
                }
#endif
        } else {
                io_tier = throttle_get_io_policy(&ut);
        }
#else
        io_tier = throttle_get_io_policy(&ut);
#endif
        passive = throttle_get_passive_io_policy(&ut);

        /*
         * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
         * while preserving the original issued tier (throttle_get_io_policy
         * does not return upgraded tiers)
         */
        if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
#if CONFIG_IOSCHED
                /* Only flagged as an upgrade when the mount does not support I/O scheduling. */
                if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
                        upgrade = TRUE;
                }
#else /* CONFIG_IOSCHED */
                upgrade = TRUE;
#endif /* CONFIG_IOSCHED */
        }

        /* Mirror the buffer's metadata flag into its attribute block. */
        if (bp->b_flags & B_META) {
                bap->ba_flags |= BA_META;
        }

#if CONFIG_IOSCHED
        /*
         * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
         * To ensure we don't get into priority inversions due to metadata I/Os, we use the following rules:
         * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
         * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
         */
        if (bap->ba_flags & BA_META) {
                if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
                        if (bp->b_flags & B_READ) {
                                if (io_tier > IOSCHED_METADATA_TIER) {
                                        io_tier = IOSCHED_METADATA_TIER;
                                        passive = 1;
                                }
                        } else {
                                io_tier = IOSCHED_METADATA_TIER;
                                passive = 1;
                        }
                }
        }
#endif /* CONFIG_IOSCHED */

        /* Record the final tier on the buffer's attributes. */
        SET_BUFATTR_IO_TIER(bap, io_tier);

        if (passive) {
                bp->b_flags |= B_PASSIVE;
                bap->ba_flags |= BA_PASSIVE;
        }

#if !CONFIG_EMBEDDED
        if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) {
                bap->ba_flags |= BA_DELAYIDLESLEEP;
        }
#endif /* !CONFIG_EMBEDDED */

        /*
         * Taken after B_PASSIVE may have been set above, so the B_PASSIVE
         * test below sees it.
         */
        bflags = bp->b_flags;

        /* Writes that are not asynchronous are marked for quick completion. */
        if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) {
                bufattr_markquickcomplete(bap);
        }

        /*
         * Build the DKIO_* code used by the trace point and by
         * thread_update_io_stats() below.
         */
        if (bflags & B_READ) {
                code |= DKIO_READ;
        }
        if (bflags & B_ASYNC) {
                code |= DKIO_ASYNC;
        }

        /* Metadata takes precedence over paging in the code. */
        if (bap->ba_flags & BA_META) {
                code |= DKIO_META;
        } else if (bflags & B_PAGEIO) {
                code |= DKIO_PAGING;
        }

        if (io_tier != 0) {
                code |= DKIO_THROTTLE;
        }

        code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);

        if (bflags & B_PASSIVE) {
                code |= DKIO_PASSIVE;
        }

        if (bap->ba_flags & BA_NOCACHE) {
                code |= DKIO_NOCACHE;
        }

        if (upgrade) {
                code |= DKIO_TIER_UPGRADE;
                SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
        }

        if (kdebug_enable) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
                    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
        }

        thread_update_io_stats(current_thread(), buf_count(bp), code);

        /*
         * Pick the throttle-info slot: the mount's devbsdunit when there is
         * a mount, otherwise the last slot of _throttle_io_info.
         */
        if (mp != NULL) {
                if (disk_conditioner_mount_is_ssd(mp)) {
                        isssd = TRUE;
                }
                /*
                 * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
                 * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
                 * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
                 */
                if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
                        inflight = TRUE;
                }
                throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
        } else {
                throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
        }

        throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);

        /*
         * Writes stamp the throttle info (and the mount, if any) with the
         * issue time; both reads and writes bump the mount's pending-I/O size.
         */
        if ((bflags & B_READ) == 0) {
                microuptime(&throttle_info->throttle_last_write_timestamp);

                if (mp) {
                        mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
                        INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
                }
        } else if (mp) {
                INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
        }
        /*
         * The BootCache may give us special information about
         * the IO, so it returns special values that we check
         * for here.
         *
         * IO_SATISFIED_BY_CACHE
         * The read has been satisfied by the boot cache. Don't
         * throttle the thread unnecessarily.
         *
         * IO_SHOULD_BE_THROTTLED
         * The boot cache is playing back a playlist and this IO
         * cut through. Throttle it so we're not cutting through
         * the boot cache too often.
         *
         * Note that typical strategy routines are defined with
         * a void return so we'll get garbage here. In the
         * unlikely case the garbage matches our special return
         * value, it's not a big deal since we're only adjusting
         * the throttling delay.
         */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
        typedef int strategy_fcn_ret_t(struct buf *bp);

        /* Call the driver's strategy routine through an int-returning signature. */
        strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

        // disk conditioner needs to track when this I/O actually starts
        // which means track it after `strategy` which may include delays
        // from inflight I/Os
        microuptime(&bp->b_timestamp_tv);

        if (IO_SATISFIED_BY_CACHE == strategy_ret) {
                /*
                 * If this was a throttled IO satisfied by the boot cache,
                 * don't delay the thread.
                 */
                throttle_info_reset_window(ut);
        } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
                /*
                 * If the boot cache indicates this IO should be throttled,
                 * delay the thread.
                 */
                throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
        }
        return 0;
}
2529
2530
/*
 * Block mapping is not supported on special devices: the arguments are
 * ignored and ENOTSUP is always returned.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
        return ENOTSUP;
}
2539
2540
/*
 * Device close routine
 *
 * Parameters:
 *	ap->a_vp	special-device vnode being closed (VCHR or VBLK)
 *	ap->a_fflag	close flags; IO_REVOKE forces the character-device
 *			close to be sent to the driver
 *	ap->a_context	context used to find the closing process and for
 *			the block-device flush
 *
 * Returns:	0, or the error from the driver's d_close, from
 *		spec_fsync_internal() or from buf_invalidateblks().
 *		Panics if the vnode is neither VCHR nor VBLK, or if the
 *		open count would go negative.
 */
int
spec_close(struct vnop_close_args *ap)
{
        struct vnode *vp = ap->a_vp;
        dev_t dev = vp->v_rdev;
        int error = 0;
        int flags = ap->a_fflag;
        struct proc *p = vfs_context_proc(ap->a_context);
        struct session *sessp;

        switch (vp->v_type) {
        case VCHR:
                /*
                 * Hack: a tty device that is a controlling terminal
                 * has a reference from the session structure.
                 * We cannot easily tell that a character device is
                 * a controlling terminal, unless it is the closing
                 * process' controlling terminal.  In that case,
                 * if the reference count is 1 (this is the very
                 * last close), the session's terminal fields are
                 * cleared and the tty is released with ttyfree().
                 */
                sessp = proc_session(p);
                devsw_lock(dev, S_IFCHR);
                if (sessp != SESSION_NULL) {
                        if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
                                struct tty *tp = TTY_NULL;

                                /*
                                 * The devsw lock is dropped around the
                                 * session/tty locking and re-taken below.
                                 */
                                devsw_unlock(dev, S_IFCHR);
                                session_lock(sessp);
                                /* Re-check under the session lock. */
                                if (vp == sessp->s_ttyvp) {
                                        tp = SESSION_TP(sessp);
                                        sessp->s_ttyvp = NULL;
                                        sessp->s_ttyvid = 0;
                                        sessp->s_ttyp = TTY_NULL;
                                        sessp->s_ttypgrpid = NO_PID;
                                }
                                session_unlock(sessp);

                                if (tp != TTY_NULL) {
                                        /*
                                         * We may have won a race with a proc_exit
                                         * of the session leader, the winner
                                         * clears the flag (even if not set)
                                         */
                                        tty_lock(tp);
                                        ttyclrpgrphup(tp);
                                        tty_unlock(tp);

                                        ttyfree(tp);
                                }
                                devsw_lock(dev, S_IFCHR);
                        }
                        session_rele(sessp);
                }

                if (--vp->v_specinfo->si_opencount < 0) {
                        panic("negative open count (c, %u, %u)", major(dev), minor(dev));
                }

                /*
                 * close on last reference or on vnode revoke call
                 */
                if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) {
                        error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
                }

                devsw_unlock(dev, S_IFCHR);
                break;

        case VBLK:
                /*
                 * If there is more than one outstanding open, don't
                 * send the close to the device.
                 */
                devsw_lock(dev, S_IFBLK);
                if (vcount(vp) > 1) {
                        vp->v_specinfo->si_opencount--;
                        devsw_unlock(dev, S_IFBLK);
                        return 0;
                }
                devsw_unlock(dev, S_IFBLK);

                /*
                 * On last close of a block device (that isn't mounted)
                 * we must invalidate any in core blocks, so that
                 * we can, for instance, change floppy disks.
                 *
                 * NOTE(review): the two error returns below happen before
                 * si_opencount is decremented -- confirm this is intended.
                 */
                if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) {
                        return error;
                }

                error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
                if (error) {
                        return error;
                }

                devsw_lock(dev, S_IFBLK);

                if (--vp->v_specinfo->si_opencount < 0) {
                        panic("negative open count (b, %u, %u)", major(dev), minor(dev));
                }

                /* vcount() is re-read here, after the devsw lock was dropped and re-taken. */
                if (vcount(vp) == 0) {
                        error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
                }

                devsw_unlock(dev, S_IFBLK);
                break;

        default:
                panic("spec_close: not special");
                return EBADF;
        }

        return error;
}
2660
2661 /*
2662  * Return POSIX pathconf information applicable to special devices.
2663  */
2664 int
2665 spec_pathconf(struct vnop_pathconf_args *ap)
2666 {
2667         switch (ap->a_name) {
2668         case _PC_LINK_MAX:
2669                 *ap->a_retval = LINK_MAX;
2670                 return 0;
2671         case _PC_MAX_CANON:
2672                 *ap->a_retval = MAX_CANON;
2673                 return 0;
2674         case _PC_MAX_INPUT:
2675                 *ap->a_retval = MAX_INPUT;
2676                 return 0;
2677         case _PC_PIPE_BUF:
2678                 *ap->a_retval = PIPE_BUF;
2679                 return 0;
2680         case _PC_CHOWN_RESTRICTED:
2681                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2682                 return 0;
2683         case _PC_VDISABLE:
2684                 *ap->a_retval = _POSIX_VDISABLE;
2685                 return 0;
2686         default:
2687                 return EINVAL;
2688         }
2689         /* NOTREACHED */
2690 }
2691
/*
 * Special device failed operation: ignores its argument and always
 * returns EBADF.
 */
int
spec_ebadf(__unused void *dummy)
{
        return EBADF;
}
2700
2701 /* Blktooff derives file offset from logical block number */
2702 int
2703 spec_blktooff(struct vnop_blktooff_args *ap)
2704 {
2705         struct vnode *vp = ap->a_vp;
2706
2707         switch (vp->v_type) {
2708         case VCHR:
2709                 *ap->a_offset = (off_t)-1; /* failure */
2710                 return ENOTSUP;
2711
2712         case VBLK:
2713                 printf("spec_blktooff: not implemented for VBLK\n");
2714                 *ap->a_offset = (off_t)-1; /* failure */
2715                 return ENOTSUP;
2716
2717         default:
2718                 panic("spec_blktooff type");
2719         }
2720         /* NOTREACHED */
2721
2722         return 0;
2723 }
2724
2725 /* Offtoblk derives logical block number from file offset */
2726 int
2727 spec_offtoblk(struct vnop_offtoblk_args *ap)
2728 {
2729         struct vnode *vp = ap->a_vp;
2730
2731         switch (vp->v_type) {
2732         case VCHR:
2733                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2734                 return ENOTSUP;
2735
2736         case VBLK:
2737                 printf("spec_offtoblk: not implemented for VBLK\n");
2738                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2739                 return ENOTSUP;
2740
2741         default:
2742                 panic("spec_offtoblk type");
2743         }
2744         /* NOTREACHED */
2745
2746         return 0;
2747 }
2748
/*
 * Forward declarations of the filter routines installed in spec_filtops.
 */
static void filt_specdetach(struct knote *kn);
static int filt_specevent(struct knote *kn, long hint);
static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
static int filt_specpeek(struct knote *kn);

/*
 * Filter operations for knotes attached to special-device vnodes
 * (filt_specattach() tags such knotes with EVFILTID_SPEC).
 */
SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
        .f_isfd    = 1,
        .f_attach  = filt_specattach,
        .f_detach  = filt_specdetach,
        .f_event   = filt_specevent,
        .f_touch   = filt_spectouch,
        .f_process = filt_specprocess,
        .f_peek    = filt_specpeek
};
2764
2765
/*
 * Given a waitq that is assumed to be embedded within a selinfo structure
 * (as its si_waitq member), return the containing selinfo structure.
 * Although 'wq' is not really a queue element, qe_element() is used here
 * only for its offset-of calculation, which maps a member pointer back to
 * the containing struct given the struct type and member name.
 */
#define selinfo_from_waitq(wq) \
        qe_element((wq), struct selinfo, si_waitq)
2774
/*
 * spec_knote_select_and_link
 *
 * Poll the knote's underlying device through VNOP_SELECT and, if the driver
 * calls selrecord(), remember the prepost ID of the device's waitq in
 * kn->kn_hook_data so the link to the kqueue's waitq set can be undone later.
 *
 * Parameters:
 *	kn	knote whose file refers to the device vnode; kn_hookid holds
 *		the vnode ID used to validate the vnode
 *
 * Returns:	the VNOP_SELECT result, or 0 when the vnode could not be
 *		referenced (knote error set to ENOENT).  When select returns 0
 *		and the driver did not call selrecord(), the knote error is
 *		set to ENODEV.
 */
static int
spec_knote_select_and_link(struct knote *kn)
{
        uthread_t uth;
        vfs_context_t ctx;
        vnode_t vp;
        struct waitq_set *old_wqs;
        uint64_t rsvd, rsvd_arg;
        uint64_t *rlptr = NULL;
        struct selinfo *si = NULL;
        int selres = 0;

        uth = get_bsdthread_info(current_thread());

        ctx = vfs_context_current();
        vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

        /* Take a reference on the vnode, validated against the recorded vnode ID. */
        int error = vnode_getwithvid(vp, kn->kn_hookid);
        if (error != 0) {
                knote_set_error(kn, ENOENT);
                return 0;
        }

        /*
         * This function may be called many times to link or re-link the
         * underlying vnode to the kqueue.  If we've already linked the two,
         * we will have a valid kn_hook_data which ties us to the underlying
         * device's waitq via the waitq's prepost table object. However,
         * devices can abort any select action by calling selthreadclear().
         * This is OK because the table object will be invalidated by the
         * driver (through a call to selthreadclear), so any attempt to access
         * the associated waitq will fail because the table object is invalid.
         *
         * Even if we've already registered, we need to pass a pointer
         * to a reserved link structure. Otherwise, selrecord() will
         * infer that we're in the second pass of select() and won't
         * actually do anything!
         */
        /*
         * rsvd keeps the original reservation; rsvd_arg is the copy handed to
         * VNOP_SELECT, so a difference afterwards shows it was overwritten.
         */
        rsvd = rsvd_arg = waitq_link_reserve(NULL);
        rlptr = (void *)&rsvd_arg;

        /*
         * Trick selrecord() into hooking kqueue's wait queue set into the device's
         * selinfo wait queue.
         */
        old_wqs = uth->uu_wqset;
        uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);

        /*
         * Be sure that the waitq set is linked
         * before calling select to avoid possible
         * allocation under spinlocks.
         */
        waitq_set_lazy_init_link(uth->uu_wqset);

        /*
         * Now these are the laws of VNOP_SELECT, as old and as true as the sky,
         * And the device that shall keep it may prosper, but the device that shall
         * break it must receive ENODEV:
         *
         * 1. Take a lock to protect against other selects on the same vnode.
         * 2. Return 1 if data is ready to be read.
         * 3. Return 0 and call `selrecord` on a handy `selinfo` structure if there
         *    is no data.
         * 4. Call `selwakeup` when the vnode has an active `selrecord` and data
         *    can be read or written (depending on the seltype).
         * 5. If there's a `selrecord` and no corresponding `selwakeup`, but the
         *    vnode is going away, call `selthreadclear`.
         */
        selres = VNOP_SELECT(vp, knote_get_seltype(kn), 0, rlptr, ctx);
        /* Restore the thread's own waitq set now that select has run. */
        uth->uu_wqset = old_wqs;

        /*
         * Make sure to cleanup the reserved link - this guards against
         * drivers that may not actually call selrecord().
         */
        waitq_link_release(rsvd);
        if (rsvd != rsvd_arg) {
                /* The driver / handler called selrecord() */
                struct waitq *wq;
                /* The slot behind rlptr now holds a waitq pointer; copy it out. */
                memcpy(&wq, rlptr, sizeof(void *));

                /*
                 * The waitq is part of the selinfo structure managed by the
                 * driver. For certain drivers, we want to hook the knote into
                 * the selinfo structure's si_note field so selwakeup can call
                 * KNOTE.
                 *
                 * NOTE(review): si is computed here but not used again in
                 * this function.
                 */
                si = selinfo_from_waitq(wq);

                /*
                 * The waitq_get_prepost_id() function will (potentially)
                 * allocate a prepost table object for the waitq and return
                 * the table object's ID to us.  It will also set the
                 * waitq_prepost_id field within the waitq structure.
                 *
                 * We can just overwrite kn_hook_data because it's simply a
                 * table ID used to grab a reference when needed.
                 *
                 * We have a reference on the vnode, so we know that the
                 * device won't go away while we get this ID.
                 */
                kn->kn_hook_data = waitq_get_prepost_id(wq);
        } else if (selres == 0) {
                /*
                 * The device indicated that there's no data to read, but didn't call
                 * `selrecord`.  Nothing will be notified of changes to this vnode, so
                 * return an error back to user space, to make it clear that the knote
                 * is not attached.
                 */
                knote_set_error(kn, ENODEV);
        }

        /* Drop the reference taken by vnode_getwithvid() above. */
        vnode_put(vp);

        return selres;
}
2892
2893 static void
2894 filt_spec_common(struct knote *kn, int selres)
2895 {
2896         if (kn->kn_vnode_use_ofst) {
2897                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
2898                         kn->kn_data = 0;
2899                 } else {
2900                         kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
2901                 }
2902         } else {
2903                 kn->kn_data = selres;
2904         }
2905 }
2906
2907 static int
2908 filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev)
2909 {
2910         vnode_t vp;
2911         dev_t dev;
2912
2913         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
2914
2915         assert(vnode_ischr(vp));
2916
2917         dev = vnode_specrdev(vp);
2918
2919         /*
2920          * For a few special kinds of devices, we can attach knotes with
2921          * no restrictions because their "select" vectors return the amount
2922          * of data available.  Others require an explicit NOTE_LOWAT with
2923          * data of 1, indicating that the caller doesn't care about actual
2924          * data counts, just an indication that the device has data.
2925          */
2926         if (!kn->kn_vnode_kqok &&
2927             ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
2928                 knote_set_error(kn, EINVAL);
2929                 return 0;
2930         }
2931
2932         /*
2933          * This forces the select fallback to call through VNOP_SELECT and hook
2934          * up selinfo on every filter routine.
2935          *
2936          * Pseudo-terminal controllers are opted out of native kevent support --
2937          * remove this when they get their own EVFILTID.
2938          */
2939         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
2940                 kn->kn_vnode_kqok = 0;
2941         }
2942
2943         kn->kn_filtid = EVFILTID_SPEC;
2944         kn->kn_hook_data = 0;
2945         kn->kn_hookid = vnode_vid(vp);
2946
2947         knote_markstayactive(kn);
2948         return spec_knote_select_and_link(kn);
2949 }
2950
2951 static void
2952 filt_specdetach(struct knote *kn)
2953 {
2954         knote_clearstayactive(kn);
2955
2956         /*
2957          * This is potentially tricky: the device's selinfo waitq that was
2958          * tricked into being part of this knote's waitq set may not be a part
2959          * of any other set, and the device itself may have revoked the memory
2960          * in which the waitq was held. We use the knote's kn_hook_data field
2961          * to keep the ID of the waitq's prepost table object. This
2962          * object keeps a pointer back to the waitq, and gives us a safe way
2963          * to decouple the dereferencing of driver allocated memory: if the
2964          * driver goes away (taking the waitq with it) then the prepost table
2965          * object will be invalidated. The waitq details are handled in the
2966          * waitq API invoked here.
2967          */
2968         if (kn->kn_hook_data) {
2969                 waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs));
2970                 kn->kn_hook_data = 0;
2971         }
2972 }
2973
/*
 * f_event hook of spec_filtops.  It is not expected to be invoked for these
 * knotes, so reaching it panics.
 */
static int
filt_specevent(struct knote *kn, __unused long hint)
{
        /*
         * Nothing should call knote or knote_vanish on this knote.
         */
        panic("filt_specevent(%p)", kn);
        return 0;
}
2983
2984 static int
2985 filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
2986 {
2987         kn->kn_sdata = kev->data;
2988         kn->kn_sfflags = kev->fflags;
2989
2990         if (kev->flags & EV_ENABLE) {
2991                 return spec_knote_select_and_link(kn);
2992         }
2993
2994         return 0;
2995 }
2996
2997 static int
2998 filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
2999 {
3000 #pragma unused(data)
3001         vnode_t vp;
3002         uthread_t uth;
3003         vfs_context_t ctx;
3004         int res;
3005         int selres;
3006         int error;
3007
3008         uth = get_bsdthread_info(current_thread());
3009         ctx = vfs_context_current();
3010         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
3011
3012         error = vnode_getwithvid(vp, kn->kn_hookid);
3013         if (error != 0) {
3014                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3015                 *kev = kn->kn_kevent;
3016                 return 1;
3017         }
3018
3019         selres = spec_knote_select_and_link(kn);
3020         filt_spec_common(kn, selres);
3021
3022         vnode_put(vp);
3023
3024         res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ?
3025             (kn->kn_data >= kn->kn_sdata) : kn->kn_data;
3026
3027         if (res) {
3028                 *kev = kn->kn_kevent;
3029                 if (kn->kn_flags & EV_CLEAR) {
3030                         kn->kn_fflags = 0;
3031                         kn->kn_data = 0;
3032                 }
3033         }
3034
3035         return res;
3036 }
3037
3038 static int
3039 filt_specpeek(struct knote *kn)
3040 {
3041         int selres = 0;
3042
3043         selres = spec_knote_select_and_link(kn);
3044         filt_spec_common(kn, selres);
3045
3046         return kn->kn_data != 0;
3047 }