bsd/miscfs/specfs/spec_vnops.c

   1 /*
   2  * Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993, 1995
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/kauth.h>
  67 #include <sys/systm.h>
  68 #include <sys/kernel.h>
  69 #include <sys/conf.h>
  70 #include <sys/buf_internal.h>
  71 #include <sys/mount_internal.h>
  72 #include <sys/vnode_internal.h>
  73 #include <sys/file_internal.h>
  74 #include <sys/namei.h>
  75 #include <sys/stat.h>
  76 #include <sys/errno.h>
  77 #include <sys/ioctl.h>
  78 #include <sys/file.h>
  79 #include <sys/user.h>
  80 #include <sys/malloc.h>
  81 #include <sys/disk.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/resource.h>
  84 #include <machine/machine_routines.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <vfs/vfs_support.h>
  87 #include <vfs/vfs_disk_conditioner.h>
  88
  89 #include <kern/assert.h>
  90 #include <kern/task.h>
  91 #include <kern/sched_prim.h>
  92 #include <kern/thread.h>
  93 #include <kern/policy_internal.h>
  94 #include <kern/timer_call.h>
  95 #include <kern/waitq.h>
  96
  97 #include <pexpert/pexpert.h>
  98
  99 #include <sys/kdebug.h>
 100 #include <libkern/section_keywords.h>
 101
 102 /* XXX following three prototypes should be in a header file somewhere */
 103 extern dev_t    chrtoblk(dev_t dev);
 104 extern boolean_t        iskmemdev(dev_t dev);
 105 extern int      bpfkqfilter(dev_t dev, struct knote *kn);
 106 extern int ptsd_kqfilter(dev_t, struct knote *);
 107 extern int ptmx_kqfilter(dev_t, struct knote *);
 108
 109 struct vnode *speclisth[SPECHSZ];
 110
 111 /* symbolic sleep message strings for devices */
 112 char    devopn[] = "devopn";
 113 char    devio[] = "devio";
 114 char    devwait[] = "devwait";
 115 char    devin[] = "devin";
 116 char    devout[] = "devout";
 117 char    devioc[] = "devioc";
 118 char    devcls[] = "devcls";
 119
 120 #define VOPFUNC int (*)(void *)
 121
 122 int (**spec_vnodeop_p)(void *);
 123 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 124         { &vnop_default_desc, (VOPFUNC)vn_default_error },
 125         { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
 126         { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
 127         { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
 128         { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
 129         { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
 130         { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
 131         { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
 132         { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
 133         { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
 134         { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
 135         { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
 136         { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
 137         { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
 138         { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
 139         { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
 140         { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
 141         { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
 142         { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
 143         { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
 144         { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
 145         { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
 146         { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
 147         { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
 148         { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
 149         { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
 150         { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
 151         { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
 152         { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
 153         { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
 154         { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
 155         { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
 156         { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
 157         { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
 158         { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
 159         { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
 160         { (struct vnodeop_desc*)NULL, (int(*)(void *))NULL }
 161 };
 162 struct vnodeopv_desc spec_vnodeop_opv_desc =
 163         { &spec_vnodeop_p, spec_vnodeop_entries };
 164
 165
 166 static void set_blocksize(vnode_t, dev_t);
 167
 168 #define LOWPRI_TIER1_WINDOW_MSECS         25
 169 #define LOWPRI_TIER2_WINDOW_MSECS         100
 170 #define LOWPRI_TIER3_WINDOW_MSECS         500
 171
 172 #define LOWPRI_TIER1_IO_PERIOD_MSECS      40
 173 #define LOWPRI_TIER2_IO_PERIOD_MSECS      85
 174 #define LOWPRI_TIER3_IO_PERIOD_MSECS      200
 175
 176 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
 177 #define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
 178 #define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25
 179
 180
 181 int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
 182         0,
 183         LOWPRI_TIER1_WINDOW_MSECS,
 184         LOWPRI_TIER2_WINDOW_MSECS,
 185         LOWPRI_TIER3_WINDOW_MSECS,
 186 };
 187
 188 int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
 189         0,
 190         LOWPRI_TIER1_IO_PERIOD_MSECS,
 191         LOWPRI_TIER2_IO_PERIOD_MSECS,
 192         LOWPRI_TIER3_IO_PERIOD_MSECS,
 193 };
 194
 195 int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
 196         0,
 197         LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
 198         LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
 199         LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
 200 };
 201
 202
 203 int     throttled_count[THROTTLE_LEVEL_END + 1];
 204
 205 struct _throttle_io_info_t {
 206         lck_mtx_t       throttle_lock;
 207
 208         struct timeval  throttle_last_write_timestamp;
 209         struct timeval  throttle_min_timer_deadline;
 210         struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
 211         struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
 212         pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
 213         struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
 214         int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];
 215
 216         TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];        /* Lists of throttled uthreads */
 217         int             throttle_next_wake_level;
 218
 219         thread_call_t   throttle_timer_call;
 220         int32_t throttle_timer_ref;
 221         int32_t throttle_timer_active;
 222
 223         int32_t throttle_io_count;
 224         int32_t throttle_io_count_begin;
 225         int    *throttle_io_periods;
 226         uint32_t throttle_io_period_num;
 227
 228         int32_t throttle_refcnt;
 229         int32_t throttle_alloc;
 230         int32_t throttle_disabled;
 231         int32_t throttle_is_fusion_with_priority;
 232 };
 233
 234 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
 235
 236
 237 int     lowpri_throttle_enabled = 1;
 238
 239
 240 static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
 241 static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
 242 static int throttle_get_thread_throttle_level(uthread_t ut);
 243 static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
 244 void throttle_info_mount_reset_period(mount_t mp, int isssd);
 245
 246 /*
 247  * Trivial lookup routine that always fails.
 248  */
 249 int
 250 spec_lookup(struct vnop_lookup_args *ap)
 251 {
 252
 253         *ap->a_vpp = NULL;
 254         return (ENOTDIR);
 255 }
 256
 257 static void
 258 set_blocksize(struct vnode *vp, dev_t dev)
 259 {
 260     int (*size)(dev_t);
 261     int rsize;
 262
 263     if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
 264         rsize = (*size)(dev);
 265         if (rsize <= 0)        /* did size fail? */
 266             vp->v_specsize = DEV_BSIZE;
 267         else
 268             vp->v_specsize = rsize;
 269     }
 270     else
 271             vp->v_specsize = DEV_BSIZE;
 272 }
 273
 274 void
 275 set_fsblocksize(struct vnode *vp)
 276 {
 277
 278         if (vp->v_type == VBLK) {
 279                 dev_t dev = (dev_t)vp->v_rdev;
 280                 int maj = major(dev);
 281
 282                 if ((u_int)maj >= (u_int)nblkdev)
 283                         return;
 284
 285                 vnode_lock(vp);
 286                 set_blocksize(vp, dev);
 287                 vnode_unlock(vp);
 288         }
 289
 290 }
 291
 292
 293 /*
 294  * Open a special file.
 295  */
 296 int
 297 spec_open(struct vnop_open_args *ap)
 298 {
 299         struct proc *p = vfs_context_proc(ap->a_context);
 300         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
 301         struct vnode *vp = ap->a_vp;
 302         dev_t bdev, dev = (dev_t)vp->v_rdev;
 303         int maj = major(dev);
 304         int error;
 305
 306         /*
 307          * Don't allow open if fs is mounted -nodev.
 308          */
 309         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
 310                 return (ENXIO);
 311
 312         switch (vp->v_type) {
 313
 314         case VCHR:
 315                 if ((u_int)maj >= (u_int)nchrdev)
 316                         return (ENXIO);
 317                 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
 318                         /*
 319                          * When running in very secure mode, do not allow
 320                          * opens for writing of any disk character devices.
 321                          */
 322                         if (securelevel >= 2 && isdisk(dev, VCHR))
 323                                 return (EPERM);
 324
 325                         /* Never allow writing to /dev/mem or /dev/kmem */
 326                         if (iskmemdev(dev))
 327                                 return (EPERM);
 328                         /*
 329                          * When running in secure mode, do not allow opens for
 330                          * writing of character devices whose corresponding block
 331                          * devices are currently mounted.
 332                          */
 333                         if (securelevel >= 1) {
 334                                 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
 335                                         return (error);
 336                         }
 337                 }
 338
 339                 devsw_lock(dev, S_IFCHR);
 340                 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
 341
 342                 if (error == 0) {
 343                         vp->v_specinfo->si_opencount++;
 344                 }
 345
 346                 devsw_unlock(dev, S_IFCHR);
 347
 348                 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
 349                         int     isssd = 0;
 350                         uint64_t throttle_mask = 0;
 351                         uint32_t devbsdunit = 0;
 352
 353                         if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
 354
 355                                 if (throttle_mask != 0 &&
 356                                     VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
 357                                         /*
 358                                          * as a reasonable approximation, only use the lowest bit of the mask
 359                                          * to generate a disk unit number
 360                                          */
 361                                         devbsdunit = num_trailing_0(throttle_mask);
 362
 363                                         vnode_lock(vp);
 364
 365                                         vp->v_un.vu_specinfo->si_isssd = isssd;
 366                                         vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
 367                                         vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
 368                                         vp->v_un.vu_specinfo->si_throttleable = 1;
 369                                         vp->v_un.vu_specinfo->si_initted = 1;
 370
 371                                         vnode_unlock(vp);
 372                                 }
 373                         }
 374                         if (vp->v_un.vu_specinfo->si_initted == 0) {
 375                                 vnode_lock(vp);
 376                                 vp->v_un.vu_specinfo->si_initted = 1;
 377                                 vnode_unlock(vp);
 378                         }
 379                 }
 380                 return (error);
 381
 382         case VBLK:
 383                 if ((u_int)maj >= (u_int)nblkdev)
 384                         return (ENXIO);
 385                 /*
 386                  * When running in very secure mode, do not allow
 387                  * opens for writing of any disk block devices.
 388                  */
 389                 if (securelevel >= 2 && cred != FSCRED &&
 390                     (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
 391                         return (EPERM);
 392                 /*
 393                  * Do not allow opens of block devices that are
 394                  * currently mounted.
 395                  */
 396                 if ( (error = vfs_mountedon(vp)) )
 397                         return (error);
 398
 399                 devsw_lock(dev, S_IFBLK);
 400                 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
 401                 if (!error) {
 402                         vp->v_specinfo->si_opencount++;
 403                 }
 404                 devsw_unlock(dev, S_IFBLK);
 405
 406                 if (!error) {
 407                     u_int64_t blkcnt;
 408                     u_int32_t blksize;
 409                         int setsize = 0;
 410                         u_int32_t size512 = 512;
 411
 412
 413                     if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
 414                                 /* Switch to 512 byte sectors (temporarily) */
 415
 416                                 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
 417                                 /* Get the number of 512 byte physical blocks. */
 418                                 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
 419                                                 setsize = 1;
 420                                 }
 421                                 }
 422                                 /* If it doesn't set back, we can't recover */
 423                                 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
 424                                 error = ENXIO;
 425                     }
 426
 427
 428                         vnode_lock(vp);
 429                     set_blocksize(vp, dev);
 430
 431                     /*
 432                      * Cache the size in bytes of the block device for later
 433                      * use by spec_write().
 434                      */
 435                         if (setsize)
 436                                 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
 437                         else
 438                         vp->v_specdevsize = (u_int64_t)0;       /* Default: Can't get */
 439
 440                         vnode_unlock(vp);
 441
 442                 }
 443                 return(error);
 444         default:
 445                 panic("spec_open type");
 446         }
 447         return (0);
 448 }
 449
 450 /*
 451  * Vnode op for read
 452  */
 453 int
 454 spec_read(struct vnop_read_args *ap)
 455 {
 456         struct vnode *vp = ap->a_vp;
 457         struct uio *uio = ap->a_uio;
 458         struct buf *bp;
 459         daddr64_t bn, nextbn;
 460         long bsize, bscale;
 461         int devBlockSize=0;
 462         int n, on;
 463         int error = 0;
 464         dev_t dev;
 465
 466 #if DIAGNOSTIC
 467         if (uio->uio_rw != UIO_READ)
 468                 panic("spec_read mode");
 469         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 470                 panic("spec_read proc");
 471 #endif
 472         if (uio_resid(uio) == 0)
 473                 return (0);
 474
 475         switch (vp->v_type) {
 476
 477         case VCHR:
 478                 {
 479                         struct _throttle_io_info_t *throttle_info = NULL;
 480                         int thread_throttle_level;
 481                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 482                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 483                                 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 484                 }
 485                 error = (*cdevsw[major(vp->v_rdev)].d_read)
 486                         (vp->v_rdev, uio, ap->a_ioflag);
 487
 488                         if (throttle_info) {
 489                                 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 490                         }
 491
 492                 return (error);
 493                 }
 494
 495         case VBLK:
 496                 if (uio->uio_offset < 0)
 497                         return (EINVAL);
 498
 499                 dev = vp->v_rdev;
 500
 501                 devBlockSize = vp->v_specsize;
 502
 503                 if (devBlockSize > PAGE_SIZE)
 504                         return (EINVAL);
 505
 506                 bscale = PAGE_SIZE / devBlockSize;
 507                 bsize = bscale * devBlockSize;
 508
 509                 do {
 510                         on = uio->uio_offset % bsize;
 511
 512                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
 513
 514                         if (vp->v_speclastr + bscale == bn) {
 515                                 nextbn = bn + bscale;
 516                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
 517                                                (int *)&bsize, 1, NOCRED, &bp);
 518                         } else
 519                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
 520
 521                         vnode_lock(vp);
 522                         vp->v_speclastr = bn;
 523                         vnode_unlock(vp);
 524
 525                         n = bsize - buf_resid(bp);
 526                         if ((on > n) || error) {
 527                                 if (!error)
 528                                         error = EINVAL;
 529                                 buf_brelse(bp);
 530                                 return (error);
 531                         }
 532                         n = min((unsigned)(n  - on), uio_resid(uio));
 533
 534                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 535                         if (n + on == bsize)
 536                                 buf_markaged(bp);
 537                         buf_brelse(bp);
 538                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 539                 return (error);
 540
 541         default:
 542                 panic("spec_read type");
 543         }
 544         /* NOTREACHED */
 545
 546         return (0);
 547 }
 548
 549 /*
 550  * Vnode op for write
 551  */
 552 int
 553 spec_write(struct vnop_write_args *ap)
 554 {
 555         struct vnode *vp = ap->a_vp;
 556         struct uio *uio = ap->a_uio;
 557         struct buf *bp;
 558         daddr64_t bn;
 559         int bsize, blkmask, bscale;
 560         int io_sync;
 561         int devBlockSize=0;
 562         int n, on;
 563         int error = 0;
 564         dev_t dev;
 565
 566 #if DIAGNOSTIC
 567         if (uio->uio_rw != UIO_WRITE)
 568                 panic("spec_write mode");
 569         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 570                 panic("spec_write proc");
 571 #endif
 572
 573         switch (vp->v_type) {
 574
 575         case VCHR:
 576                 {
 577                         struct _throttle_io_info_t *throttle_info = NULL;
 578                         int thread_throttle_level;
 579                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
 580                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 581
 582                                 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 583
 584                         microuptime(&throttle_info->throttle_last_write_timestamp);
 585                 }
 586                 error = (*cdevsw[major(vp->v_rdev)].d_write)
 587                         (vp->v_rdev, uio, ap->a_ioflag);
 588
 589                         if (throttle_info) {
 590                                 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 591                         }
 592
 593                 return (error);
 594                 }
 595
 596         case VBLK:
 597                 if (uio_resid(uio) == 0)
 598                         return (0);
 599                 if (uio->uio_offset < 0)
 600                         return (EINVAL);
 601
 602                 io_sync = (ap->a_ioflag & IO_SYNC);
 603
 604                 dev = (vp->v_rdev);
 605
 606                 devBlockSize = vp->v_specsize;
 607                 if (devBlockSize > PAGE_SIZE)
 608                         return(EINVAL);
 609
 610                 bscale = PAGE_SIZE / devBlockSize;
 611                 blkmask = bscale - 1;
 612                 bsize = bscale * devBlockSize;
 613
 614
 615                 do {
 616                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
 617                         on = uio->uio_offset % bsize;
 618
 619                         n = min((unsigned)(bsize - on), uio_resid(uio));
 620
 621                         /*
 622                          * Use buf_getblk() as an optimization IFF:
 623                          *
 624                          * 1)   We are reading exactly a block on a block
 625                          *      aligned boundary
 626                          * 2)   We know the size of the device from spec_open
 627                          * 3)   The read doesn't span the end of the device
 628                          *
 629                          * Otherwise, we fall back on buf_bread().
 630                          */
 631                         if (n == bsize &&
 632                             vp->v_specdevsize != (u_int64_t)0 &&
 633                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
 634                             /* reduce the size of the read to what is there */
 635                             n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
 636                         }
 637
 638                         if (n == bsize)
 639                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
 640                         else
 641                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
 642
 643                         /* Translate downstream error for upstream, if needed */
 644                         if (!error)
 645                                 error = (int)buf_error(bp);
 646                         if (error) {
 647                                 buf_brelse(bp);
 648                                 return (error);
 649                         }
 650                         n = min(n, bsize - buf_resid(bp));
 651
 652                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 653                         if (error) {
 654                                 buf_brelse(bp);
 655                                 return (error);
 656                         }
 657                         buf_markaged(bp);
 658
 659                         if (io_sync)
 660                                 error = buf_bwrite(bp);
 661                         else {
 662                                 if ((n + on) == bsize)
 663                                         error = buf_bawrite(bp);
 664                                 else
 665                                         error = buf_bdwrite(bp);
 666                         }
 667                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 668                 return (error);
 669
 670         default:
 671                 panic("spec_write type");
 672         }
 673         /* NOTREACHED */
 674
 675         return (0);
 676 }
 677
 678 /*
 679  * Device ioctl operation.
 680  */
 681 int
 682 spec_ioctl(struct vnop_ioctl_args *ap)
 683 {
 684         proc_t p = vfs_context_proc(ap->a_context);
 685         dev_t dev = ap->a_vp->v_rdev;
 686         int     retval = 0;
 687
 688         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
 689                 dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
 690
 691         switch (ap->a_vp->v_type) {
 692
 693         case VCHR:
 694                 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 695                                                        ap->a_fflag, p);
 696                 break;
 697
 698         case VBLK:
 699                 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
 700                 if (!retval && ap->a_command == DKIOCSETBLOCKSIZE)
 701                         ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
 702                 break;
 703
 704         default:
 705                 panic("spec_ioctl");
 706                 /* NOTREACHED */
 707         }
 708         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
 709                 dev, ap->a_command, ap->a_fflag, retval, 0);
 710
 711         return (retval);
 712 }
 713
 714 int
 715 spec_select(struct vnop_select_args *ap)
 716 {
 717         proc_t p = vfs_context_proc(ap->a_context);
 718         dev_t dev;
 719
 720         switch (ap->a_vp->v_type) {
 721
 722         default:
 723                 return (1);             /* XXX */
 724
 725         case VCHR:
 726                 dev = ap->a_vp->v_rdev;
 727                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
 728         }
 729 }
 730
 731 static int filt_specattach(struct knote *kn, struct kevent_internal_s *kev);
 732
 733 int
 734 spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_internal_s *kev)
 735 {
 736         dev_t dev;
 737
 738         assert(vnode_ischr(vp));
 739
 740         dev = vnode_specrdev(vp);
 741
 742 #if NETWORKING
 743         /*
 744          * Try a bpf device, as defined in bsd/net/bpf.c
 745          * If it doesn't error out the attach, then it
 746          * claimed it. Otherwise, fall through and try
 747          * other attaches.
 748          */
 749         int32_t tmp_flags = kn->kn_flags;
 750         int64_t tmp_data = kn->kn_data;
 751         int res;
 752
 753         res = bpfkqfilter(dev, kn);
 754         if ((kn->kn_flags & EV_ERROR) == 0) {
 755                 return res;
 756         }
 757         kn->kn_flags = tmp_flags;
 758         kn->kn_data = tmp_data;
 759 #endif
 760
 761         if (major(dev) > nchrdev) {
 762                 knote_set_error(kn, ENXIO);
 763                 return 0;
 764         }
 765
 766         kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
 767         kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);
 768
 769         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
 770                 kn->kn_filtid = EVFILTID_PTSD;
 771                 return ptsd_kqfilter(dev, kn);
 772         } else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
 773                 kn->kn_filtid = EVFILTID_PTMX;
 774                 return ptmx_kqfilter(dev, kn);
 775         } else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
 776                 /*
 777                  * TTYs from drivers that use struct ttys use their own filter
 778                  * routines.  The PTC driver doesn't use the tty for character
 779                  * counts, so it must go through the select fallback.
 780                  */
 781                 kn->kn_filtid = EVFILTID_TTY;
 782                 return knote_fops(kn)->f_attach(kn, kev);
 783         }
 784
 785         /* Try to attach to other char special devices */
 786         return filt_specattach(kn, kev);
 787 }
 788
 789 /*
 790  * Synch buffers associated with a block device
 791  */
 792 int
 793 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
 794 {
 795         if (vp->v_type == VCHR)
 796                 return (0);
 797         /*
 798          * Flush all dirty buffers associated with a block device.
 799          */
 800         buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
 801
 802         return (0);
 803 }
 804
 805 int
 806 spec_fsync(struct vnop_fsync_args *ap)
 807 {
 808         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
 809 }
 810
 811
 812 /*
 813  * Just call the device strategy routine
 814  */
 815 void throttle_init(void);
 816
 817
 818 #if 0
 819 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
 820         do {                                                    \
 821                if ((debug_info)->alloc)                           \
 822                printf("%s: "format, __FUNCTION__, ## args);     \
 823        } while(0)
 824
 825 #else
 826 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
 827 #endif
 828
 829
 830 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 831 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 832 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 833
 834 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 835 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 836 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 837
 838 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 839 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 840 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 841
 842 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
 843
 844
 845 static lck_grp_t        *throttle_lock_grp;
 846 static lck_attr_t       *throttle_lock_attr;
 847 static lck_grp_attr_t   *throttle_lock_grp_attr;
 848
 849
 850 /*
 851  * throttled I/O helper function
 852  * convert the index of the lowest set bit to a device index
 853  */
 854 int
 855 num_trailing_0(uint64_t n)
 856 {
 857         /*
 858          * since in most cases the number of trailing 0s is very small,
 859          * we simply counting sequentially from the lowest bit
 860          */
 861         if (n == 0)
 862                 return sizeof(n) * 8;
 863         int count = 0;
 864         while (!ISSET(n, 1)) {
 865                 n >>= 1;
 866                 ++count;
 867         }
 868         return count;
 869 }
 870
 871
 872 /*
 873  * Release the reference and if the item was allocated and this is the last
 874  * reference then free it.
 875  *
 876  * This routine always returns the old value.
 877  */
 878 static int
 879 throttle_info_rel(struct _throttle_io_info_t *info)
 880 {
 881         SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
 882
 883         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 884                 info, (int)(oldValue -1), info );
 885
 886         /* The reference count just went negative, very bad */
 887         if (oldValue == 0)
 888                 panic("throttle info ref cnt went negative!");
 889
 890         /*
 891          * Once reference count is zero, no one else should be able to take a
 892          * reference
 893          */
 894         if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
 895                 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
 896
 897                 lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
 898                 FREE(info, M_TEMP);
 899         }
 900         return oldValue;
 901 }
 902
 903
 904 /*
 905  * Just take a reference on the throttle info structure.
 906  *
 907  * This routine always returns the old value.
 908  */
 909 static SInt32
 910 throttle_info_ref(struct _throttle_io_info_t *info)
 911 {
 912         SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
 913
 914         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 915                 info, (int)(oldValue -1), info );
 916         /* Allocated items should never have a reference of zero */
 917         if (info->throttle_alloc && (oldValue == 0))
 918                 panic("Taking a reference without calling create throttle info!\n");
 919
 920         return oldValue;
 921 }
 922
 923 /*
 924  * on entry the throttle_lock is held...
 925  * this function is responsible for taking
 926  * and dropping the reference on the info
 927  * structure which will keep it from going
 928  * away while the timer is running if it
 929  * happens to have been dynamically allocated by
 930  * a network fileystem kext which is now trying
 931  * to free it
 932  */
 933 static uint32_t
 934 throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
 935 {
 936         struct timeval  elapsed;
 937         struct timeval  now;
 938         struct timeval  period;
 939         uint64_t        elapsed_msecs;
 940         int             throttle_level;
 941         int             level;
 942         int             msecs;
 943         boolean_t       throttled = FALSE;
 944         boolean_t       need_timer = FALSE;
 945
 946         microuptime(&now);
 947
 948         if (update_io_count == TRUE) {
 949                 info->throttle_io_count_begin = info->throttle_io_count;
 950                 info->throttle_io_period_num++;
 951
 952                 while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
 953                         info->throttle_start_IO_period_timestamp[wakelevel--] = now;
 954
 955                 info->throttle_min_timer_deadline = now;
 956
 957                 msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
 958                 period.tv_sec = msecs / 1000;
 959                 period.tv_usec = (msecs % 1000) * 1000;
 960
 961                 timevaladd(&info->throttle_min_timer_deadline, &period);
 962         }
 963         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
 964
 965                 elapsed = now;
 966                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
 967                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
 968
 969                 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
 970
 971                         if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
 972
 973                                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
 974                                         /*
 975                                          * we had an I/O occur at a higher priority tier within
 976                                          * this tier's throttle window
 977                                          */
 978                                         throttled = TRUE;
 979                                 }
 980                                 /*
 981                                  * we assume that the windows are the same or longer
 982                                  * as we drop through the throttling tiers...  thus
 983                                  * we can stop looking once we run into a tier with
 984                                  * threads to schedule regardless of whether it's
 985                                  * still in its throttling window or not
 986                                  */
 987                                 break;
 988                         }
 989                 }
 990                 if (throttled == TRUE)
 991                         break;
 992         }
 993         if (throttled == TRUE) {
 994                 uint64_t        deadline = 0;
 995                 struct timeval  target;
 996                 struct timeval  min_target;
 997
 998                 /*
 999                  * we've got at least one tier still in a throttled window
1000                  * so we need a timer running... compute the next deadline
1001                  * and schedule it
1002                  */
1003                 for (level = throttle_level+1; level <= THROTTLE_LEVEL_END; level++) {
1004
1005                         if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
1006                                 continue;
1007
1008                         target = info->throttle_start_IO_period_timestamp[level];
1009
1010                         msecs = info->throttle_io_periods[level];
1011                         period.tv_sec = msecs / 1000;
1012                         period.tv_usec = (msecs % 1000) * 1000;
1013
1014                         timevaladd(&target, &period);
1015
1016                         if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
1017                                 min_target = target;
1018                                 need_timer = TRUE;
1019                         }
1020                 }
1021                 if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
1022                         if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
1023                                 min_target = info->throttle_min_timer_deadline;
1024                 }
1025
1026                 if (info->throttle_timer_active) {
1027                         if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1028                                 /*
1029                                  * couldn't kill the timer because it's already
1030                                  * been dispatched, so don't try to start a new
1031                                  * one... once we drop the lock, the timer will
1032                                  * proceed and eventually re-run this function
1033                                  */
1034                                 need_timer = FALSE;
1035                         } else
1036                                 info->throttle_timer_active = 0;
1037                 }
1038                 if (need_timer == TRUE) {
1039                         /*
1040                          * This is defined as an int (32-bit) rather than a 64-bit
1041                          * value because it would need a really big period in the
1042                          * order of ~500 days to overflow this. So, we let this be
1043                          * 32-bit which allows us to use the clock_interval_to_deadline()
1044                          * routine.
1045                          */
1046                         int     target_msecs;
1047
1048                         if (info->throttle_timer_ref == 0) {
1049                                 /*
1050                                  * take a reference for the timer
1051                                  */
1052                                 throttle_info_ref(info);
1053
1054                                 info->throttle_timer_ref = 1;
1055                         }
1056                         elapsed = min_target;
1057                         timevalsub(&elapsed, &now);
1058                         target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
1059
1060                         if (target_msecs <= 0) {
1061                                 /*
1062                                  * we may have computed a deadline slightly in the past
1063                                  * due to various factors... if so, just set the timer
1064                                  * to go off in the near future (we don't need to be precise)
1065                                  */
1066                                 target_msecs = 1;
1067                         }
1068                         clock_interval_to_deadline(target_msecs, 1000000, &deadline);
1069
1070                         thread_call_enter_delayed(info->throttle_timer_call, deadline);
1071                         info->throttle_timer_active = 1;
1072                 }
1073         }
1074         return (throttle_level);
1075 }
1076
1077
1078 static void
1079 throttle_timer(struct _throttle_io_info_t *info)
1080 {
1081         uthread_t       ut, utlist;
1082         struct timeval  elapsed;
1083         struct timeval  now;
1084         uint64_t        elapsed_msecs;
1085         int             throttle_level;
1086         int             level;
1087         int             wake_level;
1088         caddr_t         wake_address = NULL;
1089         boolean_t       update_io_count = FALSE;
1090         boolean_t       need_wakeup = FALSE;
1091         boolean_t       need_release = FALSE;
1092
1093         ut = NULL;
1094         lck_mtx_lock(&info->throttle_lock);
1095
1096         info->throttle_timer_active = 0;
1097         microuptime(&now);
1098
1099         elapsed = now;
1100         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
1101         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1102
1103         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
1104
1105                 wake_level = info->throttle_next_wake_level;
1106
1107                 for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
1108
1109                         elapsed = now;
1110                         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
1111                         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1112
1113                         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1114                                 /*
1115                                  * we're closing out the current IO period...
1116                                  * if we have a waiting thread, wake it up
1117                                  * after we have reset the I/O window info
1118                                  */
1119                                 need_wakeup = TRUE;
1120                                 update_io_count = TRUE;
1121
1122                                 info->throttle_next_wake_level = wake_level - 1;
1123
1124                                 if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
1125                                         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1126
1127                                 break;
1128                         }
1129                         wake_level--;
1130
1131                         if (wake_level == THROTTLE_LEVEL_START)
1132                                 wake_level = THROTTLE_LEVEL_END;
1133                 }
1134         }
1135         if (need_wakeup == TRUE) {
1136                 if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1137
1138                         ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
1139                         TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
1140                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1141                         ut->uu_is_throttled = FALSE;
1142
1143                         wake_address = (caddr_t)&ut->uu_on_throttlelist;
1144                 }
1145         } else
1146                 wake_level = THROTTLE_LEVEL_START;
1147
1148         throttle_level = throttle_timer_start(info, update_io_count, wake_level);
1149
1150         if (wake_address != NULL)
1151                 wakeup(wake_address);
1152
1153         for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
1154
1155                 TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
1156
1157                         TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
1158                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1159                         ut->uu_is_throttled = FALSE;
1160
1161                         wakeup(&ut->uu_on_throttlelist);
1162                 }
1163         }
1164         if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
1165                 info->throttle_timer_ref = 0;
1166                 need_release = TRUE;
1167         }
1168         lck_mtx_unlock(&info->throttle_lock);
1169
1170         if (need_release == TRUE)
1171                 throttle_info_rel(info);
1172 }
1173
1174
1175 static int
1176 throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
1177 {
1178         boolean_t start_timer = FALSE;
1179         int level = THROTTLE_LEVEL_START;
1180
1181         if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1182                 info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1183                 start_timer = TRUE;
1184         }
1185
1186         if (insert_tail == TRUE)
1187                 TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1188         else
1189                 TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1190
1191         ut->uu_on_throttlelist = mylevel;
1192
1193         if (start_timer == TRUE) {
1194                 /* we may need to start or rearm the timer */
1195                 level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1196
1197                 if (level == THROTTLE_LEVEL_END) {
1198                         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1199                                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1200
1201                                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1202                         }
1203                 }
1204         }
1205         return (level);
1206 }
1207
1208 static void
1209 throttle_init_throttle_window(void)
1210 {
1211         int throttle_window_size;
1212
1213         /*
1214          * The hierarchy of throttle window values is as follows:
1215          * - Global defaults
1216          * - Device tree properties
1217          * - Boot-args
1218          * All values are specified in msecs.
1219          */
1220
1221         /* Override global values with device-tree properties */
1222         if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
1223                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1224
1225         if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
1226                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1227
1228         if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
1229                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1230
1231         /* Override with boot-args */
1232         if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
1233                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1234
1235         if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
1236                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1237
1238         if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
1239                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1240 }
1241
1242 static void
1243 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1244 {
1245         int throttle_period_size;
1246
1247         /*
1248          * The hierarchy of throttle period values is as follows:
1249          * - Global defaults
1250          * - Device tree properties
1251          * - Boot-args
1252          * All values are specified in msecs.
1253          */
1254
1255         /* Assign global defaults */
1256         if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0))
1257                 info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
1258         else
1259                 info->throttle_io_periods = &throttle_io_period_msecs[0];
1260
1261         /* Override global values with device-tree properties */
1262         if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
1263                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1264
1265         if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
1266                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1267
1268         if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
1269                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1270
1271         /* Override with boot-args */
1272         if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
1273                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1274
1275         if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
1276                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1277
1278         if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
1279                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1280
1281 }
1282
1283 #if CONFIG_IOSCHED
1284 extern  void vm_io_reprioritize_init(void);
1285 int     iosched_enabled = 1;
1286 #endif
1287
1288 void
1289 throttle_init(void)
1290 {
1291         struct _throttle_io_info_t *info;
1292         int     i;
1293         int     level;
1294 #if CONFIG_IOSCHED
1295         int     iosched;
1296 #endif
1297         /*
1298          * allocate lock group attribute and group
1299          */
1300         throttle_lock_grp_attr = lck_grp_attr_alloc_init();
1301         throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
1302
1303         /* Update throttle parameters based on device tree configuration */
1304         throttle_init_throttle_window();
1305
1306         /*
1307          * allocate the lock attribute
1308          */
1309         throttle_lock_attr = lck_attr_alloc_init();
1310
1311         for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1312                 info = &_throttle_io_info[i];
1313
1314                 lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1315                 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1316
1317                 for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1318                         TAILQ_INIT(&info->throttle_uthlist[level]);
1319                         info->throttle_last_IO_pid[level] = 0;
1320                         info->throttle_inflight_count[level] = 0;
1321                 }
1322                 info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1323                 info->throttle_disabled = 0;
1324                 info->throttle_is_fusion_with_priority = 0;
1325         }
1326 #if CONFIG_IOSCHED
1327         if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1328                 iosched_enabled = iosched;
1329         }
1330         if (iosched_enabled) {
1331                 /* Initialize I/O Reprioritization mechanism */
1332                 vm_io_reprioritize_init();
1333         }
1334 #endif
1335 }
1336
1337 void
1338 sys_override_io_throttle(int flag)
1339 {
1340         if (flag == THROTTLE_IO_ENABLE)
1341                 lowpri_throttle_enabled = 1;
1342
1343         if (flag == THROTTLE_IO_DISABLE)
1344                 lowpri_throttle_enabled = 0;
1345 }
1346
1347 int rethrottle_wakeups = 0;
1348
1349 /*
1350  * the uu_rethrottle_lock is used to synchronize this function
1351  * with "throttle_lowpri_io" which is where a throttled thread
1352  * will block... that function will grab this lock before beginning
1353  * it's decision making process concerning the need to block, and
1354  * hold it through the assert_wait.  When that thread is awakened
1355  * for any reason (timer or rethrottle), it will reacquire the
1356  * uu_rethrottle_lock before determining if it really is ok for
1357  * it to now run.  This is the point at which the thread could
1358  * enter a different throttling queue and reblock or return from
1359  * the throttle w/o having waited out it's entire throttle if
1360  * the rethrottle has now moved it out of any currently
1361  * active throttle window.
1362  *
1363  *
1364  * NOTES:
1365  * 1 - This may be called with the task lock held.
1366  * 2 - This may be called with preemption and interrupts disabled
1367  *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1368  * 3 - This cannot safely dereference uu_throttle_info, as it may
1369  *     get deallocated out from under us
1370  */
1371
1372 void
1373 rethrottle_thread(uthread_t ut)
1374 {
1375         /*
1376          * If uthread doesn't have throttle state, then there's no chance
1377          * of it needing a rethrottle.
1378          */
1379         if (ut->uu_throttle_info == NULL)
1380                 return;
1381
1382         boolean_t s = ml_set_interrupts_enabled(FALSE);
1383         lck_spin_lock(&ut->uu_rethrottle_lock);
1384
1385         if (ut->uu_is_throttled == FALSE)
1386                 ut->uu_was_rethrottled = TRUE;
1387         else {
1388                 int my_new_level = throttle_get_thread_throttle_level(ut);
1389
1390                 if (my_new_level != ut->uu_on_throttlelist) {
1391                         /*
1392                          * ut is currently blocked (as indicated by
1393                          * ut->uu_is_throttled == TRUE)
1394                          * and we're changing it's throttle level, so
1395                          * we need to wake it up.
1396                          */
1397                         ut->uu_is_throttled = FALSE;
1398                         wakeup(&ut->uu_on_throttlelist);
1399
1400                         rethrottle_wakeups++;
1401                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0);
1402                 }
1403         }
1404         lck_spin_unlock(&ut->uu_rethrottle_lock);
1405         ml_set_interrupts_enabled(s);
1406 }
1407
1408
1409 /*
1410  * KPI routine
1411  *
1412  * Create and take a reference on a throttle info structure and return a
1413  * pointer for the file system to use when calling throttle_info_update.
1414  * Calling file system must have a matching release for every create.
1415  */
1416 void *
1417 throttle_info_create(void)
1418 {
1419         struct _throttle_io_info_t *info;
1420         int     level;
1421
1422         MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
1423         /* Should never happen but just in case */
1424         if (info == NULL)
1425                 return NULL;
1426         /* Mark that this one was allocated and needs to be freed */
1427         DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1428         info->throttle_alloc = TRUE;
1429
1430         lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1431         info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1432
1433         for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1434                 TAILQ_INIT(&info->throttle_uthlist[level]);
1435         }
1436         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1437
1438         /* Take a reference */
1439         OSIncrementAtomic(&info->throttle_refcnt);
1440         return info;
1441 }
1442
1443 /*
1444  * KPI routine
1445  *
1446  * Release the throttle info pointer if all the reference are gone. Should be
1447  * called to release reference taken by throttle_info_create
1448  */
1449 void
1450 throttle_info_release(void *throttle_info)
1451 {
1452         DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1453                 (struct _throttle_io_info_t *)throttle_info,
1454                 (struct _throttle_io_info_t *)throttle_info);
1455         if (throttle_info) /* Just to be careful */
1456                 throttle_info_rel(throttle_info);
1457 }
1458
1459 /*
1460  * KPI routine
1461  *
1462  * File Systems that create an info structure, need to call this routine in
1463  * their mount routine (used by cluster code). File Systems that call this in
1464  * their mount routines must call throttle_info_mount_rel in their unmount
1465  * routines.
1466  */
1467 void
1468 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1469 {
1470         if ((throttle_info == NULL) || (mp == NULL))
1471                 return;
1472         throttle_info_ref(throttle_info);
1473
1474         /*
1475          * We already have a reference release it before adding the new one
1476          */
1477         if (mp->mnt_throttle_info)
1478                 throttle_info_rel(mp->mnt_throttle_info);
1479         mp->mnt_throttle_info = throttle_info;
1480 }
1481
1482 /*
1483  * Private KPI routine
1484  *
1485  * return a handle for accessing throttle_info given a throttle_mask.  The
1486  * handle must be released by throttle_info_rel_by_mask
1487  */
1488 int
1489 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1490 {
1491         int     dev_index;
1492         struct _throttle_io_info_t *info;
1493
1494         if (throttle_info_handle == NULL)
1495                 return EINVAL;
1496
1497         dev_index = num_trailing_0(throttle_mask);
1498         info = &_throttle_io_info[dev_index];
1499         throttle_info_ref(info);
1500         *(struct _throttle_io_info_t**)throttle_info_handle = info;
1501
1502         return 0;
1503 }
1504
1505 /*
1506  * Private KPI routine
1507  *
1508  * release the handle obtained by throttle_info_ref_by_mask
1509  */
1510 void
1511 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1512 {
1513         /*
1514          * for now the handle is just a pointer to _throttle_io_info_t
1515          */
1516         throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1517 }
1518
1519 /*
1520  * KPI routine
1521  *
1522  * File Systems that throttle_info_mount_ref, must call this routine in their
1523  * umount routine.
1524  */
1525 void
1526 throttle_info_mount_rel(mount_t mp)
1527 {
1528         if (mp->mnt_throttle_info)
1529                 throttle_info_rel(mp->mnt_throttle_info);
1530         mp->mnt_throttle_info = NULL;
1531 }
1532
1533 /*
1534  * Reset throttling periods for the given mount point
1535  *
1536  * private interface used by disk conditioner to reset
1537  * throttling periods when 'is_ssd' status changes
1538  */
1539 void
1540 throttle_info_mount_reset_period(mount_t mp, int isssd)
1541 {
1542         struct _throttle_io_info_t *info;
1543
1544         if (mp == NULL)
1545                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1546         else if (mp->mnt_throttle_info == NULL)
1547                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1548         else
1549                 info = mp->mnt_throttle_info;
1550
1551         throttle_init_throttle_period(info, isssd);
1552 }
1553
1554 void
1555 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1556 {
1557         struct _throttle_io_info_t *info;
1558
1559         if (mp == NULL)
1560                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1561         else if (mp->mnt_throttle_info == NULL)
1562                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1563         else
1564                 info = mp->mnt_throttle_info;
1565
1566         *tv = info->throttle_last_write_timestamp;
1567 }
1568
1569 void
1570 update_last_io_time(mount_t mp)
1571 {
1572         struct _throttle_io_info_t *info;
1573
1574         if (mp == NULL)
1575                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1576         else if (mp->mnt_throttle_info == NULL)
1577                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1578         else
1579                 info = mp->mnt_throttle_info;
1580
1581         microuptime(&info->throttle_last_write_timestamp);
1582         if (mp != NULL)
1583                 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1584 }
1585
1586 int
1587 throttle_get_io_policy(uthread_t *ut)
1588 {
1589         if (ut != NULL)
1590                 *ut = get_bsdthread_info(current_thread());
1591
1592         return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO));
1593 }
1594
1595 int
1596 throttle_get_passive_io_policy(uthread_t *ut)
1597 {
1598         if (ut != NULL)
1599                 *ut = get_bsdthread_info(current_thread());
1600
1601         return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO));
1602 }
1603
1604
1605 static int
1606 throttle_get_thread_throttle_level(uthread_t ut)
1607 {
1608         uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1609         int io_tier = throttle_get_io_policy(ut_p);
1610
1611         return throttle_get_thread_throttle_level_internal(ut, io_tier);
1612 }
1613
1614 /*
1615  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1616  */
1617 static int
1618 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier) {
1619         int thread_throttle_level = io_tier;
1620         int user_idle_level;
1621
1622         assert(ut != NULL);
1623
1624         /* Bootcache misses should always be throttled */
1625         if (ut->uu_throttle_bc == TRUE)
1626                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
1627
1628         /*
1629          * Issue tier3 I/O as tier2 when the user is idle
1630          * to allow maintenance tasks to make more progress.
1631          *
1632          * Assume any positive idle level is enough... for now it's
1633          * only ever 0 or 128 but this is not defined anywhere.
1634          */
1635         if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1636                 user_idle_level = timer_get_user_idle_level();
1637                 if (user_idle_level > 0) {
1638                         thread_throttle_level--;
1639                 }
1640         }
1641
1642         return (thread_throttle_level);
1643 }
1644
1645 /*
1646  * I/O will be throttled if either of the following are true:
1647  *   - Higher tiers have in-flight I/O
1648  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1649  *
1650  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1651  */
1652 static int
1653 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1654 {
1655         struct _throttle_io_info_t *info = throttle_info;
1656         struct timeval elapsed;
1657         struct timeval now;
1658         uint64_t elapsed_msecs;
1659         int     thread_throttle_level;
1660         int     throttle_level;
1661
1662         if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED)
1663                 return (THROTTLE_DISENGAGED);
1664
1665         microuptime(&now);
1666
1667         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1668                 if (info->throttle_inflight_count[throttle_level]) {
1669                         break;
1670                 }
1671                 elapsed = now;
1672                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1673                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1674
1675                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
1676                         break;
1677         }
1678         if (throttle_level >= thread_throttle_level) {
1679                 /*
1680                  * we're beyond all of the throttle windows
1681                  * that affect the throttle level of this thread,
1682                  * so go ahead and treat as normal I/O
1683                  */
1684                 return (THROTTLE_DISENGAGED);
1685         }
1686         if (mylevel)
1687                 *mylevel = thread_throttle_level;
1688         if (throttling_level)
1689                 *throttling_level = throttle_level;
1690
1691         if (info->throttle_io_count != info->throttle_io_count_begin) {
1692                 /*
1693                  * we've already issued at least one throttleable I/O
1694                  * in the current I/O window, so avoid issuing another one
1695                  */
1696                 return (THROTTLE_NOW);
1697         }
1698         /*
1699          * we're in the throttle window, so
1700          * cut the I/O size back
1701          */
1702         return (THROTTLE_ENGAGED);
1703 }
1704
1705 /*
1706  * If we have a mount point and it has a throttle info pointer then
1707  * use it to do the check, otherwise use the device unit number to find
1708  * the correct throttle info array element.
1709  */
1710 int
1711 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1712 {
1713         struct _throttle_io_info_t      *info;
1714
1715         /*
1716          * Should we just return zero if no mount point
1717          */
1718         if (mp == NULL)
1719                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1720         else if (mp->mnt_throttle_info == NULL)
1721                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1722         else
1723                 info = mp->mnt_throttle_info;
1724
1725         if (info->throttle_is_fusion_with_priority) {
1726                 uthread_t ut = get_bsdthread_info(current_thread());
1727                 if (ut->uu_lowpri_window == 0)
1728                         return (THROTTLE_DISENGAGED);
1729         }
1730
1731         if (info->throttle_disabled)
1732                 return (THROTTLE_DISENGAGED);
1733         else
1734                 return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1735 }
1736
1737 /*
1738  * Routine to increment I/O throttling counters maintained in the proc
1739  */
1740
1741 static void
1742 throttle_update_proc_stats(pid_t throttling_pid, int count)
1743 {
1744         proc_t throttling_proc;
1745         proc_t throttled_proc = current_proc();
1746
1747         /* The throttled_proc is always the current proc; so we are not concerned with refs */
1748         OSAddAtomic64(count, &(throttled_proc->was_throttled));
1749
1750         /* The throttling pid might have exited by now */
1751         throttling_proc = proc_find(throttling_pid);
1752         if (throttling_proc != PROC_NULL) {
1753                 OSAddAtomic64(count, &(throttling_proc->did_throttle));
1754                 proc_rele(throttling_proc);
1755         }
1756 }
1757
1758 /*
1759  * Block until woken up by the throttle timer or by a rethrottle call.
1760  * As long as we hold the throttle_lock while querying the throttle tier, we're
1761  * safe against seeing an old throttle tier after a rethrottle.
      *
      * sleep_amount controls how long the thread is willing to wait:
      *   0  - do not wait at all; only tear down the thread's throttle state.
      *   1  - treated as 0 inside the wait loop unless uu_throttle_bc is set,
      *        i.e. the thread then only waits while the state is THROTTLE_NOW.
      *   >0 - while the state is THROTTLE_ENGAGED, stop waiting once that many
      *        throttle I/O periods (throttle_io_period_num) have elapsed.
      *
      * Returns the number of times the thread actually blocked.
      *
      * Returns 0 immediately if uu_lowpri_window is 0.  If the uthread has no
      * throttle info attached, uu_throttle_bc and uu_lowpri_window are cleared
      * and 0 is returned.  Otherwise, on exit, uu_throttle_info,
      * uu_throttle_bc and uu_lowpri_window are cleared and the info reference
      * is dropped with throttle_info_rel.
1762  */
1763 uint32_t
1764 throttle_lowpri_io(int sleep_amount)
1765 {
1766         uthread_t ut;
1767         struct _throttle_io_info_t *info;
1768         int     throttle_type = 0;
1769         int     mylevel = 0;
1770         int     throttling_level = THROTTLE_LEVEL_NONE;
1771         int     sleep_cnt = 0;
1772         uint32_t  throttle_io_period_num = 0;
1773         boolean_t insert_tail = TRUE;
1774         boolean_t s;
1775
1776         ut = get_bsdthread_info(current_thread());
1777
             /* thread was never marked for a low-priority window: nothing to do */
1778         if (ut->uu_lowpri_window == 0)
1779                 return (0);
1780
1781         info = ut->uu_throttle_info;
1782
             /* marked, but no info attached: just clear the marks */
1783         if (info == NULL) {
1784                 ut->uu_throttle_bc = FALSE;
1785                 ut->uu_lowpri_window = 0;
1786                 return (0);
1787         }
1788         lck_mtx_lock(&info->throttle_lock);
1789         assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
1790
             /* caller asked for no delay: go straight to the cleanup path */
1791         if (sleep_amount == 0)
1792                 goto done;
1793
1794         if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
1795                 sleep_amount = 0;
1796
             /* snapshot the period counter so elapsed periods can be measured below */
1797         throttle_io_period_num = info->throttle_io_period_num;
1798
1799         ut->uu_was_rethrottled = FALSE;
1800
             /* loop for as long as the throttle state is not THROTTLE_DISENGAGED */
1801         while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) {
1802
1803                 if (throttle_type == THROTTLE_ENGAGED) {
1804                         if (sleep_amount == 0)
1805                                 break;
                             /* NOTE(review): counter below our snapshot - presumably it wrapped; confirm */
1806                         if (info->throttle_io_period_num < throttle_io_period_num)
1807                                 break;
1808                         if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
1809                                 break;
1810                 }
1811                 /*
1812                  * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
1813                  * then puts us back to the original level before we get a chance to run
1814                  */
1815                 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
1816                         /*
1817                          * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
1818                          * and we've changed our throttling level, so pull ourselves off of the appropriate list
1819                          * and make sure we get put on the tail of the new list since we're starting anew w/r to
1820                          * the throttling engine
1821                          */
1822                         TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1823                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1824                         insert_tail = TRUE;
1825                 }
1826                 if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
                             /* NOTE(review): a THROTTLE_LEVEL_END result is treated as "do not wait" - confirm against throttle_add_to_list */
1827                         if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END)
1828                                 goto done;
1829                 }
1830                 assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
1831
                     /* interrupts stay disabled for as long as the rethrottle spin lock is held */
1832                 s = ml_set_interrupts_enabled(FALSE);
1833                 lck_spin_lock(&ut->uu_rethrottle_lock);
1834
1835                 /*
1836                  * this is the critical section w/r to our interaction
1837                  * with "rethrottle_thread"
1838                  */
1839                 if (ut->uu_was_rethrottled == TRUE) {
1840
                             /* rethrottled since the last evaluation: do not sleep, re-evaluate the level instead */
1841                         lck_spin_unlock(&ut->uu_rethrottle_lock);
1842                         ml_set_interrupts_enabled(s);
1843                         lck_mtx_yield(&info->throttle_lock);
1844
1845                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0);
1846
1847                         ut->uu_was_rethrottled = FALSE;
1848                         continue;
1849                 }
1850                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
1851                                 info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);
1852
                     /* trace start and per-level counter are recorded only for the first sleep of this call */
1853                 if (sleep_cnt == 0) {
1854                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
1855                                               throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
1856                         throttled_count[mylevel]++;
1857                 }
1858                 ut->uu_wmesg = "throttle_lowpri_io";
1859
                     /*
                      * the wait is asserted while both locks are still held; the
                      * thread only blocks after they have been released below
                      */
1860                 assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
1861
1862                 ut->uu_is_throttled = TRUE;
1863                 lck_spin_unlock(&ut->uu_rethrottle_lock);
1864                 ml_set_interrupts_enabled(s);
1865
1866                 lck_mtx_unlock(&info->throttle_lock);
1867
1868                 thread_block(THREAD_CONTINUE_NULL);
1869
1870                 ut->uu_wmesg = NULL;
1871
1872                 ut->uu_is_throttled = FALSE;
1873                 ut->uu_was_rethrottled = FALSE;
1874
1875                 lck_mtx_lock(&info->throttle_lock);
1876
1877                 sleep_cnt++;
1878
                     /*
                      * once no (further) period-based waiting is requested, any
                      * re-add to a throttle list passes insert_tail = FALSE
                      */
1879                 if (sleep_amount == 0)
1880                         insert_tail = FALSE;
1881                 else if (info->throttle_io_period_num < throttle_io_period_num ||
1882                          (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
1883                         insert_tail = FALSE;
1884                         sleep_amount = 0;
1885                 }
1886         }
1887 done:
             /* leave whichever throttle list we are still queued on */
1888         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1889                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1890                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1891         }
1892         lck_mtx_unlock(&info->throttle_lock);
1893
1894         if (sleep_cnt) {
1895                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
1896                                       throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
1897                 /*
1898                  * We update the stats for the last pid which opened a throttle window for the throttled thread.
1899                  * This might not be completely accurate since the multiple throttles seen by the lower tier pid
1900                  * might have been caused by various higher prio pids. However, updating these stats accurately
1901                  * means doing a proc_find while holding the throttle lock which leads to deadlock.
1902                  */
1903                 throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
1904         }
1905
             /* the low-priority window has been consumed: detach the info from the uthread */
1906         ut->uu_throttle_info = NULL;
1907         ut->uu_throttle_bc = FALSE;
1908         ut->uu_lowpri_window = 0;
1909
             /* drop the reference that was held through ut->uu_throttle_info */
1910         throttle_info_rel(info);
1911
1912         return (sleep_cnt);
1913 }
1914
1915 /*
1916  * KPI routine
1917  *
1918  * set a kernel thread's IO policy.  policy can be:
1919  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
1920  *
1921  * explanations about these policies are in the man page of setiopolicy_np
1922  */
1923 void throttle_set_thread_io_policy(int policy)
1924 {
1925         proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
1926 }
1927
1928 int throttle_get_thread_effective_io_policy()
1929 {
1930         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1931 }
1932
1933 void throttle_info_reset_window(uthread_t ut)
1934 {
1935         struct _throttle_io_info_t *info;
1936
1937         if (ut == NULL)
1938                 ut = get_bsdthread_info(current_thread());
1939
1940         if ( (info = ut->uu_throttle_info) ) {
1941                 throttle_info_rel(info);
1942
1943                 ut->uu_throttle_info = NULL;
1944                 ut->uu_lowpri_window = 0;
1945                 ut->uu_throttle_bc = FALSE;
1946         }
1947 }
1948
1949 static
1950 void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
1951 {
1952         if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
1953                 return;
1954
1955         if (info->throttle_io_periods == 0) {
1956                 throttle_init_throttle_period(info, isssd);
1957         }
1958         if (ut->uu_throttle_info == NULL) {
1959
1960                 ut->uu_throttle_info = info;
1961                 throttle_info_ref(info);
1962                 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1963
1964                 ut->uu_lowpri_window = 1;
1965                 ut->uu_throttle_bc = BC_throttle;
1966         }
1967 }
1968
1969 /*
1970  * Update inflight IO count and throttling window
1971  * Should be called when an IO is done
1972  *
1973  * Only affects IO that was sent through spec_strategy
1974  */
1975 void throttle_info_end_io(buf_t bp) {
1976         mount_t mp;
1977         struct bufattr *bap;
1978         struct _throttle_io_info_t *info;
1979         int io_tier;
1980
1981         bap = &bp->b_attr;
1982         if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
1983                 return;
1984         }
1985         CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
1986
1987         mp = buf_vnode(bp)->v_mount;
1988         if (mp != NULL) {
1989                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1990         } else {
1991                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1992         }
1993
1994         io_tier = GET_BUFATTR_IO_TIER(bap);
1995         if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
1996                 io_tier--;
1997         }
1998
1999         throttle_info_end_io_internal(info, io_tier);
2000 }
2001
2002 /*
2003  * Decrement inflight count initially incremented by throttle_info_update_internal
2004  */
2005 static
2006 void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level) {
2007         if (throttle_level == THROTTLE_LEVEL_NONE) {
2008                 return;
2009         }
2010
2011         microuptime(&info->throttle_window_start_timestamp[throttle_level]);
2012         OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
2013         assert(info->throttle_inflight_count[throttle_level] >= 0);
2014 }
2015
2016 /*
2017  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
2018  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
2019  */
2020 static
2021 int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
2022 {
2023         int     thread_throttle_level;
2024
2025         if (lowpri_throttle_enabled == 0 || info->throttle_disabled)
2026                 return THROTTLE_LEVEL_NONE;
2027
2028         if (ut == NULL)
2029                 ut = get_bsdthread_info(current_thread());
2030
2031         if (bap && inflight && !ut->uu_throttle_bc) {
2032                 thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
2033                 if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2034                         thread_throttle_level--;
2035                 }
2036         } else {
2037                 thread_throttle_level = throttle_get_thread_throttle_level(ut);
2038         }
2039
2040         if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
2041         if(!ISSET(flags, B_PASSIVE)) {
2042                         info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
2043                         if (inflight && !ut->uu_throttle_bc) {
2044                                 if (NULL != bap) {
2045                                         SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2046                                 }
2047                                 OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
2048                         } else {
2049                                 microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
2050                         }
2051                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
2052                                         current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
2053                 }
2054                 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2055         }
2056
2057
2058         if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2059                 /*
2060                  * I'd really like to do the IOSleep here, but
2061                  * we may be holding all kinds of filesystem related locks
2062                  * and the pages for this I/O marked 'busy'...
2063                  * we don't want to cause a normal task to block on
2064                  * one of these locks while we're throttling a task marked
2065                  * for low priority I/O... we'll mark the uthread and
2066                  * do the delay just before we return from the system
2067                  * call that triggered this I/O or from vnode_pagein
2068                  */
2069                 OSAddAtomic(1, &info->throttle_io_count);
2070
2071                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2072         }
2073
2074         return thread_throttle_level;
2075 }
2076
2077 void *throttle_info_update_by_mount(mount_t mp)
2078 {
2079         struct _throttle_io_info_t *info;
2080         uthread_t ut;
2081         boolean_t isssd = FALSE;
2082
2083         ut = get_bsdthread_info(current_thread());
2084
2085         if (mp != NULL) {
2086                 if (disk_conditioner_mount_is_ssd(mp))
2087                         isssd = TRUE;
2088                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2089         } else
2090                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2091
2092         if (!ut->uu_lowpri_window)
2093                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2094
2095         return info;
2096 }
2097
2098
2099 /*
2100  * KPI routine
2101  *
2102  * this is usually called before every I/O, used for throttled I/O
2103  * book keeping.  This routine has low overhead and does not sleep
2104  */
2105 void throttle_info_update(void *throttle_info, int flags)
2106 {
2107         if (throttle_info)
2108                 throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2109 }
2110
/*
 * KPI routine
 *
 * Throttled I/O book keeping by handle, normally invoked ahead of every
 * I/O.  It is cheap and never sleeps.
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
        /*
         * Only the lowest bit of the throttle mask is used for now, which
         * makes the handle identical to the throttle_info.  Should a handle
         * ever hold a set of throttle infos, this would become a loop that
         * calls throttle_info_update on each of them.
         */
        throttle_info_update(throttle_info_handle, flags);
}
2129 /*
2130  * KPI routine
2131  *
2132  * This routine marks the throttle info as disabled. Used for mount points which
2133  * support I/O scheduling.
2134  */
2135
2136 void throttle_info_disable_throttle(int devno, boolean_t isfusion)
2137 {
2138         struct _throttle_io_info_t *info;
2139
2140         if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV)
2141                 panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2142
2143         info = &_throttle_io_info[devno];
2144         // don't disable software throttling on devices that are part of a fusion device
2145         // and override the software throttle periods to use HDD periods
2146         if (isfusion) {
2147                 info->throttle_is_fusion_with_priority = isfusion;
2148                 throttle_init_throttle_period(info, FALSE);
2149         }
2150         info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2151         return;
2152 }
2153
2154
2155 /*
2156  * KPI routine (private)
2157  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2158  */
2159 int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2160 {
2161         struct _throttle_io_info_t *info = throttle_info;
2162         struct timeval elapsed;
2163         uint64_t elapsed_msecs;
2164         int     throttle_level;
2165         int     thread_throttle_level;
2166
2167         switch (policy) {
2168
2169         case IOPOL_THROTTLE:
2170                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
2171                 break;
2172         case IOPOL_UTILITY:
2173                 thread_throttle_level = THROTTLE_LEVEL_TIER2;
2174                 break;
2175         case IOPOL_STANDARD:
2176                 thread_throttle_level = THROTTLE_LEVEL_TIER1;
2177                 break;
2178         default:
2179                 thread_throttle_level = THROTTLE_LEVEL_TIER0;
2180                 break;
2181         }
2182         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2183                 if (info->throttle_inflight_count[throttle_level]) {
2184                         break;
2185                 }
2186
2187                 microuptime(&elapsed);
2188                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2189                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2190
2191                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
2192                         break;
2193         }
2194         if (throttle_level >= thread_throttle_level) {
2195                 /*
2196                  * we're beyond all of the throttle windows
2197                  * so go ahead and treat as normal I/O
2198                  */
2199                 return (THROTTLE_DISENGAGED);
2200         }
2201         /*
2202          * we're in the throttle window
2203          */
2204         return (THROTTLE_ENGAGED);
2205 }
2206
2207 int throttle_lowpri_window(void)
2208 {
2209         struct uthread *ut = get_bsdthread_info(current_thread());
2210         return ut->uu_lowpri_window;
2211 }
2212
2213
2214 #if CONFIG_IOSCHED
2215 int upl_get_cached_tier(void *);
2216 #endif
2217
2218 int
2219 spec_strategy(struct vnop_strategy_args *ap)
2220 {
2221         buf_t   bp;
2222         int     bflags;
2223         int     io_tier;
2224         int     passive;
2225         dev_t   bdev;
2226         uthread_t ut;
2227         mount_t mp;
2228         struct  bufattr *bap;
2229         int     strategy_ret;
2230         struct _throttle_io_info_t *throttle_info;
2231         boolean_t isssd = FALSE;
2232         boolean_t inflight = FALSE;
2233         boolean_t upgrade = FALSE;
2234         int code = 0;
2235
2236 #if !CONFIG_EMBEDDED
2237         proc_t curproc = current_proc();
2238 #endif /* !CONFIG_EMBEDDED */
2239
2240         bp = ap->a_bp;
2241         bdev = buf_device(bp);
2242         mp = buf_vnode(bp)->v_mount;
2243         bap = &bp->b_attr;
2244
2245 #if CONFIG_IOSCHED
2246        if (bp->b_flags & B_CLUSTER) {
2247
2248                io_tier = upl_get_cached_tier(bp->b_upl);
2249
2250                if (io_tier == -1)
2251                        io_tier = throttle_get_io_policy(&ut);
2252 #if DEVELOPMENT || DEBUG
2253                else {
2254                        int my_io_tier = throttle_get_io_policy(&ut);
2255
2256                        if (io_tier != my_io_tier)
2257                                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
2258                }
2259 #endif
2260        } else
2261                io_tier = throttle_get_io_policy(&ut);
2262 #else
2263         io_tier = throttle_get_io_policy(&ut);
2264 #endif
2265         passive = throttle_get_passive_io_policy(&ut);
2266
2267         /*
2268          * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
2269          * while preserving the original issued tier (throttle_get_io_policy
2270          * does not return upgraded tiers)
2271          */
2272         if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
2273 #if CONFIG_IOSCHED
2274                 if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2275                         upgrade = TRUE;
2276                 }
2277 #else /* CONFIG_IOSCHED */
2278                 upgrade = TRUE;
2279 #endif /* CONFIG_IOSCHED */
2280         }
2281
2282         if (bp->b_flags & B_META)
2283                 bap->ba_flags |= BA_META;
2284
2285 #if CONFIG_IOSCHED
2286         /*
2287          * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
2288          * To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
2289          * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
2290          * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
2291          */
2292         if (bap->ba_flags & BA_META) {
2293                 if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2294                         if (bp->b_flags & B_READ) {
2295                                 if (io_tier > IOSCHED_METADATA_TIER) {
2296                                         io_tier = IOSCHED_METADATA_TIER;
2297                                         passive = 1;
2298                                 }
2299                         } else {
2300                                 io_tier = IOSCHED_METADATA_TIER;
2301                                 passive = 1;
2302                         }
2303                 }
2304         }
2305 #endif /* CONFIG_IOSCHED */
2306
2307         SET_BUFATTR_IO_TIER(bap, io_tier);
2308
2309         if (passive) {
2310                 bp->b_flags |= B_PASSIVE;
2311                 bap->ba_flags |= BA_PASSIVE;
2312         }
2313
2314 #if !CONFIG_EMBEDDED
2315         if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
2316                 bap->ba_flags |= BA_DELAYIDLESLEEP;
2317 #endif /* !CONFIG_EMBEDDED */
2318
2319         bflags = bp->b_flags;
2320
2321         if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0))
2322                 bufattr_markquickcomplete(bap);
2323
2324         if (bflags & B_READ)
2325                 code |= DKIO_READ;
2326         if (bflags & B_ASYNC)
2327                 code |= DKIO_ASYNC;
2328
2329         if (bap->ba_flags & BA_META)
2330                 code |= DKIO_META;
2331         else if (bflags & B_PAGEIO)
2332                 code |= DKIO_PAGING;
2333
2334         if (io_tier != 0)
2335                 code |= DKIO_THROTTLE;
2336
2337         code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
2338
2339         if (bflags & B_PASSIVE)
2340                 code |= DKIO_PASSIVE;
2341
2342         if (bap->ba_flags & BA_NOCACHE)
2343                 code |= DKIO_NOCACHE;
2344
2345         if (upgrade) {
2346                 code |= DKIO_TIER_UPGRADE;
2347                 SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
2348         }
2349
2350         if (kdebug_enable) {
2351                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
2352                                           buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0);
2353         }
2354
2355         thread_update_io_stats(current_thread(), buf_count(bp), code);
2356
2357         if (mp != NULL) {
2358                 if (disk_conditioner_mount_is_ssd(mp))
2359                         isssd = TRUE;
2360                 /*
2361                  * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
2362                  * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
2363                  * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
2364                  */
2365                 if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
2366                         inflight = TRUE;
2367                 }
2368                 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
2369
2370         } else
2371                 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2372
2373         throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);
2374
2375         if ((bflags & B_READ) == 0) {
2376                 microuptime(&throttle_info->throttle_last_write_timestamp);
2377
2378                 if (mp) {
2379                         mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
2380                         INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
2381                 }
2382         } else if (mp) {
2383                 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
2384         }
2385         /*
2386          * The BootCache may give us special information about
2387          * the IO, so it returns special values that we check
2388          * for here.
2389          *
2390          * IO_SATISFIED_BY_CACHE
2391          * The read has been satisfied by the boot cache. Don't
2392          * throttle the thread unnecessarily.
2393          *
2394          * IO_SHOULD_BE_THROTTLED
2395          * The boot cache is playing back a playlist and this IO
2396          * cut through. Throttle it so we're not cutting through
2397          * the boot cache too often.
2398          *
2399          * Note that typical strategy routines are defined with
2400          * a void return so we'll get garbage here. In the
2401          * unlikely case the garbage matches our special return
2402          * value, it's not a big deal since we're only adjusting
2403          * the throttling delay.
2404          */
2405 #define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
2406 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2407         typedef int strategy_fcn_ret_t(struct buf *bp);
2408
2409         strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
2410
2411         // disk conditioner needs to track when this I/O actually starts
2412         // which means track it after `strategy` which may include delays
2413         // from inflight I/Os
2414         microuptime(&bp->b_timestamp_tv);
2415
2416         if (IO_SATISFIED_BY_CACHE == strategy_ret) {
2417                 /*
2418                  * If this was a throttled IO satisfied by the boot cache,
2419                  * don't delay the thread.
2420                  */
2421                 throttle_info_reset_window(ut);
2422
2423         } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
2424                 /*
2425                  * If the boot cache indicates this IO should be throttled,
2426                  * delay the thread.
2427                  */
2428                 throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
2429         }
2430         return (0);
2431 }
2432
2433
2434 /*
2435  * This is a noop, simply returning what one has been given.
2436  */
2437 int
2438 spec_blockmap(__unused struct vnop_blockmap_args *ap)
2439 {
2440         return (ENOTSUP);
2441 }
2442
2443
2444 /*
2445  * Device close routine
2446  */
2447 int
2448 spec_close(struct vnop_close_args *ap)
2449 {
2450         struct vnode *vp = ap->a_vp;
2451         dev_t dev = vp->v_rdev;
2452         int error = 0;
2453         int flags = ap->a_fflag;
2454         struct proc *p = vfs_context_proc(ap->a_context);
2455         struct session *sessp;
2456
2457         switch (vp->v_type) {
2458
2459         case VCHR:
2460                 /*
2461                  * Hack: a tty device that is a controlling terminal
2462                  * has a reference from the session structure.
2463                  * We cannot easily tell that a character device is
2464                  * a controlling terminal, unless it is the closing
2465                  * process' controlling terminal.  In that case,
2466                  * if the reference count is 1 (this is the very
2467                  * last close)
2468                  */
2469                 sessp = proc_session(p);
2470                 devsw_lock(dev, S_IFCHR);
2471                 if (sessp != SESSION_NULL) {
2472                         if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
2473                                 struct tty *tp = TTY_NULL;
2474
2475                                 devsw_unlock(dev, S_IFCHR);
2476                                 session_lock(sessp);
2477                                 if (vp == sessp->s_ttyvp) {
2478                                         tp = SESSION_TP(sessp);
2479                                         sessp->s_ttyvp = NULL;
2480                                         sessp->s_ttyvid = 0;
2481                                         sessp->s_ttyp = TTY_NULL;
2482                                         sessp->s_ttypgrpid = NO_PID;
2483                                 }
2484                                 session_unlock(sessp);
2485
2486                                 if (tp != TTY_NULL) {
2487                                         /*
2488                                          * We may have won a race with a proc_exit
2489                                          * of the session leader, the winner
2490                                          * clears the flag (even if not set)
2491                                          */
2492                                         tty_lock(tp);
2493                                         ttyclrpgrphup(tp);
2494                                         tty_unlock(tp);
2495
2496                                         ttyfree(tp);
2497                                 }
2498                                 devsw_lock(dev, S_IFCHR);
2499                         }
2500                         session_rele(sessp);
2501                 }
2502
2503                 if (--vp->v_specinfo->si_opencount < 0)
2504                         panic("negative open count (c, %u, %u)", major(dev), minor(dev));
2505
2506                 /*
2507                  * close on last reference or on vnode revoke call
2508                  */
2509                 if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
2510                         error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
2511
2512                 devsw_unlock(dev, S_IFCHR);
2513                 break;
2514
2515         case VBLK:
2516                 /*
2517                  * If there is more than one outstanding open, don't
2518                  * send the close to the device.
2519                  */
2520                 devsw_lock(dev, S_IFBLK);
2521                 if (vcount(vp) > 1) {
2522                         vp->v_specinfo->si_opencount--;
2523                         devsw_unlock(dev, S_IFBLK);
2524                         return (0);
2525                 }
2526                 devsw_unlock(dev, S_IFBLK);
2527
2528                 /*
2529                  * On last close of a block device (that isn't mounted)
2530                  * we must invalidate any in core blocks, so that
2531                  * we can, for instance, change floppy disks.
2532                  */
2533                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
2534                         return (error);
2535
2536                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
2537                 if (error)
2538                         return (error);
2539
2540                 devsw_lock(dev, S_IFBLK);
2541
2542                 if (--vp->v_specinfo->si_opencount < 0)
2543                         panic("negative open count (b, %u, %u)", major(dev), minor(dev));
2544
2545                 if (vcount(vp) == 0)
2546                         error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
2547
2548                 devsw_unlock(dev, S_IFBLK);
2549                 break;
2550
2551         default:
2552                 panic("spec_close: not special");
2553                 return(EBADF);
2554         }
2555
2556         return error;
2557 }
2558
2559 /*
2560  * Return POSIX pathconf information applicable to special devices.
2561  */
2562 int
2563 spec_pathconf(struct vnop_pathconf_args *ap)
2564 {
2565
2566         switch (ap->a_name) {
2567         case _PC_LINK_MAX:
2568                 *ap->a_retval = LINK_MAX;
2569                 return (0);
2570         case _PC_MAX_CANON:
2571                 *ap->a_retval = MAX_CANON;
2572                 return (0);
2573         case _PC_MAX_INPUT:
2574                 *ap->a_retval = MAX_INPUT;
2575                 return (0);
2576         case _PC_PIPE_BUF:
2577                 *ap->a_retval = PIPE_BUF;
2578                 return (0);
2579         case _PC_CHOWN_RESTRICTED:
2580                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2581                 return (0);
2582         case _PC_VDISABLE:
2583                 *ap->a_retval = _POSIX_VDISABLE;
2584                 return (0);
2585         default:
2586                 return (EINVAL);
2587         }
2588         /* NOTREACHED */
2589 }
2590
2591 /*
2592  * Special device failed operation
2593  */
2594 int
2595 spec_ebadf(__unused void *dummy)
2596 {
2597
2598         return (EBADF);
2599 }
2600
2601 /* Blktooff derives file offset from logical block number */
2602 int
2603 spec_blktooff(struct vnop_blktooff_args *ap)
2604 {
2605         struct vnode *vp = ap->a_vp;
2606
2607         switch (vp->v_type) {
2608         case VCHR:
2609                 *ap->a_offset = (off_t)-1; /* failure */
2610                 return (ENOTSUP);
2611
2612         case VBLK:
2613                 printf("spec_blktooff: not implemented for VBLK\n");
2614                 *ap->a_offset = (off_t)-1; /* failure */
2615                 return (ENOTSUP);
2616
2617         default:
2618                 panic("spec_blktooff type");
2619         }
2620         /* NOTREACHED */
2621
2622         return (0);
2623 }
2624
2625 /* Offtoblk derives logical block number from file offset */
2626 int
2627 spec_offtoblk(struct vnop_offtoblk_args *ap)
2628 {
2629         struct vnode *vp = ap->a_vp;
2630
2631         switch (vp->v_type) {
2632         case VCHR:
2633                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2634                 return (ENOTSUP);
2635
2636         case VBLK:
2637                 printf("spec_offtoblk: not implemented for VBLK\n");
2638                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2639                 return (ENOTSUP);
2640
2641         default:
2642                 panic("spec_offtoblk type");
2643         }
2644         /* NOTREACHED */
2645
2646         return (0);
2647 }
2648
2649 static void filt_specdetach(struct knote *kn);
2650 static int filt_specevent(struct knote *kn, long hint);
2651 static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev);
2652 static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
2653 static unsigned filt_specpeek(struct knote *kn);
2654
2655 SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
2656         .f_isfd    = 1,
2657         .f_attach  = filt_specattach,
2658         .f_detach  = filt_specdetach,
2659         .f_event   = filt_specevent,
2660         .f_touch   = filt_spectouch,
2661         .f_process = filt_specprocess,
2662         .f_peek    = filt_specpeek
2663 };
2664
2665
/*
 * Map a waitq that is assumed to be embedded in a selinfo structure back
 * to that containing selinfo.  'wq' is not really a queue element;
 * qe_element() is used here only for its offset-of arithmetic, which
 * recovers the enclosing struct from the struct type and member name.
 */
#define selinfo_from_waitq(wq) qe_element((wq), struct selinfo, si_waitq)
2674
2675 static int
2676 spec_knote_select_and_link(struct knote *kn)
2677 {
2678         uthread_t uth;
2679         vfs_context_t ctx;
2680         vnode_t vp;
2681         struct waitq_set *old_wqs;
2682         uint64_t rsvd, rsvd_arg;
2683         uint64_t *rlptr = NULL;
2684         struct selinfo *si = NULL;
2685         int selres = 0;
2686
2687         uth = get_bsdthread_info(current_thread());
2688
2689         ctx = vfs_context_current();
2690         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2691
2692         int error = vnode_getwithvid(vp, kn->kn_hookid);
2693         if (error != 0) {
2694                 knote_set_error(kn, ENOENT);
2695                 return 0;
2696         }
2697
2698         /*
2699          * This function may be called many times to link or re-link the
2700          * underlying vnode to the kqueue.  If we've already linked the two,
2701          * we will have a valid kn_hook_data which ties us to the underlying
2702          * device's waitq via a the waitq's prepost table object. However,
2703          * devices can abort any select action by calling selthreadclear().
2704          * This is OK because the table object will be invalidated by the
2705          * driver (through a call to selthreadclear), so any attempt to access
2706          * the associated waitq will fail because the table object is invalid.
2707          *
2708          * Even if we've already registered, we need to pass a pointer
2709          * to a reserved link structure. Otherwise, selrecord() will
2710          * infer that we're in the second pass of select() and won't
2711          * actually do anything!
2712          */
2713         rsvd = rsvd_arg = waitq_link_reserve(NULL);
2714         rlptr = (void *)&rsvd_arg;
2715
2716         /*
2717          * Trick selrecord() into hooking kqueue's wait queue set into the device's
2718          * selinfo wait queue.
2719          */
2720         old_wqs = uth->uu_wqset;
2721         uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);
2722         /*
2723          * Now these are the laws of VNOP_SELECT, as old and as true as the sky,
2724          * And the device that shall keep it may prosper, but the device that shall
2725          * break it must receive ENODEV:
2726          *
2727          * 1. Take a lock to protect against other selects on the same vnode.
2728          * 2. Return 1 if data is ready to be read.
2729          * 3. Return 0 and call `selrecord` on a handy `selinfo` structure if there
2730          *    is no data.
2731          * 4. Call `selwakeup` when the vnode has an active `selrecord` and data
2732          *    can be read or written (depending on the seltype).
2733          * 5. If there's a `selrecord` and no corresponding `selwakeup`, but the
2734          *    vnode is going away, call `selthreadclear`.
2735          */
2736         selres = VNOP_SELECT(vp, knote_get_seltype(kn), 0, rlptr, ctx);
2737         uth->uu_wqset = old_wqs;
2738
2739         /*
2740          * Make sure to cleanup the reserved link - this guards against
2741          * drivers that may not actually call selrecord().
2742          */
2743         waitq_link_release(rsvd);
2744         if (rsvd != rsvd_arg) {
2745                 /* The driver / handler called selrecord() */
2746                 struct waitq *wq;
2747                 memcpy(&wq, rlptr, sizeof(void *));
2748
2749                 /*
2750                  * The waitq is part of the selinfo structure managed by the
2751                  * driver. For certain drivers, we want to hook the knote into
2752                  * the selinfo structure's si_note field so selwakeup can call
2753                  * KNOTE.
2754                  */
2755                 si = selinfo_from_waitq(wq);
2756
2757                 /*
2758                  * The waitq_get_prepost_id() function will (potentially)
2759                  * allocate a prepost table object for the waitq and return
2760                  * the table object's ID to us.  It will also set the
2761                  * waitq_prepost_id field within the waitq structure.
2762                  *
2763                  * We can just overwrite kn_hook_data because it's simply a
2764                  * table ID used to grab a reference when needed.
2765                  *
2766                  * We have a reference on the vnode, so we know that the
2767                  * device won't go away while we get this ID.
2768                  */
2769                 kn->kn_hook_data = waitq_get_prepost_id(wq);
2770         } else if (selres == 0) {
2771                 /*
2772                  * The device indicated that there's no data to read, but didn't call
2773                  * `selrecord`.  Nothing will be notified of changes to this vnode, so
2774                  * return an error back to user space, to make it clear that the knote
2775                  * is not attached.
2776                  */
2777                 knote_set_error(kn, ENODEV);
2778         }
2779
2780         vnode_put(vp);
2781
2782         return selres;
2783 }
2784
2785 static void filt_spec_common(struct knote *kn, int selres)
2786 {
2787         if (kn->kn_vnode_use_ofst) {
2788                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
2789                         kn->kn_data = 0;
2790                 } else {
2791                         kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
2792                 }
2793         } else {
2794                 kn->kn_data = selres;
2795         }
2796 }
2797
2798 static int
2799 filt_specattach(struct knote *kn, __unused struct kevent_internal_s *kev)
2800 {
2801         vnode_t vp;
2802         dev_t dev;
2803
2804         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
2805
2806         assert(vnode_ischr(vp));
2807
2808         dev = vnode_specrdev(vp);
2809
2810         /*
2811          * For a few special kinds of devices, we can attach knotes with
2812          * no restrictions because their "select" vectors return the amount
2813          * of data available.  Others require an explicit NOTE_LOWAT with
2814          * data of 1, indicating that the caller doesn't care about actual
2815          * data counts, just an indication that the device has data.
2816          */
2817         if (!kn->kn_vnode_kqok &&
2818             ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
2819                 knote_set_error(kn, EINVAL);
2820                 return 0;
2821         }
2822
2823         /*
2824          * This forces the select fallback to call through VNOP_SELECT and hook
2825          * up selinfo on every filter routine.
2826          *
2827          * Pseudo-terminal controllers are opted out of native kevent support --
2828          * remove this when they get their own EVFILTID.
2829          */
2830         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
2831                 kn->kn_vnode_kqok = 0;
2832         }
2833
2834         kn->kn_filtid = EVFILTID_SPEC;
2835         kn->kn_hook_data = 0;
2836         kn->kn_hookid = vnode_vid(vp);
2837
2838         knote_markstayactive(kn);
2839         return spec_knote_select_and_link(kn);
2840 }
2841
2842 static void
2843 filt_specdetach(struct knote *kn)
2844 {
2845         knote_clearstayactive(kn);
2846
2847         /*
2848          * This is potentially tricky: the device's selinfo waitq that was
2849          * tricked into being part of this knote's waitq set may not be a part
2850          * of any other set, and the device itself may have revoked the memory
2851          * in which the waitq was held. We use the knote's kn_hook_data field
2852          * to keep the ID of the waitq's prepost table object. This
2853          * object keeps a pointer back to the waitq, and gives us a safe way
2854          * to decouple the dereferencing of driver allocated memory: if the
2855          * driver goes away (taking the waitq with it) then the prepost table
2856          * object will be invalidated. The waitq details are handled in the
2857          * waitq API invoked here.
2858          */
2859         if (kn->kn_hook_data) {
2860                 waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs));
2861                 kn->kn_hook_data = 0;
2862         }
2863 }
2864
2865 static int
2866 filt_specevent(struct knote *kn, __unused long hint)
2867 {
2868         /*
2869          * Nothing should call knote or knote_vanish on this knote.
2870          */
2871         panic("filt_specevent(%p)", kn);
2872         return 0;
2873 }
2874
2875 static int
2876 filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
2877 {
2878         kn->kn_sdata = kev->data;
2879         kn->kn_sfflags = kev->fflags;
2880         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
2881                 kn->kn_udata = kev->udata;
2882
2883         if (kev->flags & EV_ENABLE) {
2884                 return spec_knote_select_and_link(kn);
2885         }
2886
2887         return 0;
2888 }
2889
2890 static int
2891 filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
2892 {
2893 #pragma unused(data)
2894         vnode_t vp;
2895         uthread_t uth;
2896         vfs_context_t ctx;
2897         int res;
2898         int selres;
2899         int error;
2900
2901         uth = get_bsdthread_info(current_thread());
2902         ctx = vfs_context_current();
2903         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2904
2905         /* FIXME JMM - locking against touches? */
2906
2907         error = vnode_getwithvid(vp, kn->kn_hookid);
2908         if (error != 0) {
2909                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2910                 *kev = kn->kn_kevent;
2911                 return 1;
2912         }
2913
2914         selres = spec_knote_select_and_link(kn);
2915         filt_spec_common(kn, selres);
2916
2917         vnode_put(vp);
2918
2919         res = ((kn->kn_sfflags & NOTE_LOWAT) != 0) ?
2920                 (kn->kn_data >= kn->kn_sdata) : kn->kn_data;
2921
2922         if (res) {
2923                 *kev = kn->kn_kevent;
2924                 if (kn->kn_flags & EV_CLEAR) {
2925                         kn->kn_fflags = 0;
2926                         kn->kn_data = 0;
2927                 }
2928         }
2929
2930         return res;
2931 }
2932
2933 static unsigned
2934 filt_specpeek(struct knote *kn)
2935 {
2936         int selres = 0;
2937
2938         selres = spec_knote_select_and_link(kn);
2939         filt_spec_common(kn, selres);
2940
2941         return kn->kn_data;
2942 }
2943