bsd/miscfs/specfs/spec_vnops.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993, 1995
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/kauth.h>
  67 #include <sys/systm.h>
  68 #include <sys/kernel.h>
  69 #include <sys/conf.h>
  70 #include <sys/buf_internal.h>
  71 #include <sys/mount_internal.h>
  72 #include <sys/vnode_internal.h>
  73 #include <sys/file_internal.h>
  74 #include <sys/namei.h>
  75 #include <sys/stat.h>
  76 #include <sys/errno.h>
  77 #include <sys/ioctl.h>
  78 #include <sys/file.h>
  79 #include <sys/user.h>
  80 #include <sys/malloc.h>
  81 #include <sys/disk.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/resource.h>
  84 #include <machine/machine_routines.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <vfs/vfs_support.h>
  87 #include <vfs/vfs_disk_conditioner.h>
  88
  89 #include <kern/assert.h>
  90 #include <kern/task.h>
  91 #include <kern/sched_prim.h>
  92 #include <kern/thread.h>
  93 #include <kern/policy_internal.h>
  94 #include <kern/timer_call.h>
  95 #include <kern/waitq.h>
  96
  97 #include <pexpert/pexpert.h>
  98
  99 #include <sys/kdebug.h>
 100 #include <libkern/section_keywords.h>
 101
  102 /* XXX the following prototypes should be in a header file somewhere */
  103 extern dev_t    chrtoblk(dev_t dev);              /* spec_open() maps a character device to its block device with this and compares the result against NODEV */
  104 extern boolean_t        iskmemdev(dev_t dev);     /* spec_open() refuses write opens when this is true (/dev/mem, /dev/kmem) */
  105 extern int bpfkqfilter(dev_t dev, struct knote *kn);      /* kqueue filter hooks implemented outside this file (presumably bpf and the pty slave/master drivers -- confirm) */
  106 extern int ptsd_kqfilter(dev_t, struct knote *);
  107 extern int ptmx_kqfilter(dev_t, struct knote *);
 108
  109 struct vnode *speclisth[SPECHSZ];   /* table of SPECHSZ vnode pointers; presumably hash-chain heads for special-device vnodes -- confirm against specdev.h */
  110
  111 /* Symbolic sleep message strings for devices (presumably used as wait-message names by device code -- confirm against users). */
  112 char    devopn[] = "devopn";
  113 char    devio[] = "devio";
  114 char    devwait[] = "devwait";
  115 char    devin[] = "devin";
  116 char    devout[] = "devout";
  117 char    devioc[] = "devioc";
  118 char    devcls[] = "devcls";
 119
 120 #define VOPFUNC int (*)(void *)
 121
 122 int(**spec_vnodeop_p)(void *);
 123 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 124         { .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)vn_default_error },
 125         { .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },            /* lookup */
 126         { .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
 127         { .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },               /* mknod */
 128         { .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                        /* open */
 129         { .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close },              /* close */
 130         { .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access },            /* access */
 131         { .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr },          /* getattr */
 132         { .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr },          /* setattr */
 133         { .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read },                        /* read */
 134         { .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write },              /* write */
 135         { .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },              /* ioctl */
 136         { .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },            /* select */
 137         { .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke },             /* revoke */
 138         { .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
 139         { .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },              /* fsync */
 140         { .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },             /* remove */
 141         { .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                 /* link */
 142         { .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },             /* rename */
 143         { .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },               /* mkdir */
 144         { .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },               /* rmdir */
 145         { .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
 146         { .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
 147         { .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },         /* readlink */
 148         { .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive },         /* inactive */
 149         { .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim },           /* reclaim */
 150         { .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },                /* strategy */
 151         { .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },                /* pathconf */
 152         { .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
 153         { .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite },            /* bwrite */
 154         { .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
 155         { .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
 156         { .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
 157         { .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },                /* blktooff */
 158         { .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },                /* offtoblk */
 159         { .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },                /* blockmap */
 160         { .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
 161 };
 162 const struct vnodeopv_desc spec_vnodeop_opv_desc =
 163 { .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries };
 164
 165
  166 static void set_blocksize(vnode_t, dev_t);        /* forward declaration; the definition follows spec_lookup() */
  167 /* Throttle window lengths for tiers 1-3, in milliseconds (per the _MSECS suffix). */
  168 #define LOWPRI_TIER1_WINDOW_MSECS         25
  169 #define LOWPRI_TIER2_WINDOW_MSECS         100
  170 #define LOWPRI_TIER3_WINDOW_MSECS         500
  171 /* I/O period lengths for tiers 1-3, in milliseconds. */
  172 #define LOWPRI_TIER1_IO_PERIOD_MSECS      40
  173 #define LOWPRI_TIER2_IO_PERIOD_MSECS      85
  174 #define LOWPRI_TIER3_IO_PERIOD_MSECS      200
  175 /* I/O period lengths for tiers 1-3 on solid-state devices, in milliseconds. */
  176 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
  177 #define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
  178 #define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25
  179
  180 /* Table form of the constants above: slot 0 is 0, slots 1-3 hold the TIER1..TIER3 values.  Each table has THROTTLE_LEVEL_END + 1 slots. */
  181 int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
  182         0,
  183         LOWPRI_TIER1_WINDOW_MSECS,
  184         LOWPRI_TIER2_WINDOW_MSECS,
  185         LOWPRI_TIER3_WINDOW_MSECS,
  186 };
  187
  188 int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
  189         0,
  190         LOWPRI_TIER1_IO_PERIOD_MSECS,
  191         LOWPRI_TIER2_IO_PERIOD_MSECS,
  192         LOWPRI_TIER3_IO_PERIOD_MSECS,
  193 };
  194
  195 int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
  196         0,
  197         LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
  198         LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
  199         LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
  200 };
  201
  202 /* One counter per table slot, zero-initialized as a file-scope object; presumably counts throttled requests per level -- confirm against the code that updates it. */
  203 int     throttled_count[THROTTLE_LEVEL_END + 1];
 204
  205 struct _throttle_io_info_t {              /* per-device I/O throttling state; instances live in the _throttle_io_info[] array */
  206         lck_mtx_t       throttle_lock;          /* mutex; presumably guards the fields below -- confirm against users */
  207
  208         struct timeval  throttle_last_write_timestamp;  /* stamped via microuptime() by spec_write() on throttleable disk character devices */
  209         struct timeval  throttle_min_timer_deadline;
  210         struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
  211         struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];     /* this and the arrays below have THROTTLE_LEVEL_END + 1 slots, matching the throttle tables */
  212         pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
  213         struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
  214         int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];
  215
  216         TAILQ_HEAD(, uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];         /* Lists of throttled uthreads */
  217         int             throttle_next_wake_level;
  218
  219         thread_call_t   throttle_timer_call;
  220         int32_t throttle_timer_ref;
  221         int32_t throttle_timer_active;
  222
  223         int32_t throttle_io_count;
  224         int32_t throttle_io_count_begin;
  225         int    *throttle_io_periods;            /* presumably points at one of the throttle_io_period*_msecs tables -- confirm */
  226         uint32_t throttle_io_period_num;
  227
  228         int32_t throttle_refcnt;
  229         int32_t throttle_alloc;
  230         int32_t throttle_disabled;
  231         int32_t throttle_is_fusion_with_priority;
  232 };
 233
  234 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; /* spec_read()/spec_write() index this by the vnode's si_devbsdunit */
  235
  236
  237 int     lowpri_throttle_enabled = 1;      /* non-zero by default; presumably the global switch for low-priority I/O throttling -- confirm against users */
  238
  239 /* Forward declarations of throttling helpers.  spec_read()/spec_write() call throttle_info_update_internal() before a raw disk transfer and pass the level it returns to throttle_info_end_io_internal() afterwards. */
  240 static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
  241 static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
  242 static int throttle_get_thread_throttle_level(uthread_t ut);
  243 static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
  244 void throttle_info_mount_reset_period(mount_t mp, int isssd);
 245
 246 /*
 247  * Trivial lookup routine that always fails.
 248  */
 249 int
 250 spec_lookup(struct vnop_lookup_args *ap)
 251 {
 252         *ap->a_vpp = NULL;
 253         return ENOTDIR;
 254 }
 255
 256 static void
 257 set_blocksize(struct vnode *vp, dev_t dev)
 258 {
 259         int (*size)(dev_t);
 260         int rsize;
 261
 262         if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
 263                 rsize = (*size)(dev);
 264                 if (rsize <= 0) { /* did size fail? */
 265                         vp->v_specsize = DEV_BSIZE;
 266                 } else {
 267                         vp->v_specsize = rsize;
 268                 }
 269         } else {
 270                 vp->v_specsize = DEV_BSIZE;
 271         }
 272 }
 273
 274 void
 275 set_fsblocksize(struct vnode *vp)
 276 {
 277         if (vp->v_type == VBLK) {
 278                 dev_t dev = (dev_t)vp->v_rdev;
 279                 int maj = major(dev);
 280
 281                 if ((u_int)maj >= (u_int)nblkdev) {
 282                         return;
 283                 }
 284
 285                 vnode_lock(vp);
 286                 set_blocksize(vp, dev);
 287                 vnode_unlock(vp);
 288         }
 289 }
 290
 291
 292 /*
 293  * Open a special file.
 294  */
 295 int
 296 spec_open(struct vnop_open_args *ap)
 297 {
 298         struct proc *p = vfs_context_proc(ap->a_context);
 299         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
 300         struct vnode *vp = ap->a_vp;
 301         dev_t bdev, dev = (dev_t)vp->v_rdev;
 302         int maj = major(dev);
 303         int error;
 304
 305         /*
 306          * Don't allow open if fs is mounted -nodev.
 307          */
 308         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) {
 309                 return ENXIO;
 310         }
 311
 312         switch (vp->v_type) {
 313         case VCHR:
 314                 if ((u_int)maj >= (u_int)nchrdev) {
 315                         return ENXIO;
 316                 }
 317                 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
 318 #if 0
 319                         /*
 320                          * When running in very secure mode, do not allow
 321                          * opens for writing of any disk character devices.
 322                          */
 323                         if (securelevel >= 2 && isdisk(dev, VCHR)) {
 324                                 return EPERM;
 325                         }
 326 #endif
 327
 328                         /* Never allow writing to /dev/mem or /dev/kmem */
 329                         if (iskmemdev(dev)) {
 330                                 return EPERM;
 331                         }
 332                         /*
 333                          * When running in secure mode, do not allow opens for
 334                          * writing of character devices whose corresponding block
 335                          * devices are currently mounted.
 336                          */
 337                         if (securelevel >= 1) {
 338                                 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) {
 339                                         return error;
 340                                 }
 341                         }
 342                 }
 343
 344                 devsw_lock(dev, S_IFCHR);
 345                 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
 346
 347                 if (error == 0) {
 348                         vp->v_specinfo->si_opencount++;
 349                 }
 350
 351                 devsw_unlock(dev, S_IFCHR);
 352
 353                 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
 354                         int     isssd = 0;
 355                         uint64_t throttle_mask = 0;
 356                         uint32_t devbsdunit = 0;
 357
 358                         if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
 359                                 if (throttle_mask != 0 &&
 360                                     VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
 361                                         /*
 362                                          * as a reasonable approximation, only use the lowest bit of the mask
 363                                          * to generate a disk unit number
 364                                          */
 365                                         devbsdunit = num_trailing_0(throttle_mask);
 366
 367                                         vnode_lock(vp);
 368
 369                                         vp->v_un.vu_specinfo->si_isssd = isssd;
 370                                         vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
 371                                         vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
 372                                         vp->v_un.vu_specinfo->si_throttleable = 1;
 373                                         vp->v_un.vu_specinfo->si_initted = 1;
 374
 375                                         vnode_unlock(vp);
 376                                 }
 377                         }
 378                         if (vp->v_un.vu_specinfo->si_initted == 0) {
 379                                 vnode_lock(vp);
 380                                 vp->v_un.vu_specinfo->si_initted = 1;
 381                                 vnode_unlock(vp);
 382                         }
 383                 }
 384                 return error;
 385
 386         case VBLK:
 387                 if ((u_int)maj >= (u_int)nblkdev) {
 388                         return ENXIO;
 389                 }
 390                 /*
 391                  * When running in very secure mode, do not allow
 392                  * opens for writing of any disk block devices.
 393                  */
 394                 if (securelevel >= 2 && cred != FSCRED &&
 395                     (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) {
 396                         return EPERM;
 397                 }
 398                 /*
 399                  * Do not allow opens of block devices that are
 400                  * currently mounted.
 401                  */
 402                 if ((error = vfs_mountedon(vp))) {
 403                         return error;
 404                 }
 405
 406                 devsw_lock(dev, S_IFBLK);
 407                 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
 408                 if (!error) {
 409                         vp->v_specinfo->si_opencount++;
 410                 }
 411                 devsw_unlock(dev, S_IFBLK);
 412
 413                 if (!error) {
 414                         u_int64_t blkcnt;
 415                         u_int32_t blksize;
 416                         int setsize = 0;
 417                         u_int32_t size512 = 512;
 418
 419
 420                         if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
 421                                 /* Switch to 512 byte sectors (temporarily) */
 422
 423                                 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
 424                                         /* Get the number of 512 byte physical blocks. */
 425                                         if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
 426                                                 setsize = 1;
 427                                         }
 428                                 }
 429                                 /* If it doesn't set back, we can't recover */
 430                                 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) {
 431                                         error = ENXIO;
 432                                 }
 433                         }
 434
 435
 436                         vnode_lock(vp);
 437                         set_blocksize(vp, dev);
 438
 439                         /*
 440                          * Cache the size in bytes of the block device for later
 441                          * use by spec_write().
 442                          */
 443                         if (setsize) {
 444                                 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
 445                         } else {
 446                                 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
 447                         }
 448                         vnode_unlock(vp);
 449                 }
 450                 return error;
 451         default:
 452                 panic("spec_open type");
 453         }
 454         return 0;
 455 }
 456
 457 /*
 458  * Vnode op for read
 459  */
 460 int
 461 spec_read(struct vnop_read_args *ap)
 462 {
 463         struct vnode *vp = ap->a_vp;
 464         struct uio *uio = ap->a_uio;
 465         struct buf *bp;
 466         daddr64_t bn, nextbn;
 467         long bsize, bscale;
 468         int devBlockSize = 0;
 469         int n, on;
 470         int error = 0;
 471         dev_t dev;
 472
 473 #if DIAGNOSTIC
 474         if (uio->uio_rw != UIO_READ) {
 475                 panic("spec_read mode");
 476         }
 477         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
 478                 panic("spec_read proc");
 479         }
 480 #endif
 481         if (uio_resid(uio) == 0) {
 482                 return 0;
 483         }
 484
 485         switch (vp->v_type) {
 486         case VCHR:
 487         {
 488                 struct _throttle_io_info_t *throttle_info = NULL;
 489                 int thread_throttle_level;
 490                 uint64_t blkno = 0;
 491                 uint32_t iolen = 0;
 492                 int ddisk = 0;
 493                 int ktrace_code = DKIO_READ;
 494                 devBlockSize = vp->v_specsize;
 495                 uintptr_t our_id;
 496
 497                 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) {
 498                         ddisk = 1;
 499                 }
 500
 501                 if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
 502                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 503                         thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 504                 }
 505
 506                 if (kdebug_enable && ddisk) {
 507                         if (devBlockSize == 0) {
 508                                 devBlockSize = 512;  // default sector size
 509                         }
 510
 511                         if (uio_offset(uio) && devBlockSize) {
 512                                 blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize));
 513                         }
 514                         iolen = (int) uio_resid(uio);
 515                         our_id = (uintptr_t)thread_tid(current_thread());
 516                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
 517                             (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
 518                             vp->v_rdev, blkno, iolen, 0);
 519                 }
 520
 521                 error = (*cdevsw[major(vp->v_rdev)].d_read)
 522                     (vp->v_rdev, uio, ap->a_ioflag);
 523
 524
 525                 if (kdebug_enable && ddisk) {
 526                         uint32_t residual = (uint32_t)uio_resid(uio);
 527                         ktrace_code |= DKIO_DONE;
 528                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
 529                             (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
 530                             (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
 531                 }
 532
 533                 if (throttle_info) {
 534                         throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 535                 }
 536
 537                 return error;
 538         }
 539
 540         case VBLK:
 541                 if (uio->uio_offset < 0) {
 542                         return EINVAL;
 543                 }
 544
 545                 dev = vp->v_rdev;
 546
 547                 devBlockSize = vp->v_specsize;
 548
 549                 if (devBlockSize > PAGE_SIZE) {
 550                         return EINVAL;
 551                 }
 552
 553                 bscale = PAGE_SIZE / devBlockSize;
 554                 bsize = bscale * devBlockSize;
 555
 556                 do {
 557                         on = uio->uio_offset % bsize;
 558
 559                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));
 560
 561                         if (vp->v_speclastr + bscale == bn) {
 562                                 nextbn = bn + bscale;
 563                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
 564                                     (int *)&bsize, 1, NOCRED, &bp);
 565                         } else {
 566                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
 567                         }
 568
 569                         vnode_lock(vp);
 570                         vp->v_speclastr = bn;
 571                         vnode_unlock(vp);
 572
 573                         n = bsize - buf_resid(bp);
 574                         if ((on > n) || error) {
 575                                 if (!error) {
 576                                         error = EINVAL;
 577                                 }
 578                                 buf_brelse(bp);
 579                                 return error;
 580                         }
 581                         n = min((unsigned)(n  - on), uio_resid(uio));
 582
 583                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 584                         if (n + on == bsize) {
 585                                 buf_markaged(bp);
 586                         }
 587                         buf_brelse(bp);
 588                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 589                 return error;
 590
 591         default:
 592                 panic("spec_read type");
 593         }
 594         /* NOTREACHED */
 595
 596         return 0;
 597 }
 598
 599 /*
 600  * Vnode op for write
 601  */
 602 int
 603 spec_write(struct vnop_write_args *ap)
 604 {
 605         struct vnode *vp = ap->a_vp;
 606         struct uio *uio = ap->a_uio;
 607         struct buf *bp;
 608         daddr64_t bn;
 609         int bsize, blkmask, bscale;
 610         int io_sync;
 611         int devBlockSize = 0;
 612         int n, on;
 613         int error = 0;
 614         dev_t dev;
 615
 616 #if DIAGNOSTIC
 617         if (uio->uio_rw != UIO_WRITE) {
 618                 panic("spec_write mode");
 619         }
 620         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
 621                 panic("spec_write proc");
 622         }
 623 #endif
 624
 625         switch (vp->v_type) {
 626         case VCHR:
 627         {
 628                 struct _throttle_io_info_t *throttle_info = NULL;
 629                 int thread_throttle_level;
 630                 dev = vp->v_rdev;
 631                 devBlockSize = vp->v_specsize;
 632                 uint32_t iolen = 0;
 633                 uint64_t blkno = 0;
 634                 int ddisk = 0;
 635                 int ktrace_code = 0;  // write is implied; read must be OR'd in.
 636                 uintptr_t our_id;
 637
 638                 if (cdevsw[major(dev)].d_type == D_DISK) {
 639                         ddisk = 1;
 640                 }
 641
 642                 if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
 643                         throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
 644
 645                         thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
 646
 647                         microuptime(&throttle_info->throttle_last_write_timestamp);
 648                 }
 649
 650                 if (kdebug_enable && ddisk) {
 651                         if (devBlockSize == 0) {
 652                                 devBlockSize = 512; // default sector size
 653                         }
 654                         if ((uio_offset(uio) != 0) && devBlockSize) {
 655                                 blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize);
 656                         }
 657                         iolen = (int)uio_resid(uio);
 658                         our_id = (uintptr_t)thread_tid(current_thread());
 659                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
 660                             (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
 661                             vp->v_rdev, blkno, iolen, 0);
 662                 }
 663                 error = (*cdevsw[major(vp->v_rdev)].d_write)
 664                     (vp->v_rdev, uio, ap->a_ioflag);
 665
 666                 if (kdebug_enable && ddisk) {
 667                         //emit the I/O completion
 668                         uint32_t residual = (uint32_t)uio_resid(uio);
 669                         ktrace_code |= DKIO_DONE;
 670                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
 671                             (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
 672                             (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
 673                 }
 674
 675                 if (throttle_info) {
 676                         throttle_info_end_io_internal(throttle_info, thread_throttle_level);
 677                 }
 678
 679                 return error;
 680         }
 681
 682         case VBLK:
 683                 if (uio_resid(uio) == 0) {
 684                         return 0;
 685                 }
 686                 if (uio->uio_offset < 0) {
 687                         return EINVAL;
 688                 }
 689
 690                 io_sync = (ap->a_ioflag & IO_SYNC);
 691
 692                 dev = (vp->v_rdev);
 693
 694                 devBlockSize = vp->v_specsize;
 695                 if (devBlockSize > PAGE_SIZE) {
 696                         return EINVAL;
 697                 }
 698
 699                 bscale = PAGE_SIZE / devBlockSize;
 700                 blkmask = bscale - 1;
 701                 bsize = bscale * devBlockSize;
 702
 703
 704                 do {
 705                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
 706                         on = uio->uio_offset % bsize;
 707
 708                         n = min((unsigned)(bsize - on), uio_resid(uio));
 709
 710                         /*
 711                          * Use buf_getblk() as an optimization IFF:
 712                          *
 713                          * 1)   We are reading exactly a block on a block
 714                          *      aligned boundary
 715                          * 2)   We know the size of the device from spec_open
 716                          * 3)   The read doesn't span the end of the device
 717                          *
 718                          * Otherwise, we fall back on buf_bread().
 719                          */
 720                         if (n == bsize &&
 721                             vp->v_specdevsize != (u_int64_t)0 &&
 722                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
 723                                 /* reduce the size of the read to what is there */
 724                                 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
 725                         }
 726
 727                         if (n == bsize) {
 728                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
 729                         } else {
 730                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
 731                         }
 732
 733                         /* Translate downstream error for upstream, if needed */
 734                         if (!error) {
 735                                 error = (int)buf_error(bp);
 736                         }
 737                         if (error) {
 738                                 buf_brelse(bp);
 739                                 return error;
 740                         }
 741                         n = min(n, bsize - buf_resid(bp));
 742
 743                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
 744                         if (error) {
 745                                 buf_brelse(bp);
 746                                 return error;
 747                         }
 748                         buf_markaged(bp);
 749
 750                         if (io_sync) {
 751                                 error = buf_bwrite(bp);
 752                         } else {
 753                                 if ((n + on) == bsize) {
 754                                         error = buf_bawrite(bp);
 755                                 } else {
 756                                         error = buf_bdwrite(bp);
 757                                 }
 758                         }
 759                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 760                 return error;
 761
 762         default:
 763                 panic("spec_write type");
 764         }
 765         /* NOTREACHED */
 766
 767         return 0;
 768 }
 769
 770 /*
 771  * Device ioctl operation.
 772  */
 773 int
 774 spec_ioctl(struct vnop_ioctl_args *ap)
 775 {
 776         proc_t p = vfs_context_proc(ap->a_context);
 777         dev_t dev = ap->a_vp->v_rdev;
 778         int     retval = 0;
 779
 780         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
 781             dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
 782
 783         switch (ap->a_vp->v_type) {
 784         case VCHR:
 785                 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 786                     ap->a_fflag, p);
 787                 break;
 788
 789         case VBLK:
 790                 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
 791                 if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) {
 792                         ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
 793                 }
 794                 break;
 795
 796         default:
 797                 panic("spec_ioctl");
 798                 /* NOTREACHED */
 799         }
 800         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
 801             dev, ap->a_command, ap->a_fflag, retval, 0);
 802
 803         return retval;
 804 }
 805
 806 int
 807 spec_select(struct vnop_select_args *ap)
 808 {
 809         proc_t p = vfs_context_proc(ap->a_context);
 810         dev_t dev;
 811
 812         switch (ap->a_vp->v_type) {
 813         default:
 814                 return 1;             /* XXX */
 815
 816         case VCHR:
 817                 dev = ap->a_vp->v_rdev;
 818                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
 819         }
 820 }
 821
 822 static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev);
 823
 824 int
 825 spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev)
 826 {
 827         dev_t dev;
 828
 829         assert(vnode_ischr(vp));
 830
 831         dev = vnode_specrdev(vp);
 832
 833 #if NETWORKING
 834         /*
 835          * Try a bpf device, as defined in bsd/net/bpf.c
 836          * If it doesn't error out the attach, then it
 837          * claimed it. Otherwise, fall through and try
 838          * other attaches.
 839          */
 840         int32_t tmp_flags = kn->kn_flags;
 841         int64_t tmp_sdata = kn->kn_sdata;
 842         int res;
 843
 844         res = bpfkqfilter(dev, kn);
 845         if ((kn->kn_flags & EV_ERROR) == 0) {
 846                 return res;
 847         }
 848         kn->kn_flags = tmp_flags;
 849         kn->kn_sdata = tmp_sdata;
 850 #endif
 851
 852         if (major(dev) > nchrdev) {
 853                 knote_set_error(kn, ENXIO);
 854                 return 0;
 855         }
 856
 857         kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
 858         kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);
 859
 860         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
 861                 kn->kn_filtid = EVFILTID_PTSD;
 862                 return ptsd_kqfilter(dev, kn);
 863         } else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
 864                 kn->kn_filtid = EVFILTID_PTMX;
 865                 return ptmx_kqfilter(dev, kn);
 866         } else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
 867                 /*
 868                  * TTYs from drivers that use struct ttys use their own filter
 869                  * routines.  The PTC driver doesn't use the tty for character
 870                  * counts, so it must go through the select fallback.
 871                  */
 872                 kn->kn_filtid = EVFILTID_TTY;
 873                 return knote_fops(kn)->f_attach(kn, kev);
 874         }
 875
 876         /* Try to attach to other char special devices */
 877         return filt_specattach(kn, kev);
 878 }
 879
 880 /*
 881  * Synch buffers associated with a block device
 882  */
 883 int
 884 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
 885 {
 886         if (vp->v_type == VCHR) {
 887                 return 0;
 888         }
 889         /*
 890          * Flush all dirty buffers associated with a block device.
 891          */
 892         buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
 893
 894         return 0;
 895 }
 896
 897 int
 898 spec_fsync(struct vnop_fsync_args *ap)
 899 {
 900         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
 901 }
 902
 903
 904 /*
 905  * Just call the device strategy routine
 906  */
 907 void throttle_init(void);
 908
 909
 910 #if 0
 911 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
 912         do {                                                    \
 913                if ((debug_info)->alloc)                           \
 914                printf("%s: "format, __FUNCTION__, ## args);     \
 915        } while(0)
 916
 917 #else
 918 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
 919 #endif
 920
 921
 922 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 923 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 924 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 925
 926 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 927 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 928 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 929
 930 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
 931 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
 932 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");
 933
 934 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
 935
 936
 937 static lck_grp_t        *throttle_lock_grp;
 938 static lck_attr_t       *throttle_lock_attr;
 939 static lck_grp_attr_t   *throttle_lock_grp_attr;
 940
 941
 942 /*
 943  * throttled I/O helper function
 944  * convert the index of the lowest set bit to a device index
 945  */
 946 int
 947 num_trailing_0(uint64_t n)
 948 {
 949         /*
 950          * since in most cases the number of trailing 0s is very small,
 951          * we simply counting sequentially from the lowest bit
 952          */
 953         if (n == 0) {
 954                 return sizeof(n) * 8;
 955         }
 956         int count = 0;
 957         while (!ISSET(n, 1)) {
 958                 n >>= 1;
 959                 ++count;
 960         }
 961         return count;
 962 }
 963
 964
 965 /*
 966  * Release the reference and if the item was allocated and this is the last
 967  * reference then free it.
 968  *
 969  * This routine always returns the old value.
 970  */
 971 static int
 972 throttle_info_rel(struct _throttle_io_info_t *info)
 973 {
 974         SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
 975
 976         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
 977             info, (int)(oldValue - 1), info );
 978
 979         /* The reference count just went negative, very bad */
 980         if (oldValue == 0) {
 981                 panic("throttle info ref cnt went negative!");
 982         }
 983
 984         /*
 985          * Once reference count is zero, no one else should be able to take a
 986          * reference
 987          */
 988         if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
 989                 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
 990
 991                 lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
 992                 FREE(info, M_TEMP);
 993         }
 994         return oldValue;
 995 }
 996
 997
 998 /*
 999  * Just take a reference on the throttle info structure.
1000  *
1001  * This routine always returns the old value.
1002  */
1003 static SInt32
1004 throttle_info_ref(struct _throttle_io_info_t *info)
1005 {
1006         SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
1007
1008         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
1009             info, (int)(oldValue - 1), info );
1010         /* Allocated items should never have a reference of zero */
1011         if (info->throttle_alloc && (oldValue == 0)) {
1012                 panic("Taking a reference without calling create throttle info!\n");
1013         }
1014
1015         return oldValue;
1016 }
1017
1018 /*
1019  * on entry the throttle_lock is held...
1020  * this function is responsible for taking
1021  * and dropping the reference on the info
1022  * structure which will keep it from going
1023  * away while the timer is running if it
1024  * happens to have been dynamically allocated by
1025  * a network fileystem kext which is now trying
1026  * to free it
1027  */
1028 static uint32_t
1029 throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
1030 {
1031         struct timeval  elapsed;
1032         struct timeval  now;
1033         struct timeval  period;
1034         uint64_t        elapsed_msecs;
1035         int             throttle_level;
1036         int             level;
1037         int             msecs;
1038         boolean_t       throttled = FALSE;
1039         boolean_t       need_timer = FALSE;
1040
1041         microuptime(&now);
1042
1043         if (update_io_count == TRUE) {
1044                 info->throttle_io_count_begin = info->throttle_io_count;
1045                 info->throttle_io_period_num++;
1046
1047                 while (wakelevel >= THROTTLE_LEVEL_THROTTLED) {
1048                         info->throttle_start_IO_period_timestamp[wakelevel--] = now;
1049                 }
1050
1051                 info->throttle_min_timer_deadline = now;
1052
1053                 msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
1054                 period.tv_sec = msecs / 1000;
1055                 period.tv_usec = (msecs % 1000) * 1000;
1056
1057                 timevaladd(&info->throttle_min_timer_deadline, &period);
1058         }
1059         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
1060                 elapsed = now;
1061                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1062                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1063
1064                 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1065                         if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1066                                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
1067                                         /*
1068                                          * we had an I/O occur at a higher priority tier within
1069                                          * this tier's throttle window
1070                                          */
1071                                         throttled = TRUE;
1072                                 }
1073                                 /*
1074                                  * we assume that the windows are the same or longer
1075                                  * as we drop through the throttling tiers...  thus
1076                                  * we can stop looking once we run into a tier with
1077                                  * threads to schedule regardless of whether it's
1078                                  * still in its throttling window or not
1079                                  */
1080                                 break;
1081                         }
1082                 }
1083                 if (throttled == TRUE) {
1084                         break;
1085                 }
1086         }
1087         if (throttled == TRUE) {
1088                 uint64_t        deadline = 0;
1089                 struct timeval  target;
1090                 struct timeval  min_target;
1091
1092                 /*
1093                  * we've got at least one tier still in a throttled window
1094                  * so we need a timer running... compute the next deadline
1095                  * and schedule it
1096                  */
1097                 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1098                         if (TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1099                                 continue;
1100                         }
1101
1102                         target = info->throttle_start_IO_period_timestamp[level];
1103
1104                         msecs = info->throttle_io_periods[level];
1105                         period.tv_sec = msecs / 1000;
1106                         period.tv_usec = (msecs % 1000) * 1000;
1107
1108                         timevaladd(&target, &period);
1109
1110                         if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
1111                                 min_target = target;
1112                                 need_timer = TRUE;
1113                         }
1114                 }
1115                 if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
1116                         if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) {
1117                                 min_target = info->throttle_min_timer_deadline;
1118                         }
1119                 }
1120
1121                 if (info->throttle_timer_active) {
1122                         if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1123                                 /*
1124                                  * couldn't kill the timer because it's already
1125                                  * been dispatched, so don't try to start a new
1126                                  * one... once we drop the lock, the timer will
1127                                  * proceed and eventually re-run this function
1128                                  */
1129                                 need_timer = FALSE;
1130                         } else {
1131                                 info->throttle_timer_active = 0;
1132                         }
1133                 }
1134                 if (need_timer == TRUE) {
1135                         /*
1136                          * This is defined as an int (32-bit) rather than a 64-bit
1137                          * value because it would need a really big period in the
1138                          * order of ~500 days to overflow this. So, we let this be
1139                          * 32-bit which allows us to use the clock_interval_to_deadline()
1140                          * routine.
1141                          */
1142                         int     target_msecs;
1143
1144                         if (info->throttle_timer_ref == 0) {
1145                                 /*
1146                                  * take a reference for the timer
1147                                  */
1148                                 throttle_info_ref(info);
1149
1150                                 info->throttle_timer_ref = 1;
1151                         }
1152                         elapsed = min_target;
1153                         timevalsub(&elapsed, &now);
1154                         target_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
1155
1156                         if (target_msecs <= 0) {
1157                                 /*
1158                                  * we may have computed a deadline slightly in the past
1159                                  * due to various factors... if so, just set the timer
1160                                  * to go off in the near future (we don't need to be precise)
1161                                  */
1162                                 target_msecs = 1;
1163                         }
1164                         clock_interval_to_deadline(target_msecs, 1000000, &deadline);
1165
1166                         thread_call_enter_delayed(info->throttle_timer_call, deadline);
1167                         info->throttle_timer_active = 1;
1168                 }
1169         }
1170         return throttle_level;
1171 }
1172
1173
1174 static void
1175 throttle_timer(struct _throttle_io_info_t *info)
1176 {
1177         uthread_t       ut, utlist;
1178         struct timeval  elapsed;
1179         struct timeval  now;
1180         uint64_t        elapsed_msecs;
1181         int             throttle_level;
1182         int             level;
1183         int             wake_level;
1184         caddr_t         wake_address = NULL;
1185         boolean_t       update_io_count = FALSE;
1186         boolean_t       need_wakeup = FALSE;
1187         boolean_t       need_release = FALSE;
1188
1189         ut = NULL;
1190         lck_mtx_lock(&info->throttle_lock);
1191
1192         info->throttle_timer_active = 0;
1193         microuptime(&now);
1194
1195         elapsed = now;
1196         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
1197         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1198
1199         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
1200                 wake_level = info->throttle_next_wake_level;
1201
1202                 for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
1203                         elapsed = now;
1204                         timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
1205                         elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1206
1207                         if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1208                                 /*
1209                                  * we're closing out the current IO period...
1210                                  * if we have a waiting thread, wake it up
1211                                  * after we have reset the I/O window info
1212                                  */
1213                                 need_wakeup = TRUE;
1214                                 update_io_count = TRUE;
1215
1216                                 info->throttle_next_wake_level = wake_level - 1;
1217
1218                                 if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) {
1219                                         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1220                                 }
1221
1222                                 break;
1223                         }
1224                         wake_level--;
1225
1226                         if (wake_level == THROTTLE_LEVEL_START) {
1227                                 wake_level = THROTTLE_LEVEL_END;
1228                         }
1229                 }
1230         }
1231         if (need_wakeup == TRUE) {
1232                 if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1233                         ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
1234                         TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
1235                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1236                         ut->uu_is_throttled = false;
1237
1238                         wake_address = (caddr_t)&ut->uu_on_throttlelist;
1239                 }
1240         } else {
1241                 wake_level = THROTTLE_LEVEL_START;
1242         }
1243
1244         throttle_level = throttle_timer_start(info, update_io_count, wake_level);
1245
1246         if (wake_address != NULL) {
1247                 wakeup(wake_address);
1248         }
1249
1250         for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
1251                 TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
1252                         TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
1253                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1254                         ut->uu_is_throttled = false;
1255
1256                         wakeup(&ut->uu_on_throttlelist);
1257                 }
1258         }
1259         if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
1260                 info->throttle_timer_ref = 0;
1261                 need_release = TRUE;
1262         }
1263         lck_mtx_unlock(&info->throttle_lock);
1264
1265         if (need_release == TRUE) {
1266                 throttle_info_rel(info);
1267         }
1268 }
1269
1270
1271 static int
1272 throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
1273 {
1274         boolean_t start_timer = FALSE;
1275         int level = THROTTLE_LEVEL_START;
1276
1277         if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1278                 info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1279                 start_timer = TRUE;
1280         }
1281
1282         if (insert_tail == TRUE) {
1283                 TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1284         } else {
1285                 TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1286         }
1287
1288         ut->uu_on_throttlelist = mylevel;
1289
1290         if (start_timer == TRUE) {
1291                 /* we may need to start or rearm the timer */
1292                 level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1293
1294                 if (level == THROTTLE_LEVEL_END) {
1295                         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1296                                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1297
1298                                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1299                         }
1300                 }
1301         }
1302         return level;
1303 }
1304
1305 static void
1306 throttle_init_throttle_window(void)
1307 {
1308         int throttle_window_size;
1309
1310         /*
1311          * The hierarchy of throttle window values is as follows:
1312          * - Global defaults
1313          * - Device tree properties
1314          * - Boot-args
1315          * All values are specified in msecs.
1316          */
1317
1318         /* Override global values with device-tree properties */
1319         if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1320                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1321         }
1322
1323         if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1324                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1325         }
1326
1327         if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1328                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1329         }
1330
1331         /* Override with boot-args */
1332         if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1333                 throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1334         }
1335
1336         if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1337                 throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1338         }
1339
1340         if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1341                 throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1342         }
1343 }
1344
1345 static void
1346 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1347 {
1348         int throttle_period_size;
1349
1350         /*
1351          * The hierarchy of throttle period values is as follows:
1352          * - Global defaults
1353          * - Device tree properties
1354          * - Boot-args
1355          * All values are specified in msecs.
1356          */
1357
1358         /* Assign global defaults */
1359         if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0)) {
1360                 info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
1361         } else {
1362                 info->throttle_io_periods = &throttle_io_period_msecs[0];
1363         }
1364
1365         /* Override global values with device-tree properties */
1366         if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1367                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1368         }
1369
1370         if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1371                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1372         }
1373
1374         if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1375                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1376         }
1377
1378         /* Override with boot-args */
1379         if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1380                 info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1381         }
1382
1383         if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1384                 info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1385         }
1386
1387         if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1388                 info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1389         }
1390 }
1391
1392 #if CONFIG_IOSCHED
1393 extern  void vm_io_reprioritize_init(void);
1394 int     iosched_enabled = 1;
1395 #endif
1396
1397 void
1398 throttle_init(void)
1399 {
1400         struct _throttle_io_info_t *info;
1401         int     i;
1402         int     level;
1403 #if CONFIG_IOSCHED
1404         int     iosched;
1405 #endif
1406         /*
1407          * allocate lock group attribute and group
1408          */
1409         throttle_lock_grp_attr = lck_grp_attr_alloc_init();
1410         throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
1411
1412         /* Update throttle parameters based on device tree configuration */
1413         throttle_init_throttle_window();
1414
1415         /*
1416          * allocate the lock attribute
1417          */
1418         throttle_lock_attr = lck_attr_alloc_init();
1419
1420         for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1421                 info = &_throttle_io_info[i];
1422
1423                 lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1424                 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1425
1426                 for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1427                         TAILQ_INIT(&info->throttle_uthlist[level]);
1428                         info->throttle_last_IO_pid[level] = 0;
1429                         info->throttle_inflight_count[level] = 0;
1430                 }
1431                 info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1432                 info->throttle_disabled = 0;
1433                 info->throttle_is_fusion_with_priority = 0;
1434         }
1435 #if CONFIG_IOSCHED
1436         if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1437                 iosched_enabled = iosched;
1438         }
1439         if (iosched_enabled) {
1440                 /* Initialize I/O Reprioritization mechanism */
1441                 vm_io_reprioritize_init();
1442         }
1443 #endif
1444 }
1445
1446 void
1447 sys_override_io_throttle(boolean_t enable_override)
1448 {
1449         if (enable_override) {
1450                 lowpri_throttle_enabled = 0;
1451         } else {
1452                 lowpri_throttle_enabled = 1;
1453         }
1454 }
1455
1456 int rethrottle_wakeups = 0;
1457
1458 /*
1459  * the uu_rethrottle_lock is used to synchronize this function
1460  * with "throttle_lowpri_io" which is where a throttled thread
1461  * will block... that function will grab this lock before beginning
1462  * it's decision making process concerning the need to block, and
1463  * hold it through the assert_wait.  When that thread is awakened
1464  * for any reason (timer or rethrottle), it will reacquire the
1465  * uu_rethrottle_lock before determining if it really is ok for
1466  * it to now run.  This is the point at which the thread could
1467  * enter a different throttling queue and reblock or return from
1468  * the throttle w/o having waited out it's entire throttle if
1469  * the rethrottle has now moved it out of any currently
1470  * active throttle window.
1471  *
1472  *
1473  * NOTES:
1474  * 1 - This may be called with the task lock held.
1475  * 2 - This may be called with preemption and interrupts disabled
1476  *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1477  * 3 - This cannot safely dereference uu_throttle_info, as it may
1478  *     get deallocated out from under us
1479  */
1480
1481 void
1482 rethrottle_thread(uthread_t ut)
1483 {
1484         /*
1485          * If uthread doesn't have throttle state, then there's no chance
1486          * of it needing a rethrottle.
1487          */
1488         if (ut->uu_throttle_info == NULL) {
1489                 return;
1490         }
1491
1492         boolean_t s = ml_set_interrupts_enabled(FALSE);
1493         lck_spin_lock(&ut->uu_rethrottle_lock);
1494
1495         if (!ut->uu_is_throttled) {
1496                 ut->uu_was_rethrottled = true;
1497         } else {
1498                 int my_new_level = throttle_get_thread_throttle_level(ut);
1499
1500                 if (my_new_level != ut->uu_on_throttlelist) {
1501                         /*
1502                          * ut is currently blocked (as indicated by
1503                          * ut->uu_is_throttled == true)
1504                          * and we're changing it's throttle level, so
1505                          * we need to wake it up.
1506                          */
1507                         ut->uu_is_throttled = false;
1508                         wakeup(&ut->uu_on_throttlelist);
1509
1510                         rethrottle_wakeups++;
1511                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, 0, 0);
1512                 }
1513         }
1514         lck_spin_unlock(&ut->uu_rethrottle_lock);
1515         ml_set_interrupts_enabled(s);
1516 }
1517
1518
1519 /*
1520  * KPI routine
1521  *
1522  * Create and take a reference on a throttle info structure and return a
1523  * pointer for the file system to use when calling throttle_info_update.
1524  * Calling file system must have a matching release for every create.
1525  */
1526 void *
1527 throttle_info_create(void)
1528 {
1529         struct _throttle_io_info_t *info;
1530         int     level;
1531
1532         MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
1533         /* Should never happen but just in case */
1534         if (info == NULL) {
1535                 return NULL;
1536         }
1537         /* Mark that this one was allocated and needs to be freed */
1538         DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1539         info->throttle_alloc = TRUE;
1540
1541         lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1542         info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1543
1544         for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1545                 TAILQ_INIT(&info->throttle_uthlist[level]);
1546         }
1547         info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1548
1549         /* Take a reference */
1550         OSIncrementAtomic(&info->throttle_refcnt);
1551         return info;
1552 }
1553
1554 /*
1555  * KPI routine
1556  *
1557  * Release the throttle info pointer if all the reference are gone. Should be
1558  * called to release reference taken by throttle_info_create
1559  */
1560 void
1561 throttle_info_release(void *throttle_info)
1562 {
1563         DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1564             (struct _throttle_io_info_t *)throttle_info,
1565             (struct _throttle_io_info_t *)throttle_info);
1566         if (throttle_info) { /* Just to be careful */
1567                 throttle_info_rel(throttle_info);
1568         }
1569 }
1570
1571 /*
1572  * KPI routine
1573  *
1574  * File Systems that create an info structure, need to call this routine in
1575  * their mount routine (used by cluster code). File Systems that call this in
1576  * their mount routines must call throttle_info_mount_rel in their unmount
1577  * routines.
1578  */
1579 void
1580 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1581 {
1582         if ((throttle_info == NULL) || (mp == NULL)) {
1583                 return;
1584         }
1585         throttle_info_ref(throttle_info);
1586
1587         /*
1588          * We already have a reference release it before adding the new one
1589          */
1590         if (mp->mnt_throttle_info) {
1591                 throttle_info_rel(mp->mnt_throttle_info);
1592         }
1593         mp->mnt_throttle_info = throttle_info;
1594 }
1595
1596 /*
1597  * Private KPI routine
1598  *
1599  * return a handle for accessing throttle_info given a throttle_mask.  The
1600  * handle must be released by throttle_info_rel_by_mask
1601  */
1602 int
1603 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1604 {
1605         int     dev_index;
1606         struct _throttle_io_info_t *info;
1607
1608         if (throttle_info_handle == NULL) {
1609                 return EINVAL;
1610         }
1611
1612         dev_index = num_trailing_0(throttle_mask);
1613         info = &_throttle_io_info[dev_index];
1614         throttle_info_ref(info);
1615         *(struct _throttle_io_info_t**)throttle_info_handle = info;
1616
1617         return 0;
1618 }
1619
1620 /*
1621  * Private KPI routine
1622  *
1623  * release the handle obtained by throttle_info_ref_by_mask
1624  */
1625 void
1626 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1627 {
1628         /*
1629          * for now the handle is just a pointer to _throttle_io_info_t
1630          */
1631         throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1632 }
1633
1634 /*
1635  * KPI routine
1636  *
1637  * File Systems that throttle_info_mount_ref, must call this routine in their
1638  * umount routine.
1639  */
1640 void
1641 throttle_info_mount_rel(mount_t mp)
1642 {
1643         if (mp->mnt_throttle_info) {
1644                 throttle_info_rel(mp->mnt_throttle_info);
1645         }
1646         mp->mnt_throttle_info = NULL;
1647 }
1648
1649 /*
1650  * Reset throttling periods for the given mount point
1651  *
1652  * private interface used by disk conditioner to reset
1653  * throttling periods when 'is_ssd' status changes
1654  */
1655 void
1656 throttle_info_mount_reset_period(mount_t mp, int isssd)
1657 {
1658         struct _throttle_io_info_t *info;
1659
1660         if (mp == NULL) {
1661                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1662         } else if (mp->mnt_throttle_info == NULL) {
1663                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1664         } else {
1665                 info = mp->mnt_throttle_info;
1666         }
1667
1668         throttle_init_throttle_period(info, isssd);
1669 }
1670
1671 void
1672 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1673 {
1674         struct _throttle_io_info_t *info;
1675
1676         if (mp == NULL) {
1677                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1678         } else if (mp->mnt_throttle_info == NULL) {
1679                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1680         } else {
1681                 info = mp->mnt_throttle_info;
1682         }
1683
1684         *tv = info->throttle_last_write_timestamp;
1685 }
1686
1687 void
1688 update_last_io_time(mount_t mp)
1689 {
1690         struct _throttle_io_info_t *info;
1691
1692         if (mp == NULL) {
1693                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1694         } else if (mp->mnt_throttle_info == NULL) {
1695                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1696         } else {
1697                 info = mp->mnt_throttle_info;
1698         }
1699
1700         microuptime(&info->throttle_last_write_timestamp);
1701         if (mp != NULL) {
1702                 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1703         }
1704 }
1705
1706 int
1707 throttle_get_io_policy(uthread_t *ut)
1708 {
1709         if (ut != NULL) {
1710                 *ut = get_bsdthread_info(current_thread());
1711         }
1712
1713         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1714 }
1715
1716 int
1717 throttle_get_passive_io_policy(uthread_t *ut)
1718 {
1719         if (ut != NULL) {
1720                 *ut = get_bsdthread_info(current_thread());
1721         }
1722
1723         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO);
1724 }
1725
1726
1727 static int
1728 throttle_get_thread_throttle_level(uthread_t ut)
1729 {
1730         uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1731         int io_tier = throttle_get_io_policy(ut_p);
1732
1733         return throttle_get_thread_throttle_level_internal(ut, io_tier);
1734 }
1735
1736 /*
1737  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1738  */
1739 static int
1740 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier)
1741 {
1742         int thread_throttle_level = io_tier;
1743         int user_idle_level;
1744
1745         assert(ut != NULL);
1746
1747         /* Bootcache misses should always be throttled */
1748         if (ut->uu_throttle_bc) {
1749                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
1750         }
1751
1752         /*
1753          * Issue tier3 I/O as tier2 when the user is idle
1754          * to allow maintenance tasks to make more progress.
1755          *
1756          * Assume any positive idle level is enough... for now it's
1757          * only ever 0 or 128 but this is not defined anywhere.
1758          */
1759         if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1760                 user_idle_level = timer_get_user_idle_level();
1761                 if (user_idle_level > 0) {
1762                         thread_throttle_level--;
1763                 }
1764         }
1765
1766         return thread_throttle_level;
1767 }
1768
1769 /*
1770  * I/O will be throttled if either of the following are true:
1771  *   - Higher tiers have in-flight I/O
1772  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1773  *
1774  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1775  */
1776 static int
1777 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1778 {
1779         struct _throttle_io_info_t *info = throttle_info;
1780         struct timeval elapsed;
1781         struct timeval now;
1782         uint64_t elapsed_msecs;
1783         int     thread_throttle_level;
1784         int     throttle_level;
1785
1786         if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) {
1787                 return THROTTLE_DISENGAGED;
1788         }
1789
1790         microuptime(&now);
1791
1792         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1793                 if (info->throttle_inflight_count[throttle_level]) {
1794                         break;
1795                 }
1796                 elapsed = now;
1797                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1798                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1799
1800                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
1801                         break;
1802                 }
1803         }
1804         if (throttle_level >= thread_throttle_level) {
1805                 /*
1806                  * we're beyond all of the throttle windows
1807                  * that affect the throttle level of this thread,
1808                  * so go ahead and treat as normal I/O
1809                  */
1810                 return THROTTLE_DISENGAGED;
1811         }
1812         if (mylevel) {
1813                 *mylevel = thread_throttle_level;
1814         }
1815         if (throttling_level) {
1816                 *throttling_level = throttle_level;
1817         }
1818
1819         if (info->throttle_io_count != info->throttle_io_count_begin) {
1820                 /*
1821                  * we've already issued at least one throttleable I/O
1822                  * in the current I/O window, so avoid issuing another one
1823                  */
1824                 return THROTTLE_NOW;
1825         }
1826         /*
1827          * we're in the throttle window, so
1828          * cut the I/O size back
1829          */
1830         return THROTTLE_ENGAGED;
1831 }
1832
1833 /*
1834  * If we have a mount point and it has a throttle info pointer then
1835  * use it to do the check, otherwise use the device unit number to find
1836  * the correct throttle info array element.
1837  */
1838 int
1839 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1840 {
1841         struct _throttle_io_info_t      *info;
1842
1843         /*
1844          * Should we just return zero if no mount point
1845          */
1846         if (mp == NULL) {
1847                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1848         } else if (mp->mnt_throttle_info == NULL) {
1849                 info = &_throttle_io_info[mp->mnt_devbsdunit];
1850         } else {
1851                 info = mp->mnt_throttle_info;
1852         }
1853
1854         if (info->throttle_is_fusion_with_priority) {
1855                 uthread_t ut = get_bsdthread_info(current_thread());
1856                 if (ut->uu_lowpri_window == 0) {
1857                         return THROTTLE_DISENGAGED;
1858                 }
1859         }
1860
1861         if (info->throttle_disabled) {
1862                 return THROTTLE_DISENGAGED;
1863         } else {
1864                 return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1865         }
1866 }
1867
1868 /*
1869  * Routine to increment I/O throttling counters maintained in the proc
1870  */
1871
1872 static void
1873 throttle_update_proc_stats(pid_t throttling_pid, int count)
1874 {
1875         proc_t throttling_proc;
1876         proc_t throttled_proc = current_proc();
1877
1878         /* The throttled_proc is always the current proc; so we are not concerned with refs */
1879         OSAddAtomic64(count, &(throttled_proc->was_throttled));
1880
1881         /* The throttling pid might have exited by now */
1882         throttling_proc = proc_find(throttling_pid);
1883         if (throttling_proc != PROC_NULL) {
1884                 OSAddAtomic64(count, &(throttling_proc->did_throttle));
1885                 proc_rele(throttling_proc);
1886         }
1887 }
1888
1889 /*
1890  * Block until woken up by the throttle timer or by a rethrottle call.
1891  * As long as we hold the throttle_lock while querying the throttle tier, we're
1892  * safe against seeing an old throttle tier after a rethrottle.
1893  */
1894 uint32_t
1895 throttle_lowpri_io(int sleep_amount)
1896 {
1897         uthread_t ut;
1898         struct _throttle_io_info_t *info;
1899         int     throttle_type = 0;
1900         int     mylevel = 0;
1901         int     throttling_level = THROTTLE_LEVEL_NONE;
1902         int     sleep_cnt = 0;
1903         uint32_t  throttle_io_period_num = 0;
1904         boolean_t insert_tail = TRUE;
1905         boolean_t s;
1906
1907         ut = get_bsdthread_info(current_thread());
1908
1909         if (ut->uu_lowpri_window == 0) {
1910                 return 0;
1911         }
1912
1913         info = ut->uu_throttle_info;
1914
1915         if (info == NULL) {
1916                 ut->uu_throttle_bc = false;
1917                 ut->uu_lowpri_window = 0;
1918                 return 0;
1919         }
1920         lck_mtx_lock(&info->throttle_lock);
1921         assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
1922
1923         if (sleep_amount == 0) {
1924                 goto done;
1925         }
1926
1927         if (sleep_amount == 1 && !ut->uu_throttle_bc) {
1928                 sleep_amount = 0;
1929         }
1930
1931         throttle_io_period_num = info->throttle_io_period_num;
1932
1933         ut->uu_was_rethrottled = false;
1934
1935         while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {
1936                 if (throttle_type == THROTTLE_ENGAGED) {
1937                         if (sleep_amount == 0) {
1938                                 break;
1939                         }
1940                         if (info->throttle_io_period_num < throttle_io_period_num) {
1941                                 break;
1942                         }
1943                         if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
1944                                 break;
1945                         }
1946                 }
1947                 /*
1948                  * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
1949                  * then puts us back to the original level before we get a chance to run
1950                  */
1951                 if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
1952                         /*
1953                          * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
1954                          * and we've changed our throttling level, so pull ourselves off of the appropriate list
1955                          * and make sure we get put on the tail of the new list since we're starting anew w/r to
1956                          * the throttling engine
1957                          */
1958                         TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1959                         ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1960                         insert_tail = TRUE;
1961                 }
1962                 if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
1963                         if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) {
1964                                 goto done;
1965                         }
1966                 }
1967                 assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
1968
1969                 s = ml_set_interrupts_enabled(FALSE);
1970                 lck_spin_lock(&ut->uu_rethrottle_lock);
1971
1972                 /*
1973                  * this is the critical section w/r to our interaction
1974                  * with "rethrottle_thread"
1975                  */
1976                 if (ut->uu_was_rethrottled) {
1977                         lck_spin_unlock(&ut->uu_rethrottle_lock);
1978                         ml_set_interrupts_enabled(s);
1979                         lck_mtx_yield(&info->throttle_lock);
1980
1981                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0);
1982
1983                         ut->uu_was_rethrottled = false;
1984                         continue;
1985                 }
1986                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
1987                     info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);
1988
1989                 if (sleep_cnt == 0) {
1990                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
1991                             throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
1992                         throttled_count[mylevel]++;
1993                 }
1994                 ut->uu_wmesg = "throttle_lowpri_io";
1995
1996                 assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
1997
1998                 ut->uu_is_throttled = true;
1999                 lck_spin_unlock(&ut->uu_rethrottle_lock);
2000                 ml_set_interrupts_enabled(s);
2001
2002                 lck_mtx_unlock(&info->throttle_lock);
2003
2004                 thread_block(THREAD_CONTINUE_NULL);
2005
2006                 ut->uu_wmesg = NULL;
2007
2008                 ut->uu_is_throttled = false;
2009                 ut->uu_was_rethrottled = false;
2010
2011                 lck_mtx_lock(&info->throttle_lock);
2012
2013                 sleep_cnt++;
2014
2015                 if (sleep_amount == 0) {
2016                         insert_tail = FALSE;
2017                 } else if (info->throttle_io_period_num < throttle_io_period_num ||
2018                     (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
2019                         insert_tail = FALSE;
2020                         sleep_amount = 0;
2021                 }
2022         }
2023 done:
2024         if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
2025                 TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
2026                 ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
2027         }
2028         lck_mtx_unlock(&info->throttle_lock);
2029
2030         if (sleep_cnt) {
2031                 KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
2032                     throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
2033                 /*
2034                  * We update the stats for the last pid which opened a throttle window for the throttled thread.
2035                  * This might not be completely accurate since the multiple throttles seen by the lower tier pid
2036                  * might have been caused by various higher prio pids. However, updating these stats accurately
2037                  * means doing a proc_find while holding the throttle lock which leads to deadlock.
2038                  */
2039                 throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
2040         }
2041
2042         ut->uu_throttle_info = NULL;
2043         ut->uu_throttle_bc = false;
2044         ut->uu_lowpri_window = 0;
2045
2046         throttle_info_rel(info);
2047
2048         return sleep_cnt;
2049 }
2050
2051 /*
2052  *  returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept
2053  *  This function mimics the most of the throttle_lowpri_io checks but without actual sleeping
2054  */
2055 int
2056 throttle_lowpri_io_will_be_throttled(int sleep_amount)
2057 {
2058         if (sleep_amount == 0) {
2059                 return FALSE;
2060         }
2061
2062         uthread_t ut = get_bsdthread_info(current_thread());
2063         if (ut->uu_lowpri_window == 0) {
2064                 return FALSE;
2065         }
2066
2067         struct _throttle_io_info_t *info = ut->uu_throttle_info;
2068         if (info == NULL) {
2069                 return FALSE;
2070         }
2071
2072         lck_mtx_lock(&info->throttle_lock);
2073         assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
2074
2075         if (sleep_amount == 1 && !ut->uu_throttle_bc) {
2076                 sleep_amount = 0;
2077         }
2078
2079         int result = FALSE;
2080
2081         int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL);
2082         if (throttle_type > THROTTLE_DISENGAGED) {
2083                 result = TRUE;
2084                 if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) {
2085                         result = FALSE;
2086                 }
2087         }
2088
2089         lck_mtx_unlock(&info->throttle_lock);
2090
2091         return result;
2092 }
2093
2094
2095 /*
2096  * KPI routine
2097  *
2098  * set a kernel thread's IO policy.  policy can be:
2099  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
2100  *
2101  * explanations about these policies are in the man page of setiopolicy_np
2102  */
2103 void
2104 throttle_set_thread_io_policy(int policy)
2105 {
2106         proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
2107 }
2108
2109 int
2110 throttle_get_thread_effective_io_policy()
2111 {
2112         return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
2113 }
2114
2115 void
2116 throttle_info_reset_window(uthread_t ut)
2117 {
2118         struct _throttle_io_info_t *info;
2119
2120         if (ut == NULL) {
2121                 ut = get_bsdthread_info(current_thread());
2122         }
2123
2124         if ((info = ut->uu_throttle_info)) {
2125                 throttle_info_rel(info);
2126
2127                 ut->uu_throttle_info = NULL;
2128                 ut->uu_lowpri_window = 0;
2129                 ut->uu_throttle_bc = false;
2130         }
2131 }
2132
2133 static
2134 void
2135 throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
2136 {
2137         if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2138                 return;
2139         }
2140
2141         if (info->throttle_io_periods == 0) {
2142                 throttle_init_throttle_period(info, isssd);
2143         }
2144         if (ut->uu_throttle_info == NULL) {
2145                 ut->uu_throttle_info = info;
2146                 throttle_info_ref(info);
2147                 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
2148
2149                 ut->uu_lowpri_window = 1;
2150                 ut->uu_throttle_bc = BC_throttle;
2151         }
2152 }
2153
2154 /*
2155  * Update inflight IO count and throttling window
2156  * Should be called when an IO is done
2157  *
2158  * Only affects IO that was sent through spec_strategy
2159  */
2160 void
2161 throttle_info_end_io(buf_t bp)
2162 {
2163         mount_t mp;
2164         struct bufattr *bap;
2165         struct _throttle_io_info_t *info;
2166         int io_tier;
2167
2168         bap = &bp->b_attr;
2169         if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
2170                 return;
2171         }
2172         CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2173
2174         mp = buf_vnode(bp)->v_mount;
2175         if (mp != NULL) {
2176                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2177         } else {
2178                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2179         }
2180
2181         io_tier = GET_BUFATTR_IO_TIER(bap);
2182         if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2183                 io_tier--;
2184         }
2185
2186         throttle_info_end_io_internal(info, io_tier);
2187 }
2188
2189 /*
2190  * Decrement inflight count initially incremented by throttle_info_update_internal
2191  */
2192 static
2193 void
2194 throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level)
2195 {
2196         if (throttle_level == THROTTLE_LEVEL_NONE) {
2197                 return;
2198         }
2199
2200         microuptime(&info->throttle_window_start_timestamp[throttle_level]);
2201         OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
2202         assert(info->throttle_inflight_count[throttle_level] >= 0);
2203 }
2204
2205 /*
2206  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
2207  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
2208  */
2209 static
2210 int
2211 throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
2212 {
2213         int     thread_throttle_level;
2214
2215         if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2216                 return THROTTLE_LEVEL_NONE;
2217         }
2218
2219         if (ut == NULL) {
2220                 ut = get_bsdthread_info(current_thread());
2221         }
2222
2223         if (bap && inflight && !ut->uu_throttle_bc) {
2224                 thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
2225                 if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2226                         thread_throttle_level--;
2227                 }
2228         } else {
2229                 thread_throttle_level = throttle_get_thread_throttle_level(ut);
2230         }
2231
2232         if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
2233                 if (!ISSET(flags, B_PASSIVE)) {
2234                         info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
2235                         if (inflight && !ut->uu_throttle_bc) {
2236                                 if (NULL != bap) {
2237                                         SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2238                                 }
2239                                 OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
2240                         } else {
2241                                 microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
2242                         }
2243                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
2244                             current_proc()->p_pid, thread_throttle_level, 0, 0, 0);
2245                 }
2246                 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2247         }
2248
2249
2250         if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2251                 /*
2252                  * I'd really like to do the IOSleep here, but
2253                  * we may be holding all kinds of filesystem related locks
2254                  * and the pages for this I/O marked 'busy'...
2255                  * we don't want to cause a normal task to block on
2256                  * one of these locks while we're throttling a task marked
2257                  * for low priority I/O... we'll mark the uthread and
2258                  * do the delay just before we return from the system
2259                  * call that triggered this I/O or from vnode_pagein
2260                  */
2261                 OSAddAtomic(1, &info->throttle_io_count);
2262
2263                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2264         }
2265
2266         return thread_throttle_level;
2267 }
2268
2269 void *
2270 throttle_info_update_by_mount(mount_t mp)
2271 {
2272         struct _throttle_io_info_t *info;
2273         uthread_t ut;
2274         boolean_t isssd = FALSE;
2275
2276         ut = get_bsdthread_info(current_thread());
2277
2278         if (mp != NULL) {
2279                 if (disk_conditioner_mount_is_ssd(mp)) {
2280                         isssd = TRUE;
2281                 }
2282                 info = &_throttle_io_info[mp->mnt_devbsdunit];
2283         } else {
2284                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2285         }
2286
2287         if (!ut->uu_lowpri_window) {
2288                 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2289         }
2290
2291         return info;
2292 }
2293
2294
2295 /*
2296  * KPI routine
2297  *
2298  * this is usually called before every I/O, used for throttled I/O
2299  * book keeping.  This routine has low overhead and does not sleep
2300  */
2301 void
2302 throttle_info_update(void *throttle_info, int flags)
2303 {
2304         if (throttle_info) {
2305                 throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2306         }
2307 }
2308
/*
 * KPI routine
 *
 * Handle-based variant of throttle_info_update(): usually called before
 * every I/O for throttled I/O book keeping.  This routine has low overhead
 * and does not sleep.
 */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
        /*
         * Only the lowest bit of the throttle mask is used for now, which
         * makes the handle and the throttle_info one and the same.  If the
         * handle ever stores a set of throttle infos, this will need to
         * iterate over them and call throttle_info_update() on each.
         */
        throttle_info_update(throttle_info_handle, flags);
}
2328 /*
2329  * KPI routine
2330  *
2331  * This routine marks the throttle info as disabled. Used for mount points which
2332  * support I/O scheduling.
2333  */
2334
2335 void
2336 throttle_info_disable_throttle(int devno, boolean_t isfusion)
2337 {
2338         struct _throttle_io_info_t *info;
2339
2340         if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV) {
2341                 panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2342         }
2343
2344         info = &_throttle_io_info[devno];
2345         // don't disable software throttling on devices that are part of a fusion device
2346         // and override the software throttle periods to use HDD periods
2347         if (isfusion) {
2348                 info->throttle_is_fusion_with_priority = isfusion;
2349                 throttle_init_throttle_period(info, FALSE);
2350         }
2351         info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2352         return;
2353 }
2354
2355
2356 /*
2357  * KPI routine (private)
2358  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2359  */
2360 int
2361 throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2362 {
2363         struct _throttle_io_info_t *info = throttle_info;
2364         struct timeval elapsed;
2365         uint64_t elapsed_msecs;
2366         int     throttle_level;
2367         int     thread_throttle_level;
2368
2369         switch (policy) {
2370         case IOPOL_THROTTLE:
2371                 thread_throttle_level = THROTTLE_LEVEL_TIER3;
2372                 break;
2373         case IOPOL_UTILITY:
2374                 thread_throttle_level = THROTTLE_LEVEL_TIER2;
2375                 break;
2376         case IOPOL_STANDARD:
2377                 thread_throttle_level = THROTTLE_LEVEL_TIER1;
2378                 break;
2379         default:
2380                 thread_throttle_level = THROTTLE_LEVEL_TIER0;
2381                 break;
2382         }
2383         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2384                 if (info->throttle_inflight_count[throttle_level]) {
2385                         break;
2386                 }
2387
2388                 microuptime(&elapsed);
2389                 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2390                 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2391
2392                 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
2393                         break;
2394                 }
2395         }
2396         if (throttle_level >= thread_throttle_level) {
2397                 /*
2398                  * we're beyond all of the throttle windows
2399                  * so go ahead and treat as normal I/O
2400                  */
2401                 return THROTTLE_DISENGAGED;
2402         }
2403         /*
2404          * we're in the throttle window
2405          */
2406         return THROTTLE_ENGAGED;
2407 }
2408
2409 int
2410 throttle_lowpri_window(void)
2411 {
2412         struct uthread *ut = get_bsdthread_info(current_thread());
2413         return ut->uu_lowpri_window;
2414 }
2415
2416
/*
 * Prototype for upl_get_cached_tier(), which spec_strategy() calls on
 * bp->b_upl for B_CLUSTER buffers when CONFIG_IOSCHED is enabled.  A return
 * value of -1 is treated there as "no cached tier".
 */
#if CONFIG_IOSCHED
int upl_get_cached_tier(void *);
#endif
2420
/*
 * spec_strategy
 *
 * Strategy vnop for special (device) vnodes.  Classifies the I/O described
 * by the buffer (I/O tier, passive, metadata, paging, no-cache, tier
 * upgrade), records that classification in the buffer attributes and in a
 * DKIO trace code, updates the throttle bookkeeping for the backing device
 * and the pending-I/O counters on the mount, and finally hands the buffer
 * to the block device's d_strategy routine.
 *
 * Parameters:	ap->a_bp	buffer describing the I/O to issue
 *
 * Returns:	0		always; the value returned by the driver's
 *				strategy routine is only compared against
 *				the two BootCache sentinel values below
 */
int
spec_strategy(struct vnop_strategy_args *ap)
{
        buf_t   bp;
        int     bflags;
        int     io_tier;
        int     passive;
        dev_t   bdev;
        uthread_t ut;
        mount_t mp;
        struct  bufattr *bap;
        int     strategy_ret;
        struct _throttle_io_info_t *throttle_info;
        boolean_t isssd = FALSE;
        boolean_t inflight = FALSE;
        boolean_t upgrade = FALSE;
        int code = 0;

#if !CONFIG_EMBEDDED
        proc_t curproc = current_proc();
#endif /* !CONFIG_EMBEDDED */

        bp = ap->a_bp;
        bdev = buf_device(bp);
        mp = buf_vnode(bp)->v_mount;
        bap = &bp->b_attr;

#if CONFIG_IOSCHED
        /*
         * Cluster I/O prefers the tier cached on the UPL; -1 means no tier
         * was cached, in which case the thread's I/O policy is used.
         */
        if (bp->b_flags & B_CLUSTER) {
                io_tier = upl_get_cached_tier(bp->b_upl);

                if (io_tier == -1) {
                        io_tier = throttle_get_io_policy(&ut);
                }
#if DEVELOPMENT || DEBUG
                else {
                        /* trace any mismatch between the cached tier and the thread's policy */
                        int my_io_tier = throttle_get_io_policy(&ut);

                        if (io_tier != my_io_tier) {
                                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
                        }
                }
#endif
        } else {
                io_tier = throttle_get_io_policy(&ut);
        }
#else
        io_tier = throttle_get_io_policy(&ut);
#endif
        /*
         * NOTE(review): on the path above where the cached UPL tier is used
         * (non-DEVELOPMENT/DEBUG build), this is the only call that is handed
         * &ut before ut is read below -- presumably it fills ut in; confirm.
         */
        passive = throttle_get_passive_io_policy(&ut);

        /*
         * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
         * while preserving the original issued tier (throttle_get_io_policy
         * does not return upgraded tiers)
         */
        if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
#if CONFIG_IOSCHED
                if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
                        upgrade = TRUE;
                }
#else /* CONFIG_IOSCHED */
                upgrade = TRUE;
#endif /* CONFIG_IOSCHED */
        }

        /* propagate the buffer's metadata flag into the buffer attributes */
        if (bp->b_flags & B_META) {
                bap->ba_flags |= BA_META;
        }

#if CONFIG_IOSCHED
        /*
         * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
         * To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
         * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
         * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
         */
        if (bap->ba_flags & BA_META) {
                if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
                        if (bp->b_flags & B_READ) {
                                if (io_tier > IOSCHED_METADATA_TIER) {
                                        io_tier = IOSCHED_METADATA_TIER;
                                        passive = 1;
                                }
                        } else {
                                io_tier = IOSCHED_METADATA_TIER;
                                passive = 1;
                        }
                }
        }
#endif /* CONFIG_IOSCHED */

        SET_BUFATTR_IO_TIER(bap, io_tier);

        if (passive) {
                bp->b_flags |= B_PASSIVE;
                bap->ba_flags |= BA_PASSIVE;
        }

#if !CONFIG_EMBEDDED
        if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) {
                bap->ba_flags |= BA_DELAYIDLESLEEP;
        }
#endif /* !CONFIG_EMBEDDED */

        /*
         * Snapshot the buffer flags here, after B_PASSIVE may have been set
         * above; the snapshot is what gets used from this point on.
         */
        bflags = bp->b_flags;

        /* synchronous writes are marked for quick completion */
        if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) {
                bufattr_markquickcomplete(bap);
        }

        /*
         * Build the DKIO code that describes this I/O; it is used both for
         * the kdebug tracepoint and for the per-thread I/O statistics.
         */
        if (bflags & B_READ) {
                code |= DKIO_READ;
        }
        if (bflags & B_ASYNC) {
                code |= DKIO_ASYNC;
        }

        /* metadata takes precedence over paging in the trace code */
        if (bap->ba_flags & BA_META) {
                code |= DKIO_META;
        } else if (bflags & B_PAGEIO) {
                code |= DKIO_PAGING;
        }

        if (io_tier != 0) {
                code |= DKIO_THROTTLE;
        }

        code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);

        if (bflags & B_PASSIVE) {
                code |= DKIO_PASSIVE;
        }

        if (bap->ba_flags & BA_NOCACHE) {
                code |= DKIO_NOCACHE;
        }

        if (upgrade) {
                code |= DKIO_TIER_UPGRADE;
                SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
        }

        if (kdebug_enable) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
                    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
        }

        thread_update_io_stats(current_thread(), buf_count(bp), code);

        /*
         * Pick the throttle info slot: the mount's devbsdunit when there is a
         * mount, otherwise the last slot of _throttle_io_info.
         */
        if (mp != NULL) {
                if (disk_conditioner_mount_is_ssd(mp)) {
                        isssd = TRUE;
                }
                /*
                 * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
                 * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
                 * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
                 */
                if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
                        inflight = TRUE;
                }
                throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
        } else {
                throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
        }

        throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);

        /*
         * Writes stamp the throttle info (and the mount, if any) with the
         * issue time; both reads and writes are added to the mount's
         * pending-I/O size counters.
         */
        if ((bflags & B_READ) == 0) {
                microuptime(&throttle_info->throttle_last_write_timestamp);

                if (mp) {
                        mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
                        INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
                }
        } else if (mp) {
                INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
        }
        /*
         * The BootCache may give us special information about
         * the IO, so it returns special values that we check
         * for here.
         *
         * IO_SATISFIED_BY_CACHE
         * The read has been satisfied by the boot cache. Don't
         * throttle the thread unnecessarily.
         *
         * IO_SHOULD_BE_THROTTLED
         * The boot cache is playing back a playlist and this IO
         * cut through. Throttle it so we're not cutting through
         * the boot cache too often.
         *
         * Note that typical strategy routines are defined with
         * a void return so we'll get garbage here. In the
         * unlikely case the garbage matches our special return
         * value, it's not a big deal since we're only adjusting
         * the throttling delay.
         */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
        typedef int strategy_fcn_ret_t(struct buf *bp);

        /* call the driver's strategy routine through an int-returning signature */
        strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

        // disk conditioner needs to track when this I/O actually starts
        // which means track it after `strategy` which may include delays
        // from inflight I/Os
        microuptime(&bp->b_timestamp_tv);

        if (IO_SATISFIED_BY_CACHE == strategy_ret) {
                /*
                 * If this was a throttled IO satisfied by the boot cache,
                 * don't delay the thread.
                 */
                throttle_info_reset_window(ut);
        } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
                /*
                 * If the boot cache indicates this IO should be throttled,
                 * delay the thread.
                 */
                throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
        }
        return 0;
}
2646
2647
/*
 * Block mapping is not supported on special devices: the arguments are
 * ignored and ENOTSUP is always returned.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
        return ENOTSUP;
}
2656
2657
/*
 * Device close routine
 *
 * Handles close of character (VCHR) and block (VBLK) special vnodes and
 * panics for any other vnode type.  In both cases the specinfo open count
 * is decremented under the devsw lock and the driver's d_close entry point
 * is only invoked once vcount() reports no remaining references (or, for
 * character devices, when IO_REVOKE is set in the close flags).
 *
 * Returns:	0		success, or close not forwarded to driver
 *		!0		error from spec_fsync_internal(),
 *				buf_invalidateblks() or the driver's d_close
 */
int
spec_close(struct vnop_close_args *ap)
{
        struct vnode *vp = ap->a_vp;
        dev_t dev = vp->v_rdev;
        int error = 0;
        int flags = ap->a_fflag;
        struct proc *p = vfs_context_proc(ap->a_context);
        struct session *sessp;

        switch (vp->v_type) {
        case VCHR:
                /*
                 * Hack: a tty device that is a controlling terminal
                 * has a reference from the session structure.
                 * We cannot easily tell that a character device is
                 * a controlling terminal, unless it is the closing
                 * process' controlling terminal.  In that case,
                 * if the reference count is 1 (this is the very
                 * last close), clear the session's controlling
                 * terminal fields and release the tty with ttyfree().
                 */
                sessp = proc_session(p);
                devsw_lock(dev, S_IFCHR);
                if (sessp != SESSION_NULL) {
                        if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
                                struct tty *tp = TTY_NULL;

                                /*
                                 * The devsw lock is dropped while the session
                                 * and tty locks are taken, and re-taken once
                                 * the tty has been dealt with.
                                 */
                                devsw_unlock(dev, S_IFCHR);
                                session_lock(sessp);
                                /* re-check under the session lock before clearing */
                                if (vp == sessp->s_ttyvp) {
                                        tp = SESSION_TP(sessp);
                                        sessp->s_ttyvp = NULL;
                                        sessp->s_ttyvid = 0;
                                        sessp->s_ttyp = TTY_NULL;
                                        sessp->s_ttypgrpid = NO_PID;
                                }
                                session_unlock(sessp);

                                if (tp != TTY_NULL) {
                                        /*
                                         * We may have won a race with a proc_exit
                                         * of the session leader, the winner
                                         * clears the flag (even if not set)
                                         */
                                        tty_lock(tp);
                                        ttyclrpgrphup(tp);
                                        tty_unlock(tp);

                                        ttyfree(tp);
                                }
                                devsw_lock(dev, S_IFCHR);
                        }
                        /* drop the session reference obtained from proc_session() above */
                        session_rele(sessp);
                }

                if (--vp->v_specinfo->si_opencount < 0) {
                        panic("negative open count (c, %u, %u)", major(dev), minor(dev));
                }

                /*
                 * close on last reference or on vnode revoke call
                 */
                if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) {
                        error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
                }

                devsw_unlock(dev, S_IFCHR);
                break;

        case VBLK:
                /*
                 * If there is more than one outstanding open, don't
                 * send the close to the device.
                 */
                devsw_lock(dev, S_IFBLK);
                if (vcount(vp) > 1) {
                        vp->v_specinfo->si_opencount--;
                        devsw_unlock(dev, S_IFBLK);
                        return 0;
                }
                devsw_unlock(dev, S_IFBLK);

                /*
                 * On last close of a block device (that isn't mounted)
                 * we must invalidate any in core blocks, so that
                 * we can, for instance, change floppy disks.
                 *
                 * NOTE(review): both error returns below leave the function
                 * before si_opencount is decremented -- confirm intended.
                 */
                if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) {
                        return error;
                }

                error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
                if (error) {
                        return error;
                }

                /* the devsw lock was dropped for the flush above; re-take it */
                devsw_lock(dev, S_IFBLK);

                if (--vp->v_specinfo->si_opencount < 0) {
                        panic("negative open count (b, %u, %u)", major(dev), minor(dev));
                }

                /* vcount() is re-evaluated here, after the lock was dropped and re-taken */
                if (vcount(vp) == 0) {
                        error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
                }

                devsw_unlock(dev, S_IFBLK);
                break;

        default:
                panic("spec_close: not special");
                return EBADF;
        }

        return error;
}
2777
2778 /*
2779  * Return POSIX pathconf information applicable to special devices.
2780  */
2781 int
2782 spec_pathconf(struct vnop_pathconf_args *ap)
2783 {
2784         switch (ap->a_name) {
2785         case _PC_LINK_MAX:
2786                 *ap->a_retval = LINK_MAX;
2787                 return 0;
2788         case _PC_MAX_CANON:
2789                 *ap->a_retval = MAX_CANON;
2790                 return 0;
2791         case _PC_MAX_INPUT:
2792                 *ap->a_retval = MAX_INPUT;
2793                 return 0;
2794         case _PC_PIPE_BUF:
2795                 *ap->a_retval = PIPE_BUF;
2796                 return 0;
2797         case _PC_CHOWN_RESTRICTED:
2798                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2799                 return 0;
2800         case _PC_VDISABLE:
2801                 *ap->a_retval = _POSIX_VDISABLE;
2802                 return 0;
2803         default:
2804                 return EINVAL;
2805         }
2806         /* NOTREACHED */
2807 }
2808
/*
 * Special device failed operation
 *
 * Ignores its argument and unconditionally returns EBADF.
 */
int
spec_ebadf(__unused void *dummy)
{
        return EBADF;
}
2817
2818 /* Blktooff derives file offset from logical block number */
2819 int
2820 spec_blktooff(struct vnop_blktooff_args *ap)
2821 {
2822         struct vnode *vp = ap->a_vp;
2823
2824         switch (vp->v_type) {
2825         case VCHR:
2826                 *ap->a_offset = (off_t)-1; /* failure */
2827                 return ENOTSUP;
2828
2829         case VBLK:
2830                 printf("spec_blktooff: not implemented for VBLK\n");
2831                 *ap->a_offset = (off_t)-1; /* failure */
2832                 return ENOTSUP;
2833
2834         default:
2835                 panic("spec_blktooff type");
2836         }
2837         /* NOTREACHED */
2838
2839         return 0;
2840 }
2841
2842 /* Offtoblk derives logical block number from file offset */
2843 int
2844 spec_offtoblk(struct vnop_offtoblk_args *ap)
2845 {
2846         struct vnode *vp = ap->a_vp;
2847
2848         switch (vp->v_type) {
2849         case VCHR:
2850                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2851                 return ENOTSUP;
2852
2853         case VBLK:
2854                 printf("spec_offtoblk: not implemented for VBLK\n");
2855                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2856                 return ENOTSUP;
2857
2858         default:
2859                 panic("spec_offtoblk type");
2860         }
2861         /* NOTREACHED */
2862
2863         return 0;
2864 }
2865
/* Forward declarations of the filter routines referenced by spec_filtops. */
static void filt_specdetach(struct knote *kn);
static int filt_specevent(struct knote *kn, long hint);
static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev);
static int filt_specpeek(struct knote *kn);

/*
 * Filter operations table for knotes attached to special device vnodes.
 * NOTE(review): f_isfd = 1 presumably marks the filter as file-descriptor
 * based -- confirm against the struct filterops definition.
 */
SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
        .f_isfd    = 1,
        .f_attach  = filt_specattach,
        .f_detach  = filt_specdetach,
        .f_event   = filt_specevent,
        .f_touch   = filt_spectouch,
        .f_process = filt_specprocess,
        .f_peek    = filt_specpeek
};
2881
2882
/*
 * Given a waitq that is assumed to be embedded within a selinfo structure,
 * return the containing selinfo structure. While 'wq' is not really a queue
 * element, this macro simply reuses qe_element() to do the offsetof-style
 * calculation that gets back to the containing struct, given the struct
 * type (struct selinfo) and the member name (si_waitq).
 */
#define selinfo_from_waitq(wq) \
        qe_element((wq), struct selinfo, si_waitq)
2891
/*
 * spec_knote_select_and_link
 *
 * Run VNOP_SELECT on the vnode behind the knote with the thread's waitq set
 * temporarily pointed at the knote's kqueue, so that a driver calling
 * selrecord() links its selinfo waitq to the kqueue.  When that happens the
 * waitq's prepost ID is remembered in kn->kn_hook_waitqid.
 *
 * Errors are reported on the knote rather than through the return value:
 * ENOENT if an iocount could not be taken on the vnode, ENODEV if the
 * device reported no data and did not call selrecord().
 *
 * Returns:	the value returned by VNOP_SELECT, or 0 if the vnode could
 *		not be referenced
 */
static int
spec_knote_select_and_link(struct knote *kn)
{
        uthread_t uth;
        vfs_context_t ctx;
        vnode_t vp;
        struct waitq_set *old_wqs;
        uint64_t rsvd, rsvd_arg;
        uint64_t *rlptr = NULL;
        struct selinfo *si = NULL;
        int selres = 0;

        uth = get_bsdthread_info(current_thread());

        ctx = vfs_context_current();
        vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;

        /* take an iocount for the duration of the select; dropped by vnode_put() below */
        int error = vnode_getwithvid(vp, vnode_vid(vp));
        if (error != 0) {
                knote_set_error(kn, ENOENT);
                return 0;
        }

        /*
         * This function may be called many times to link or re-link the
         * underlying vnode to the kqueue.  If we've already linked the two,
         * we will have a valid kn_hook_waitqid which ties us to the underlying
         * device's waitq via the waitq's prepost table object. However,
         * devices can abort any select action by calling selthreadclear().
         * This is OK because the table object will be invalidated by the
         * driver (through a call to selthreadclear), so any attempt to access
         * the associated waitq will fail because the table object is invalid.
         *
         * Even if we've already registered, we need to pass a pointer
         * to a reserved link structure. Otherwise, selrecord() will
         * infer that we're in the second pass of select() and won't
         * actually do anything!
         *
         * rsvd keeps the reserved value; rsvd_arg is the copy handed to
         * VNOP_SELECT so that a change can be detected afterwards.
         */
        rsvd = rsvd_arg = waitq_link_reserve(NULL);
        rlptr = (void *)&rsvd_arg;

        /*
         * Trick selrecord() into hooking kqueue's wait queue set into the device's
         * selinfo wait queue.
         */
        old_wqs = uth->uu_wqset;
        uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);

        /*
         * Be sure that the waitq set is linked
         * before calling select to avoid possible
         * allocation under spinlocks.
         */
        waitq_set_lazy_init_link(uth->uu_wqset);

        /*
         * Now these are the laws of VNOP_SELECT, as old and as true as the sky,
         * And the device that shall keep it may prosper, but the device that shall
         * break it must receive ENODEV:
         *
         * 1. Take a lock to protect against other selects on the same vnode.
         * 2. Return 1 if data is ready to be read.
         * 3. Return 0 and call `selrecord` on a handy `selinfo` structure if there
         *    is no data.
         * 4. Call `selwakeup` when the vnode has an active `selrecord` and data
         *    can be read or written (depending on the seltype).
         * 5. If there's a `selrecord` and no corresponding `selwakeup`, but the
         *    vnode is going away, call `selthreadclear`.
         */
        selres = VNOP_SELECT(vp, knote_get_seltype(kn), 0, rlptr, ctx);
        /* restore the thread's own waitq set */
        uth->uu_wqset = old_wqs;

        /*
         * Make sure to cleanup the reserved link - this guards against
         * drivers that may not actually call selrecord().
         */
        waitq_link_release(rsvd);
        if (rsvd != rsvd_arg) {
                /* The driver / handler called selrecord() */
                struct waitq *wq;
                /* the slot that held the reservation is now read back as a waitq pointer */
                memcpy(&wq, rlptr, sizeof(void *));

                /*
                 * The waitq is part of the selinfo structure managed by the
                 * driver. For certain drivers, we want to hook the knote into
                 * the selinfo structure's si_note field so selwakeup can call
                 * KNOTE.
                 *
                 * NOTE(review): si is assigned here but not read again in
                 * this function.
                 */
                si = selinfo_from_waitq(wq);

                /*
                 * The waitq_get_prepost_id() function will (potentially)
                 * allocate a prepost table object for the waitq and return
                 * the table object's ID to us.  It will also set the
                 * waitq_prepost_id field within the waitq structure.
                 *
                 * We can just overwrite kn_hook_waitqid because it's simply a
                 * table ID used to grab a reference when needed.
                 *
                 * We have a reference on the vnode, so we know that the
                 * device won't go away while we get this ID.
                 *
                 * Note: on 32bit this field is 32bit only.
                 */
                kn->kn_hook_waitqid = (typeof(kn->kn_hook_waitqid))waitq_get_prepost_id(wq);
        } else if (selres == 0) {
                /*
                 * The device indicated that there's no data to read, but didn't call
                 * `selrecord`.  Nothing will be notified of changes to this vnode, so
                 * return an error back to user space, to make it clear that the knote
                 * is not attached.
                 */
                knote_set_error(kn, ENODEV);
        }

        vnode_put(vp);

        return selres;
}
3011
3012 static int
3013 filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, int selres)
3014 {
3015         int64_t data;
3016         int ret;
3017
3018         if (kn->kn_vnode_use_ofst) {
3019                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
3020                         data = 0;
3021                 } else {
3022                         data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
3023                 }
3024         } else {
3025                 data = selres;
3026         }
3027
3028         ret = data >= knote_low_watermark(kn);
3029
3030         if (ret && kev) {
3031                 knote_fill_kevent(kn, kev, data);
3032         }
3033
3034         return ret;
3035 }
3036
3037 static int
3038 filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev)
3039 {
3040         vnode_t vp;
3041         dev_t dev;
3042
3043         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
3044
3045         assert(vnode_ischr(vp));
3046
3047         dev = vnode_specrdev(vp);
3048
3049         /*
3050          * For a few special kinds of devices, we can attach knotes with
3051          * no restrictions because their "select" vectors return the amount
3052          * of data available.  Others require an explicit NOTE_LOWAT with
3053          * data of 1, indicating that the caller doesn't care about actual
3054          * data counts, just an indication that the device has data.
3055          */
3056         if (!kn->kn_vnode_kqok &&
3057             ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
3058                 knote_set_error(kn, EINVAL);
3059                 return 0;
3060         }
3061
3062         /*
3063          * This forces the select fallback to call through VNOP_SELECT and hook
3064          * up selinfo on every filter routine.
3065          *
3066          * Pseudo-terminal controllers are opted out of native kevent support --
3067          * remove this when they get their own EVFILTID.
3068          */
3069         if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
3070                 kn->kn_vnode_kqok = 0;
3071         }
3072
3073         kn->kn_filtid = EVFILTID_SPEC;
3074         kn->kn_hook_waitqid = 0;
3075
3076         knote_markstayactive(kn);
3077         return spec_knote_select_and_link(kn);
3078 }
3079
3080 static void
3081 filt_specdetach(struct knote *kn)
3082 {
3083         knote_clearstayactive(kn);
3084
3085         /*
3086          * This is potentially tricky: the device's selinfo waitq that was
3087          * tricked into being part of this knote's waitq set may not be a part
3088          * of any other set, and the device itself may have revoked the memory
3089          * in which the waitq was held. We use the knote's kn_hook_waitqid field
3090          * to keep the ID of the waitq's prepost table object. This
3091          * object keeps a pointer back to the waitq, and gives us a safe way
3092          * to decouple the dereferencing of driver allocated memory: if the
3093          * driver goes away (taking the waitq with it) then the prepost table
3094          * object will be invalidated. The waitq details are handled in the
3095          * waitq API invoked here.
3096          */
3097         if (kn->kn_hook_waitqid) {
3098                 waitq_unlink_by_prepost_id(kn->kn_hook_waitqid, &(knote_get_kq(kn)->kq_wqs));
3099                 kn->kn_hook_waitqid = 0;
3100         }
3101 }
3102
3103 static int
3104 filt_specevent(struct knote *kn, __unused long hint)
3105 {
3106         /*
3107          * Nothing should call knote or knote_vanish on this knote.
3108          */
3109         panic("filt_specevent(%p)", kn);
3110         return 0;
3111 }
3112
3113 static int
3114 filt_spectouch(struct knote *kn, struct kevent_qos_s *kev)
3115 {
3116         kn->kn_sdata = kev->data;
3117         kn->kn_sfflags = kev->fflags;
3118
3119         if (kev->flags & EV_ENABLE) {
3120                 return spec_knote_select_and_link(kn);
3121         }
3122
3123         return 0;
3124 }
3125
3126 static int
3127 filt_specprocess(struct knote *kn, struct kevent_qos_s *kev)
3128 {
3129         vnode_t vp;
3130         uthread_t uth;
3131         vfs_context_t ctx;
3132         int res;
3133         int selres;
3134         int error;
3135
3136         uth = get_bsdthread_info(current_thread());
3137         ctx = vfs_context_current();
3138         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
3139
3140         error = vnode_getwithvid(vp, vnode_vid(vp));
3141         if (error != 0) {
3142                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3143                 knote_fill_kevent(kn, kev, 0);
3144                 return 1;
3145         }
3146
3147         selres = spec_knote_select_and_link(kn);
3148         res = filt_spec_common(kn, kev, selres);
3149
3150         vnode_put(vp);
3151
3152         return res;
3153 }
3154
3155 static int
3156 filt_specpeek(struct knote *kn)
3157 {
3158         int selres = 0;
3159
3160         selres = spec_knote_select_and_link(kn);
3161         return filt_spec_common(kn, NULL, selres);
3162 }