bsd/miscfs/specfs/spec_vnops.c

   1 /*
   2  * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1989, 1993, 1995
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/kauth.h>
  67 #include <sys/systm.h>
  68 #include <sys/kernel.h>
  69 #include <sys/conf.h>
  70 #include <sys/buf_internal.h>
  71 #include <sys/mount_internal.h>
  72 #include <sys/namei.h>
  73 #include <sys/vnode_internal.h>
  74 #include <sys/stat.h>
  75 #include <sys/errno.h>
  76 #include <sys/ioctl.h>
  77 #include <sys/file.h>
  78 #include <sys/user.h>
  79 #include <sys/malloc.h>
  80 #include <sys/disk.h>
  81 #include <sys/uio_internal.h>
  82 #include <sys/resource.h>
  83 #include <miscfs/specfs/specdev.h>
  84 #include <vfs/vfs_support.h>
  85
  86 #include <sys/kdebug.h>
  87
  88 /* XXX following three prototypes should be in a header file somewhere */
  89 extern int      isdisk(dev_t dev, int type);
  90 extern dev_t    chrtoblk(dev_t dev);
  91 extern int      iskmemdev(dev_t dev);
  92
  93 struct vnode *speclisth[SPECHSZ];
  94
  95 /* symbolic sleep message strings for devices */
  96 char    devopn[] = "devopn";
  97 char    devio[] = "devio";
  98 char    devwait[] = "devwait";
  99 char    devin[] = "devin";
 100 char    devout[] = "devout";
 101 char    devioc[] = "devioc";
 102 char    devcls[] = "devcls";
 103
 104 #define VOPFUNC int (*)(void *)
 105
 106 int (**spec_vnodeop_p)(void *);
 107 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 108         { &vnop_default_desc, (VOPFUNC)vn_default_error },
 109         { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
 110         { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
 111         { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
 112         { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
 113         { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
 114         { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
 115         { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
 116         { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
 117         { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
 118         { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
 119         { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
 120         { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
 121         { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
 122         { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
 123         { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
 124         { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
 125         { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
 126         { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
 127         { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
 128         { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
 129         { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
 130         { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
 131         { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
 132         { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
 133         { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
 134         { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
 135         { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
 136         { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
 137         { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
 138         { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
 139         { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
 140         { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
 141         { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
 142         { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
 143         { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
 144         { (struct vnodeop_desc*)NULL, (int(*)())NULL }
 145 };
 146 struct vnodeopv_desc spec_vnodeop_opv_desc =
 147         { &spec_vnodeop_p, spec_vnodeop_entries };
 148
 149
 150 static void set_blocksize(vnode_t, dev_t);
 151
 152
 153 /*
 154  * Trivial lookup routine that always fails.
 155  */
 156 int
 157 spec_lookup(struct vnop_lookup_args *ap)
 158 {
 159
 160         *ap->a_vpp = NULL;
 161         return (ENOTDIR);
 162 }
 163
 164 static void
 165 set_blocksize(struct vnode *vp, dev_t dev)
 166 {
 167     int (*size)(dev_t);
 168     int rsize;
 169
 170     if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
 171         rsize = (*size)(dev);
 172         if (rsize <= 0)        /* did size fail? */
 173             vp->v_specsize = DEV_BSIZE;
 174         else
 175             vp->v_specsize = rsize;
 176     }
 177     else
 178             vp->v_specsize = DEV_BSIZE;
 179 }
 180
 181 void
 182 set_fsblocksize(struct vnode *vp)
 183 {
 184
 185         if (vp->v_type == VBLK) {
 186                 dev_t dev = (dev_t)vp->v_rdev;
 187                 int maj = major(dev);
 188
 189                 if ((u_int)maj >= (u_int)nblkdev)
 190                         return;
 191
 192                 vnode_lock(vp);
 193                 set_blocksize(vp, dev);
 194                 vnode_unlock(vp);
 195         }
 196
 197 }
 198
 199
 200 /*
 201  * Open a special file.
 202  */
 203 int
 204 spec_open(struct vnop_open_args *ap)
 205 {
 206         struct proc *p = vfs_context_proc(ap->a_context);
 207         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
 208         struct vnode *vp = ap->a_vp;
 209         dev_t bdev, dev = (dev_t)vp->v_rdev;
 210         int maj = major(dev);
 211         int error;
 212
 213         /*
 214          * Don't allow open if fs is mounted -nodev.
 215          */
 216         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
 217                 return (ENXIO);
 218
 219         switch (vp->v_type) {
 220
 221         case VCHR:
 222                 if ((u_int)maj >= (u_int)nchrdev)
 223                         return (ENXIO);
 224                 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
 225                         /*
 226                          * When running in very secure mode, do not allow
 227                          * opens for writing of any disk character devices.
 228                          */
 229                         if (securelevel >= 2 && isdisk(dev, VCHR))
 230                                 return (EPERM);
 231                         /*
 232                          * When running in secure mode, do not allow opens
 233                          * for writing of /dev/mem, /dev/kmem, or character
 234                          * devices whose corresponding block devices are
 235                          * currently mounted.
 236                          */
 237                         if (securelevel >= 1) {
 238                                 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
 239                                         return (error);
 240                                 if (iskmemdev(dev))
 241                                         return (EPERM);
 242                         }
 243                 }
 244                 if (cdevsw[maj].d_type == D_TTY) {
 245                         vnode_lock(vp);
 246                         vp->v_flag |= VISTTY;
 247                         vnode_unlock(vp);
 248                 }
 249                 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
 250                 return (error);
 251
 252         case VBLK:
 253                 if ((u_int)maj >= (u_int)nblkdev)
 254                         return (ENXIO);
 255                 /*
 256                  * When running in very secure mode, do not allow
 257                  * opens for writing of any disk block devices.
 258                  */
 259                 if (securelevel >= 2 && cred != FSCRED &&
 260                     (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
 261                         return (EPERM);
 262                 /*
 263                  * Do not allow opens of block devices that are
 264                  * currently mounted.
 265                  */
 266                 if ( (error = vfs_mountedon(vp)) )
 267                         return (error);
 268                 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
 269                 if (!error) {
 270                     u_int64_t blkcnt;
 271                     u_int32_t blksize;
 272                         int setsize = 0;
 273                         u_int32_t size512 = 512;
 274
 275
 276                     if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
 277                                 /* Switch to 512 byte sectors (temporarily) */
 278
 279                                 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
 280                                 /* Get the number of 512 byte physical blocks. */
 281                                 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
 282                                                 setsize = 1;
 283                                 }
 284                                 }
 285                                 /* If it doesn't set back, we can't recover */
 286                                 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
 287                                 error = ENXIO;
 288                     }
 289
 290
 291                         vnode_lock(vp);
 292                     set_blocksize(vp, dev);
 293
 294                     /*
 295                      * Cache the size in bytes of the block device for later
 296                      * use by spec_write().
 297                      */
 298                         if (setsize)
 299                                 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
 300                         else
 301                         vp->v_specdevsize = (u_int64_t)0;       /* Default: Can't get */
 302
 303                         vnode_unlock(vp);
 304
 305                 }
 306                 return(error);
 307         default:
 308                 panic("spec_open type");
 309         }
 310         return (0);
 311 }
 312
 313 /*
 314  * Vnode op for read
 315  */
 316 int
 317 spec_read(struct vnop_read_args *ap)
 318 {
 319         struct vnode *vp = ap->a_vp;
 320         struct uio *uio = ap->a_uio;
 321         struct buf *bp;
 322         daddr64_t bn, nextbn;
 323         long bsize, bscale;
 324         int devBlockSize=0;
 325         int n, on;
 326         int error = 0;
 327         dev_t dev;
 328
 329 #if DIAGNOSTIC
 330         if (uio->uio_rw != UIO_READ)
 331                 panic("spec_read mode");
 332         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 333                 panic("spec_read proc");
 334 #endif
 335         if (uio_resid(uio) == 0)
 336                 return (0);
 337
 338         switch (vp->v_type) {
 339
 340         case VCHR:
 341                 error = (*cdevsw[major(vp->v_rdev)].d_read)
 342                         (vp->v_rdev, uio, ap->a_ioflag);
 343                 return (error);
 344
 345         case VBLK:
 346                 if (uio->uio_offset < 0)
 347                         return (EINVAL);
 348
 349                 dev = vp->v_rdev;
 350
 351                 devBlockSize = vp->v_specsize;
 352
 353                 if (devBlockSize > PAGE_SIZE)
 354                         return (EINVAL);
 355
 356                 bscale = PAGE_SIZE / devBlockSize;
 357                 bsize = bscale * devBlockSize;
 358
 359                 do {
 360                         on = uio->uio_offset % bsize;
 361
 362                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
 363
 364                         if (vp->v_speclastr + bscale == bn) {
 365                                 nextbn = bn + bscale;
 366                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
 367                                                (int *)&bsize, 1, NOCRED, &bp);
 368                         } else
 369                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
 370
 371                         vnode_lock(vp);
 372                         vp->v_speclastr = bn;
 373                         vnode_unlock(vp);
 374
 375                         n = bsize - buf_resid(bp);
 376                         if ((on > n) || error) {
 377                                 if (!error)
 378                                         error = EINVAL;
 379                                 buf_brelse(bp);
 380                                 return (error);
 381                         }
 382                         // LP64todo - fix this!
 383                         n = min((unsigned)(n  - on), uio_resid(uio));
 384
 385                         error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
 386                         if (n + on == bsize)
 387                                 buf_markaged(bp);
 388                         buf_brelse(bp);
 389                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 390                 return (error);
 391
 392         default:
 393                 panic("spec_read type");
 394         }
 395         /* NOTREACHED */
 396
 397         return (0);
 398 }
 399
 400 /*
 401  * Vnode op for write
 402  */
 403 int
 404 spec_write(struct vnop_write_args *ap)
 405 {
 406         struct vnode *vp = ap->a_vp;
 407         struct uio *uio = ap->a_uio;
 408         struct buf *bp;
 409         daddr64_t bn;
 410         int bsize, blkmask, bscale;
 411         int io_sync;
 412         int io_size;
 413         int devBlockSize=0;
 414         int n, on;
 415         int error = 0;
 416         dev_t dev;
 417
 418 #if DIAGNOSTIC
 419         if (uio->uio_rw != UIO_WRITE)
 420                 panic("spec_write mode");
 421         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
 422                 panic("spec_write proc");
 423 #endif
 424
 425         switch (vp->v_type) {
 426
 427         case VCHR:
 428                 error = (*cdevsw[major(vp->v_rdev)].d_write)
 429                         (vp->v_rdev, uio, ap->a_ioflag);
 430                 return (error);
 431
 432         case VBLK:
 433                 if (uio_resid(uio) == 0)
 434                         return (0);
 435                 if (uio->uio_offset < 0)
 436                         return (EINVAL);
 437
 438                 io_sync = (ap->a_ioflag & IO_SYNC);
 439                 // LP64todo - fix this!
 440                 io_size = uio_resid(uio);
 441
 442                 dev = (vp->v_rdev);
 443
 444                 devBlockSize = vp->v_specsize;
 445                 if (devBlockSize > PAGE_SIZE)
 446                         return(EINVAL);
 447
 448                 bscale = PAGE_SIZE / devBlockSize;
 449                 blkmask = bscale - 1;
 450                 bsize = bscale * devBlockSize;
 451
 452
 453                 do {
 454                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
 455                         on = uio->uio_offset % bsize;
 456
 457                         // LP64todo - fix this!
 458                         n = min((unsigned)(bsize - on), uio_resid(uio));
 459
 460                         /*
 461                          * Use buf_getblk() as an optimization IFF:
 462                          *
 463                          * 1)   We are reading exactly a block on a block
 464                          *      aligned boundary
 465                          * 2)   We know the size of the device from spec_open
 466                          * 3)   The read doesn't span the end of the device
 467                          *
 468                          * Otherwise, we fall back on buf_bread().
 469                          */
 470                         if (n == bsize &&
 471                             vp->v_specdevsize != (u_int64_t)0 &&
 472                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
 473                             /* reduce the size of the read to what is there */
 474                             n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
 475                         }
 476
 477                         if (n == bsize)
 478                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
 479                         else
 480                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
 481
 482                         /* Translate downstream error for upstream, if needed */
 483                         if (!error)
 484                                 error = (int)buf_error(bp);
 485                         if (error) {
 486                                 buf_brelse(bp);
 487                                 return (error);
 488                         }
 489                         n = min(n, bsize - buf_resid(bp));
 490
 491                         error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
 492                         if (error) {
 493                                 buf_brelse(bp);
 494                                 return (error);
 495                         }
 496                         buf_markaged(bp);
 497
 498                         if (io_sync)
 499                                 error = buf_bwrite(bp);
 500                         else {
 501                                 if ((n + on) == bsize)
 502                                         error = buf_bawrite(bp);
 503                                 else
 504                                         error = buf_bdwrite(bp);
 505                         }
 506                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
 507                 return (error);
 508
 509         default:
 510                 panic("spec_write type");
 511         }
 512         /* NOTREACHED */
 513
 514         return (0);
 515 }
 516
 517 /*
 518  * Device ioctl operation.
 519  */
 520 int
 521 spec_ioctl(struct vnop_ioctl_args *ap)
 522 {
 523         proc_t p = vfs_context_proc(ap->a_context);
 524         dev_t dev = ap->a_vp->v_rdev;
 525
 526         switch (ap->a_vp->v_type) {
 527
 528         case VCHR:
 529                 return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 530                     ap->a_fflag, p));
 531
 532         case VBLK:
 533                 if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) {
 534                         if (bdevsw[major(dev)].d_type == D_TAPE)
 535                                 return (0);
 536                         else
 537                                 return (1);
 538                 }
 539                 return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
 540                    ap->a_fflag, p));
 541
 542         default:
 543                 panic("spec_ioctl");
 544                 /* NOTREACHED */
 545         }
 546         return (0);
 547 }
 548
 549 int
 550 spec_select(struct vnop_select_args *ap)
 551 {
 552         proc_t p = vfs_context_proc(ap->a_context);
 553         dev_t dev;
 554
 555         switch (ap->a_vp->v_type) {
 556
 557         default:
 558                 return (1);             /* XXX */
 559
 560         case VCHR:
 561                 dev = ap->a_vp->v_rdev;
 562                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
 563         }
 564 }
 565
 566 /*
 567  * Synch buffers associated with a block device
 568  */
 569 int
 570 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
 571 {
 572         if (vp->v_type == VCHR)
 573                 return (0);
 574         /*
 575          * Flush all dirty buffers associated with a block device.
 576          */
 577         buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync");
 578
 579         return (0);
 580 }
 581
 582 int
 583 spec_fsync(struct vnop_fsync_args *ap)
 584 {
 585         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
 586 }
 587
 588 /*
 589  * Just call the device strategy routine
 590  */
 591 extern int hard_throttle_on_root;
 592 void IOSleep(int);
 593
 594 // the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond
 595 #define LOWPRI_INITIAL_WINDOW_MSECS 100
 596 #define LOWPRI_WINDOW_MSECS_INC 50
 597 #define LOWPRI_MAX_WINDOW_MSECS 200
 598 #define LOWPRI_MAX_WAITING_MSECS 200
 599 #define LOWPRI_SLEEP_INTERVAL 5
 600
 601 struct _throttle_io_info_t {
 602         struct timeval  last_normal_IO_timestamp;
 603         struct timeval  last_IO_timestamp;
 604         SInt32 numthreads_throttling;
 605 };
 606
 607 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
 608 int     lowpri_IO_initial_window_msecs  = LOWPRI_INITIAL_WINDOW_MSECS;
 609 int     lowpri_IO_window_msecs_inc  = LOWPRI_WINDOW_MSECS_INC;
 610 int     lowpri_max_window_msecs  = LOWPRI_MAX_WINDOW_MSECS;
 611 int     lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
 612
 613 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
 614 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
 615 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
 616 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
 617
 618 void
 619 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
 620 {
 621         size_t devbsdunit;
 622
 623         devbsdunit = mp->mnt_devbsdunit;
 624
 625         if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
 626                 *tv = _throttle_io_info[devbsdunit].last_IO_timestamp;
 627         } else {
 628                 memset(tv, 0, sizeof(*tv));
 629         }
 630 }
 631
 632 void
 633 update_last_io_time(mount_t mp)
 634 {
 635         size_t devbsdunit;
 636
 637         devbsdunit = mp->mnt_devbsdunit;
 638
 639         if (devbsdunit < LOWPRI_MAX_NUM_DEV) {
 640                 microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
 641         }
 642 }
 643
 644 int throttle_io_will_be_throttled(int lowpri_window_msecs, size_t devbsdunit)
 645 {
 646         struct timeval elapsed;
 647         int elapsed_msecs;
 648
 649         microuptime(&elapsed);
 650         timevalsub(&elapsed, &_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
 651         elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
 652
 653         if (lowpri_window_msecs == -1) // use the max waiting time
 654                 lowpri_window_msecs = lowpri_max_waiting_msecs;
 655
 656         return elapsed_msecs < lowpri_window_msecs;
 657 }
 658
 659 void throttle_lowpri_io(boolean_t ok_to_sleep)
 660 {
 661         int i;
 662         int max_try_num;
 663         struct uthread *ut;
 664
 665         ut = get_bsdthread_info(current_thread());
 666
 667         if (ut->uu_lowpri_window == 0)
 668                 return;
 669
 670         max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, _throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
 671
 672         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
 673                      ut->uu_lowpri_window, 0, 0, 0, 0);
 674
 675         if (ok_to_sleep == TRUE) {
 676                 for (i=0; i<max_try_num; i++) {
 677                         if (throttle_io_will_be_throttled(ut->uu_lowpri_window, ut->uu_devbsdunit)) {
 678                                 IOSleep(LOWPRI_SLEEP_INTERVAL);
 679                         } else {
 680                                 break;
 681                         }
 682                 }
 683         }
 684         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
 685                      ut->uu_lowpri_window, i*5, 0, 0, 0);
 686         SInt32 oldValue;
 687         oldValue = OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
 688         ut->uu_lowpri_window = 0;
 689
 690         if (oldValue <= 0) {
 691                 panic("%s: numthreads negative", __func__);
 692         }
 693 }
 694
 695 int throttle_get_io_policy(struct uthread **ut)
 696 {
 697         int policy = IOPOL_DEFAULT;
 698         proc_t p = current_proc();
 699
 700         *ut = get_bsdthread_info(current_thread());
 701
 702         if (p != NULL)
 703                 policy = p->p_iopol_disk;
 704
 705         if (*ut != NULL) {
 706                 // the I/O policy of the thread overrides that of the process
 707                 // unless the I/O policy of the thread is default
 708                 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
 709                         policy = (*ut)->uu_iopol_disk;
 710         }
 711         return policy;
 712 }
 713
 714 int
 715 spec_strategy(struct vnop_strategy_args *ap)
 716 {
 717         buf_t   bp;
 718         int     bflags;
 719         dev_t   bdev;
 720
 721         bp = ap->a_bp;
 722         bdev = buf_device(bp);
 723         bflags = buf_flags(bp);
 724
 725         if (kdebug_enable) {
 726                 int    code = 0;
 727
 728                 if (bflags & B_READ)
 729                         code |= DKIO_READ;
 730                 if (bflags & B_ASYNC)
 731                         code |= DKIO_ASYNC;
 732
 733                 if (bflags & B_META)
 734                         code |= DKIO_META;
 735                 else if (bflags & B_PAGEIO)
 736                         code |= DKIO_PAGING;
 737
 738                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
 739                                       (unsigned int)bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
 740         }
 741         if (((bflags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
 742             (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV))
 743                 hard_throttle_on_root = 1;
 744
 745         if (lowpri_IO_initial_window_msecs) {
 746                 struct uthread  *ut;
 747                 int policy;
 748                 int is_throttleable_io = 0;
 749                 int is_passive_io = 0;
 750                 size_t devbsdunit;
 751                 SInt32 oldValue;
 752
 753                 policy = throttle_get_io_policy(&ut);
 754
 755                 switch (policy) {
 756                 case IOPOL_DEFAULT:
 757                 case IOPOL_NORMAL:
 758                         break;
 759                 case IOPOL_THROTTLE:
 760                         is_throttleable_io = 1;
 761                         break;
 762                 case IOPOL_PASSIVE:
 763                         is_passive_io = 1;
 764                         break;
 765                 default:
 766                         printf("unknown I/O policy %d", policy);
 767                         break;
 768                 }
 769
 770                 if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
 771                     is_passive_io |= 1;
 772
 773                 if (buf_vnode(bp)->v_mount != NULL)
 774                         devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
 775                 else
 776                         devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
 777                 if (!is_throttleable_io) {
 778                         if (!is_passive_io){
 779                                 microuptime(&_throttle_io_info[devbsdunit].last_normal_IO_timestamp);
 780                         }
 781                 } else {
 782                         /*
 783                          * I'd really like to do the IOSleep here, but
 784                          * we may be holding all kinds of filesystem related locks
 785                          * and the pages for this I/O marked 'busy'...
 786                          * we don't want to cause a normal task to block on
 787                          * one of these locks while we're throttling a task marked
 788                          * for low priority I/O... we'll mark the uthread and
 789                          * do the delay just before we return from the system
 790                          * call that triggered this I/O or from vnode_pagein
 791                          */
 792                         if (ut->uu_lowpri_window == 0) {
 793                                 ut->uu_devbsdunit = devbsdunit;
 794                                 oldValue = OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
 795                                 if (oldValue < 0) {
 796                                         panic("%s: numthreads negative", __func__);
 797                                 }
 798                                 ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
 799                                 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
 800                         } else {
 801                                 if (ut->uu_devbsdunit != devbsdunit) { // the thread sends I/Os to different devices within the same system call
 802                                         // keep track of the numthreads in the right device
 803                                         OSDecrementAtomic(&_throttle_io_info[ut->uu_devbsdunit].numthreads_throttling);
 804                                         OSIncrementAtomic(&_throttle_io_info[devbsdunit].numthreads_throttling);
 805                                         ut->uu_devbsdunit = devbsdunit;
 806                                 }
 807                                 int numthreads = MAX(1, _throttle_io_info[devbsdunit].numthreads_throttling);
 808                                 ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
 809                                 if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
 810                                         ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
 811                         }
 812                 }
 813         }
 814
 815         if ((bflags & B_READ) == 0) {
 816                 size_t devbsdunit;
 817
 818                 if (buf_vnode(bp)->v_mount != NULL)
 819                         devbsdunit = buf_vnode(bp)->v_mount->mnt_devbsdunit;
 820                 else
 821                         devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
 822
 823                 microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp);
 824         }
 825
 826         (*bdevsw[major(bdev)].d_strategy)(bp);
 827
 828         return (0);
 829 }
 830
 831
 832 /*
 833  * This is a noop, simply returning what one has been given.
 834  */
 835 int
 836 spec_blockmap(__unused struct vnop_blockmap_args *ap)
 837 {
 838         return (ENOTSUP);
 839 }
 840
 841
 842 /*
 843  * Device close routine
 844  */
 845 int
 846 spec_close(struct vnop_close_args *ap)
 847 {
 848         struct vnode *vp = ap->a_vp;
 849         dev_t dev = vp->v_rdev;
 850         int (*devclose)(dev_t, int, int, struct proc *);
 851         int mode, error;
 852         int flags = ap->a_fflag;
 853         struct proc *p = vfs_context_proc(ap->a_context);
 854         struct session *sessp;
 855
 856         switch (vp->v_type) {
 857
 858         case VCHR:
 859                 /*
 860                  * Hack: a tty device that is a controlling terminal
 861                  * has a reference from the session structure.
 862                  * We cannot easily tell that a character device is
 863                  * a controlling terminal, unless it is the closing
 864                  * process' controlling terminal.  In that case,
 865                  * if the reference count is 2 (this last descriptor
 866                  * plus the session), release the reference from the session.
 867                  */
 868                 sessp = proc_session(p);
 869                 if (sessp != SESSION_NULL) {
 870                         if ((vcount(vp) == 2) &&
 871                                 (vp == sessp->s_ttyvp)) {
 872                                 session_lock(sessp);
 873                                 sessp->s_ttyvp = NULL;
 874                                 sessp->s_ttyvid = 0;
 875                                 sessp->s_ttyp = NULL;
 876                                 sessp->s_ttypgrpid = NO_PID;
 877                                 session_unlock(sessp);
 878                                 vnode_rele(vp);
 879                         }
 880                         session_rele(sessp);
 881                 }
 882
 883                 devclose = cdevsw[major(dev)].d_close;
 884                 mode = S_IFCHR;
 885                 /*
 886                  * close on last reference or on vnode revoke call
 887                  */
 888                 if ((flags & IO_REVOKE) != 0)
 889                         break;
 890                 if (vcount(vp) > 1)
 891                         return (0);
 892                 break;
 893
 894         case VBLK:
 895 #ifdef DEVFS_IMPLEMENTS_LOCKING
 896                 /*
 897                  * On last close of a block device (that isn't mounted)
 898                  * we must invalidate any in core blocks, so that
 899                  * we can, for instance, change floppy disks.
 900                  */
 901                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
 902                         return (error);
 903
 904                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 905                 if (error)
 906                         return (error);
 907                 /*
 908                  * Since every use (buffer, vnode, swap, blockmap)
 909                  * holds a reference to the vnode, and because we mark
 910                  * any other vnodes that alias this device, when the
 911                  * sum of the reference counts on all the aliased
 912                  * vnodes descends to one, we are on last close.
 913                  */
 914                 if (vcount(vp) > 0)
 915                         return (0);
 916 #else /* DEVFS_IMPLEMENTS_LOCKING */
 917                 /*
 918                  * Since every use (buffer, vnode, swap, blockmap)
 919                  * holds a reference to the vnode, and because we mark
 920                  * any other vnodes that alias this device, when the
 921                  * sum of the reference counts on all the aliased
 922                  * vnodes descends to one, we are on last close.
 923                  */
 924                 if (vcount(vp) > 0)
 925                         return (0);
 926
 927                 /*
 928                  * On last close of a block device (that isn't mounted)
 929                  * we must invalidate any in core blocks, so that
 930                  * we can, for instance, change floppy disks.
 931                  */
 932                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
 933                         return (error);
 934
 935                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 936                 if (error)
 937                         return (error);
 938 #endif /* DEVFS_IMPLEMENTS_LOCKING */
 939                 devclose = bdevsw[major(dev)].d_close;
 940                 mode = S_IFBLK;
 941                 break;
 942
 943         default:
 944                 panic("spec_close: not special");
 945                 return(EBADF);
 946         }
 947
 948         return ((*devclose)(dev, flags, mode, p));
 949 }
 950
 951 /*
 952  * Return POSIX pathconf information applicable to special devices.
 953  */
 954 int
 955 spec_pathconf(struct vnop_pathconf_args *ap)
 956 {
 957
 958         switch (ap->a_name) {
 959         case _PC_LINK_MAX:
 960                 *ap->a_retval = LINK_MAX;
 961                 return (0);
 962         case _PC_MAX_CANON:
 963                 *ap->a_retval = MAX_CANON;
 964                 return (0);
 965         case _PC_MAX_INPUT:
 966                 *ap->a_retval = MAX_INPUT;
 967                 return (0);
 968         case _PC_PIPE_BUF:
 969                 *ap->a_retval = PIPE_BUF;
 970                 return (0);
 971         case _PC_CHOWN_RESTRICTED:
 972                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
 973                 return (0);
 974         case _PC_VDISABLE:
 975                 *ap->a_retval = _POSIX_VDISABLE;
 976                 return (0);
 977         default:
 978                 return (EINVAL);
 979         }
 980         /* NOTREACHED */
 981 }
 982
 983 /*
 984  * Special device failed operation
 985  */
 986 int
 987 spec_ebadf(__unused void *dummy)
 988 {
 989
 990         return (EBADF);
 991 }
 992
 993 /* Blktooff derives file offset from logical block number */
 994 int
 995 spec_blktooff(struct vnop_blktooff_args *ap)
 996 {
 997         struct vnode *vp = ap->a_vp;
 998
 999         switch (vp->v_type) {
1000         case VCHR:
1001                 *ap->a_offset = (off_t)-1; /* failure */
1002                 return (ENOTSUP);
1003
1004         case VBLK:
1005                 printf("spec_blktooff: not implemented for VBLK\n");
1006                 *ap->a_offset = (off_t)-1; /* failure */
1007                 return (ENOTSUP);
1008
1009         default:
1010                 panic("spec_blktooff type");
1011         }
1012         /* NOTREACHED */
1013
1014         return (0);
1015 }
1016
1017 /* Offtoblk derives logical block number from file offset */
1018 int
1019 spec_offtoblk(struct vnop_offtoblk_args *ap)
1020 {
1021         struct vnode *vp = ap->a_vp;
1022
1023         switch (vp->v_type) {
1024         case VCHR:
1025                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1026                 return (ENOTSUP);
1027
1028         case VBLK:
1029                 printf("spec_offtoblk: not implemented for VBLK\n");
1030                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
1031                 return (ENOTSUP);
1032
1033         default:
1034                 panic("spec_offtoblk type");
1035         }
1036         /* NOTREACHED */
1037
1038         return (0);
1039 }