/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
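
/*
 * The CL_* values above are bit flags interpreted by cluster_io() in its
 * 'flags' argument; the page-level entry points below (cluster_pageout()
 * and cluster_pagein()) translate their UPL_* request flags into this set
 * before calling it.
 */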
struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
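
/*
 * A 'struct clios' tracks a stream of asynchronous cluster_io() requests.
 * As an illustrative sketch (roughly the pattern cluster_nocopy_write()
 * uses further below):
 *
 *	struct clios iostate;
 *
 *	iostate.io_completed = 0;
 *	iostate.io_issued    = 0;
 *	iostate.io_error     = 0;
 *	iostate.io_wanted    = 0;
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *			   devblocksize, CL_ASYNC | io_flags,
 *			   (struct buf *)0, &iostate);
 *
 *	while (iostate.io_issued != iostate.io_completed) {
 *		iostate.io_wanted = 1;
 *		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "caller", 0);
 *	}
 *
 * cluster_iodone() adds each completed transaction to io_completed, records
 * the first failure in io_error, and wakes any thread sleeping on io_wanted.
 */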
static void	cluster_zero(upl_t upl, vm_offset_t upl_offset,
			int size, struct buf *bp);
static int	cluster_read_x(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_write_x(struct vnode *vp, struct uio *uio,
			off_t oldEOF, off_t newEOF, off_t headOff,
			off_t tailOff, int devblocksize, int flags);
static int	cluster_nocopy_read(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_nocopy_write(struct vnode *vp, struct uio *uio,
			off_t newEOF, int devblocksize, int flags);
static int	cluster_phys_read(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_phys_write(struct vnode *vp, struct uio *uio,
			off_t newEOF, int devblocksize, int flags);
static int	cluster_align_phys_io(struct vnode *vp, struct uio *uio,
			addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int	cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int	cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int	sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int	sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int	sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE		18
#define HARD_THROTTLE_MAXCNT	1
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
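
/*
 * ASYNC_THROTTLE is the normal per-vnode limit applied by cluster_io() when
 * CL_THROTTLE is set; when cluster_hard_throttle_on() reports recent priority
 * I/O against the root device, cluster_io() instead limits itself to
 * HARD_THROTTLE_MAXCNT outstanding I/Os of at most HARD_THROTTLE_MAXSIZE
 * bytes each.  cluster_iodone() wakes throttled writers once v_numoutput
 * drops to ASYNC_THROTTLE / 3.
 */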
static int
cluster_hard_throttle_on(vp)
	struct vnode *vp;
{
	static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
static int
cluster_iodone(bp)
	struct buf *bp;
{
	struct buf   *cbp_head;
	struct buf   *cbp_next;
	struct clios *iostate;

	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
			return 0;
		}
	}
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if ((cbp->b_flags & B_ERROR) && error == 0)
		error = cbp->b_error;

	total_resid += cbp->b_resid;
	total_size  += cbp->b_bcount;

	cbp_next = cbp->b_trans_next;

	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	/*
	 * someone has issued multiple I/Os asynchronously
	 * and is waiting for them to complete (streaming)
	 */
	if (error && iostate->io_error == 0)
		iostate->io_error = error;

	iostate->io_completed += total_size;

	if (iostate->io_wanted) {
		/*
		 * someone is waiting for the state of
		 * this io stream to change
		 */
		iostate->io_wanted = 0;
		wakeup((caddr_t)&iostate->io_wanted);
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		real_bp->b_flags |= B_ERROR;
		real_bp->b_error  = error;

		real_bp->b_resid  = total_resid;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if (b_flags & B_PHYS) {
				if (b_flags & B_READ)
					upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
			} else if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

			upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);

	return (error);
}
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t		upl;
	vm_offset_t	upl_offset;
	int		size;
	struct buf     *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START, upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {

		pl = ubc_upl_pageinfo(upl);

		page_index  = upl_offset / PAGE_SIZE;
		page_offset = upl_offset & PAGE_MASK;

		zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
		zero_cnt  = min(PAGE_SIZE - page_offset, size);

		bzero_phys(zero_addr, zero_cnt);

		upl_offset += zero_cnt;
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END, upl_offset, size, 0, 0, 0);
}
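
/*
 * cluster_zero() has two paths: when no kernel mapping is available
 * (bp == NULL or bp->b_data == NULL) it zeroes the UPL's physical pages
 * directly via upl_phys_page()/bzero_phys(), one page fragment at a time;
 * otherwise it simply bzero()'s within bp->b_data at the requested offset.
 */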
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
	vm_offset_t	upl_offset;
	int		non_rounded_size;
	struct clios   *iostate;
{
	struct buf *cbp_head = 0;
	struct buf *cbp_tail = 0;

	size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);

	size = non_rounded_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	/*
	 * make sure the maximum iosize is at least the size of a page
	 * and that it is a multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = ASYNC_THROTTLE;
	}
	io_flags |= B_NOCACHE;

	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;
	if (flags & CL_PAGEOUT)
		io_flags |= B_PAGEOUT;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file)
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	if (size > max_iosize)
		io_size = max_iosize;

	if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
		if (error == EOPNOTSUPP)
			panic("VOP_CMAP Unimplemented");
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE, (int)f_offset, (int)blkno, io_size, zero_offset, 0);

	if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
		if (flags & CL_PAGEOUT) {
			/*
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (it could
			 * be mapped in a "hole" and require allocation)
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
			}
		}
		f_offset   += PAGE_SIZE_64;
		upl_offset += PAGE_SIZE;
	}
	lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
	/*
	 * we have now figured out how much I/O we can do - this is in 'io_size'
	 * pg_offset is the starting point in the first page for the I/O
	 * pg_count is the number of full and partial pages that 'io_size' encompasses
	 */
	pg_offset = upl_offset & PAGE_MASK;

	if (flags & CL_DEV_MEMORY) {
		/*
		 * currently, can't deal with reading 'holes' in file
		 */
		if ((long)blkno == -1) {
		}
		/*
		 * treat physical requests as one 'giant' page
		 */
	}
	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

	if ((flags & CL_READ) && (long)blkno == -1) {
		/*
		 * if we're reading and blkno == -1, then we've got a
		 * 'hole' in the file that we need to deal with by zeroing
		 * out the affected area in the upl
		 */
		if (zero_offset && io_size == size) {
			/*
			 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
			 * then 'zero_offset' will be non-zero
			 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
			 * (indicated by the io_size finishing off the I/O request for this UPL)
			 * then we're not going to issue an I/O for the
			 * last page in this upl... we need to zero both the hole and the tail
			 * of the page beyond the EOF, since the delayed zero-fill won't kick in
			 */
			bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
		} else
			bytes_to_zero = io_size;

		cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

		if (cbp_head) {
			/*
			 * if there is a current I/O chain pending
			 * then the first page of the group we just zero'd
			 * will be handled by the I/O completion if the zero
			 * fill started in the middle of the page
			 */
			pg_count = (io_size - pg_offset) / PAGE_SIZE;
		} else {
			/*
			 * no pending I/O to pick up that first page
			 * so, we have to make sure it gets committed...
			 * set the pg_offset to 0 so that the upl_commit_range
			 * starts with this page
			 */
			pg_count = (io_size + pg_offset) / PAGE_SIZE;
		}
		if (io_size == size && ((upl_offset + io_size) & PAGE_MASK)) {
			/*
			 * if we're done with the request for this UPL
			 * then we have to make sure to commit the last page
			 * even if we only partially zero-filled it
			 */
		}
		pg_resid = PAGE_SIZE - pg_offset;

		if (flags & CL_COMMIT)
			ubc_upl_commit_range(upl,
				(upl_offset + pg_resid) & ~PAGE_MASK,
				pg_count * PAGE_SIZE,
				UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);

		upl_offset += io_size;

		if (cbp_head && pg_count)

	} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
		real_bp->b_blkno = blkno;
	}
	if (pg_count > max_vectors) {
		io_size -= (pg_count - max_vectors) * PAGE_SIZE;

		io_size  = PAGE_SIZE - pg_offset;

		pg_count = max_vectors;
	}
	if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
		/*
		 * if we're not targeting a virtual device i.e. a disk image
		 * it's safe to dip into the reserve pool since real devices
		 * can complete this I/O request without requiring additional
		 * bufs from the alloc_io_buf pool
		 */
		priv = 1;
	else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
		/*
		 * Throttle the speculative IO
		 */
		priv = 0;

	cbp = alloc_io_buf(vp, priv);

	if (flags & CL_PAGEOUT) {
		for (i = 0; i < pg_count; i++) {
			if (bp = incore(vp, lblkno + i)) {
				if (!ISSET(bp->b_flags, B_BUSY)) {
					SET(bp->b_flags, (B_BUSY | B_INVAL));
				} else
					panic("BUSY bp found in cluster_io");
			}
		}
	}
	if (flags & CL_ASYNC) {
		cbp->b_flags |= (B_CALL | B_ASYNC);
		cbp->b_iodone = (void *)cluster_iodone;
	}
	cbp->b_flags |= io_flags;

	cbp->b_lblkno     = lblkno;
	cbp->b_blkno      = blkno;
	cbp->b_bcount     = io_size;
	cbp->b_pagelist   = upl;
	cbp->b_uploffset  = upl_offset;
	cbp->b_trans_next = (struct buf *)0;

	if (cbp->b_iostate = (void *)iostate)
		/*
		 * caller wants to track the state of this
		 * io... bump the amount issued against this stream
		 */
		iostate->io_issued += io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

	cbp_tail->b_trans_next = cbp;

	(struct buf *)(cbp->b_trans_head) = cbp_head;

	upl_offset += io_size;

	if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
		/*
		 * if we have no more I/O to issue or
		 * the current I/O we've prepared fully
		 * completes the last page in this request
		 * and it's either an ASYNC request or
		 * we've already accumulated more than 8 I/O's into
		 * this transaction and it's not an I/O directed to
		 * special DEVICE memory
		 * then go ahead and issue the I/O
		 */
		cbp_head->b_flags |= B_NEED_IODONE;
		cbp_head->b_real_bp = real_bp;

		cbp_head->b_real_bp = (struct buf *)NULL;

		/*
		 * we're about to issue the last I/O for this upl
		 * if this was a read to the eof and the eof doesn't
		 * finish on a page boundary, then we need to zero-fill
		 * the rest of the page....
		 */
		cbp_head->b_validend = zero_offset;

		cbp_head->b_validend = 0;

		if (flags & CL_THROTTLE) {
			while (vp->v_numoutput >= async_throttle) {
				vp->v_flag |= VTHROTTLED;
				tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
			}
		}
		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			if (io_flags & B_WRITEINPROG)
				cbp->b_vp->v_numoutput++;

			cbp_next = cbp->b_trans_next;

			(void) VOP_STRATEGY(cbp);
		}
		if ( !(flags & CL_ASYNC)) {
			for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
				biowait(cbp);

			if (error = cluster_iodone(cbp_head)) {
				if ((flags & CL_PAGEOUT) && (error == ENXIO))
					retval = 0;	/* drop the error */
			}
		}
		cbp_head = (struct buf *)0;
		cbp_tail = (struct buf *)0;
	}
	for (cbp = cbp_head; cbp;) {
		struct buf * cbp_next;

		upl_offset -= cbp->b_bcount;
		size       += cbp->b_bcount;
		io_size    += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;
	}
	/*
	 * update the error condition for this stream
	 * since we never really issued the io
	 * just go ahead and adjust it back
	 */
	if (iostate->io_error == 0)
		iostate->io_error = error;
	iostate->io_issued -= io_size;

	if (iostate->io_wanted) {
		/*
		 * someone is waiting for the state of
		 * this io stream to change
		 */
		iostate->io_wanted = 0;
		wakeup((caddr_t)&iostate->io_wanted);
	}
	pg_offset  = upl_offset & PAGE_MASK;
	abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (flags & CL_COMMIT) {
		if (flags & CL_PRESERVE) {
			ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
					     UPL_COMMIT_FREE_ON_EMPTY);
		} else {
			if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (flags & CL_PAGEIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
					    upl_abort_code);
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
			     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
	}
	real_bp->b_flags |= B_ERROR;
	real_bp->b_error  = error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
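
/*
 * cluster_io() carves a request into device-sized chunks, allocates a
 * struct buf for each chunk via alloc_io_buf(), and links the bufs through
 * b_trans_next with b_trans_head pointing at the head of the chain.
 * cluster_iodone() (above) only processes the transaction once every buf
 * in the chain has been marked B_DONE.
 */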
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
{
	int pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);

	size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
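
/*
 * cluster_rd_ahead() (below) derives the read-ahead window from the last
 * read (v_lastr), the current read-ahead length (v_ralen) and the furthest
 * page already prefetched (v_maxra), doubling v_ralen up to MAX_UPL_TRANSFER
 * pages while the access pattern remains sequential, and then hands the
 * window to cluster_rd_prefetch().
 */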
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
{
	int size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}
	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
	    (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
		return;
	}
	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

		if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
			vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
}
int
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	vm_offset_t	upl_offset;
{
	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	}
	/*
	 * can't page-out from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	}
	max_size = filesize - f_offset;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	vp->v_flag |= VHASBEENPAGED;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			   local_flags, (struct buf *)0, (struct clios *)0));
}
int
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	vm_offset_t	upl_offset;
{
	int local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			    local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

	b_lblkno = (int)(f_offset / PAGE_SIZE_64);
	e_lblkno = (int)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

	if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
		/*
		 * we haven't read the last page in of the file yet
		 * so let's try to read ahead if we're in
		 * a sequential access pattern
		 */
		cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
	}
	vp->v_lastr = e_lblkno;

	return (retval);
}
int
cluster_bp(bp)
	struct buf *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_pagelist == (upl_t) 0)
		panic("cluster_bp: can't handle NULL upl yet\n");
	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
}
int
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
{
	if (vp->v_flag & VHASBEENPAGED) {
		/*
		 * this vnode had pages cleaned to it by
		 * the pager which indicates that either
		 * it's not very 'hot', or the system is
		 * being overwhelmed by a lot of dirty
		 * data being delayed in the VM cache...
		 * in either event, we'll push our remaining
		 * delayed data at this point...  this will
		 * be more efficient than paging out 1 page at
		 * a time, and will also act as a throttle
		 * by delaying this client from writing any
		 * more data until all his delayed data has
		 * at least been queued to the underlying driver.
		 */
		vp->v_flag &= ~VHASBEENPAGED;
	}
	if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
	}
	while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0) {
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		while (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			iov = uio->uio_iov;
		}
		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
		}
		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {
			if (flags & IO_HEADZEROFILL) {
				flags &= ~IO_HEADZEROFILL;

				if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
					return(retval);
			}
			retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

			if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
			}
		} else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
		} else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		}
	}
	return (retval);
}
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	int		 force_data_sync;
	struct clios	 iostate;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio->uio_resid,
		     (int)newEOF, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iostate.io_completed = 0;
	iostate.io_issued    = 0;
	iostate.io_error     = 0;
	iostate.io_wanted    = 0;

	while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
		io_size = uio->uio_resid;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * cluster_nocopy_write: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_writes;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_writes;
		}
		io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
	}

wait_for_writes:
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
	}
	if (iostate.io_error)
		error = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}
static int
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	io_size = iov->iov_len;
	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_write: failed to get pagelist
		 * note: return kret here
		 */
	}
	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size) {
		kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	}
	pl = ubc_upl_pageinfo(upl);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	/*
	 * issue a synchronous write to cluster_io
	 */
	error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
			   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);

	/*
	 * The cluster_io write completed successfully,
	 * update the uio structure
	 */
	uio->uio_resid  -= io_size;
	iov->iov_len    -= io_size;
	iov->iov_base   += io_size;
	uio->uio_offset += io_size;
	src_paddr       += io_size;

	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);

	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
1601 cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, devblocksize
, flags
)
1611 upl_page_info_t
*pl
;
1613 vm_offset_t upl_offset
;
1627 long long total_size
;
1630 long long zero_cnt1
;
1632 daddr_t start_blkno
;
1638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1639 (int)uio
->uio_offset
, uio
->uio_resid
, (int)oldEOF
, (int)newEOF
, 0);
1641 uio_resid
= uio
->uio_resid
;
1643 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1644 0, 0, (int)oldEOF
, (int)newEOF
, 0);
1651 if (flags
& IO_HEADZEROFILL
) {
1653 * some filesystems (HFS is one) don't support unallocated holes within a file...
1654 * so we zero fill the intervening space between the old EOF and the offset
1655 * where the next chunk of real data begins.... ftruncate will also use this
1656 * routine to zero fill to the new EOF when growing a file... in this case, the
1657 * uio structure will not be provided
1660 if (headOff
< uio
->uio_offset
) {
1661 zero_cnt
= uio
->uio_offset
- headOff
;
1664 } else if (headOff
< newEOF
) {
1665 zero_cnt
= newEOF
- headOff
;
1669 if (flags
& IO_TAILZEROFILL
) {
1671 zero_off1
= uio
->uio_offset
+ uio
->uio_resid
;
1673 if (zero_off1
< tailOff
)
1674 zero_cnt1
= tailOff
- zero_off1
;
1677 if (zero_cnt
== 0 && uio
== (struct uio
*) 0) {
1678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
1679 retval
, 0, 0, 0, 0);
1683 while ((total_size
= (uio_resid
+ zero_cnt
+ zero_cnt1
)) && retval
== 0) {
1685 * for this iteration of the loop, figure out where our starting point is
1688 start_offset
= (int)(zero_off
& PAGE_MASK_64
);
1689 upl_f_offset
= zero_off
- start_offset
;
1690 } else if (uio_resid
) {
1691 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1692 upl_f_offset
= uio
->uio_offset
- start_offset
;
1694 start_offset
= (int)(zero_off1
& PAGE_MASK_64
);
1695 upl_f_offset
= zero_off1
- start_offset
;
1697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
,
1698 (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0);
1700 if (total_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1701 total_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1703 start_blkno
= (daddr_t
)(upl_f_offset
/ PAGE_SIZE_64
);
1705 if (uio
&& !(vp
->v_flag
& VNOCACHE_DATA
) &&
1706 (flags
& (IO_SYNC
| IO_HEADZEROFILL
| IO_TAILZEROFILL
)) == 0) {
1708 * assumption... total_size <= uio_resid
1709 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1711 if ((start_offset
+ total_size
) > (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1712 total_size
-= start_offset
;
1713 xfer_resid
= total_size
;
1715 retval
= cluster_copy_ubc_data(vp
, uio
, &xfer_resid
, 1);
1720 uio_resid
-= (total_size
- xfer_resid
);
1721 total_size
= xfer_resid
;
1722 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1723 upl_f_offset
= uio
->uio_offset
- start_offset
;
1725 if (total_size
== 0) {
1728 * the write did not finish on a page boundary
1729 * which will leave upl_f_offset pointing to the
1730 * beginning of the last page written instead of
1731 * the page beyond it... bump it in this case
1732 * so that the cluster code records the last page
1735 upl_f_offset
+= PAGE_SIZE_64
;
1743 * compute the size of the upl needed to encompass
1744 * the requested write... limit each call to cluster_io
1745 * to the maximum UPL size... cluster_io will clip if
1746 * this exceeds the maximum io_size for the device,
1747 * make sure to account for
1748 * a starting offset that's not page aligned
1750 upl_size
= (start_offset
+ total_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1752 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1753 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1755 pages_in_upl
= upl_size
/ PAGE_SIZE
;
1756 io_size
= upl_size
- start_offset
;
1758 if ((long long)io_size
> total_size
)
1759 io_size
= total_size
;
1761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
1764 kret
= ubc_create_upl(vp
,
1770 if (kret
!= KERN_SUCCESS
)
1771 panic("cluster_write: failed to get pagelist");
1773 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
1774 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
1776 if (start_offset
&& !upl_valid_page(pl
, 0)) {
1780 * we're starting in the middle of the first page of the upl
1781 * and the page isn't currently valid, so we're going to have
1782 * to read it in first... this is a synchronous operation
1784 read_size
= PAGE_SIZE
;
1786 if ((upl_f_offset
+ read_size
) > newEOF
)
1787 read_size
= newEOF
- upl_f_offset
;
1789 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
, devblocksize
,
1790 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1793 * we had an error during the read which causes us to abort
1794 * the current cluster_write request... before we do, we need
1795 * to release the rest of the pages in the upl without modifying
1796 * there state and mark the failed page in error
1798 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1799 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1801 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1802 (int)upl
, 0, 0, retval
, 0);
1806 if ((start_offset
== 0 || upl_size
> PAGE_SIZE
) && ((start_offset
+ io_size
) & PAGE_MASK
)) {
1808 * the last offset we're writing to in this upl does not end on a page
1809 * boundary... if it's not beyond the old EOF, then we'll also need to
1810 * pre-read this page in if it isn't already valid
1812 upl_offset
= upl_size
- PAGE_SIZE
;
1814 if ((upl_f_offset
+ start_offset
+ io_size
) < oldEOF
&&
1815 !upl_valid_page(pl
, upl_offset
/ PAGE_SIZE
)) {
1818 read_size
= PAGE_SIZE
;
1820 if ((upl_f_offset
+ upl_offset
+ read_size
) > newEOF
)
1821 read_size
= newEOF
- (upl_f_offset
+ upl_offset
);
1823 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, read_size
, devblocksize
,
1824 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1827 * we had an error during the read which causes us to abort
1828 * the current cluster_write request... before we do, we
1829 * need to release the rest of the pages in the upl without
1830 * modifying there state and mark the failed page in error
1832 ubc_upl_abort_range(upl
, upl_offset
, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1833 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1835 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1836 (int)upl
, 0, 0, retval
, 0);
1841 xfer_resid
= io_size
;
1842 io_offset
= start_offset
;
1844 while (zero_cnt
&& xfer_resid
) {
1846 if (zero_cnt
< (long long)xfer_resid
)
1847 bytes_to_zero
= zero_cnt
;
1849 bytes_to_zero
= xfer_resid
;
1851 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1852 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1856 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off
& PAGE_MASK_64
));
1857 zero_pg_index
= (int)((zero_off
- upl_f_offset
) / PAGE_SIZE_64
);
1859 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1860 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1862 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1863 !upl_dirty_page(pl
, zero_pg_index
)) {
1864 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1867 xfer_resid
-= bytes_to_zero
;
1868 zero_cnt
-= bytes_to_zero
;
1869 zero_off
+= bytes_to_zero
;
1870 io_offset
+= bytes_to_zero
;
1872 if (xfer_resid
&& uio_resid
) {
1873 bytes_to_move
= min(uio_resid
, xfer_resid
);
1875 retval
= cluster_copy_upl_data(uio
, upl
, io_offset
, bytes_to_move
);
1879 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1882 (int)upl
, 0, 0, retval
, 0);
1884 uio_resid
-= bytes_to_move
;
1885 xfer_resid
-= bytes_to_move
;
1886 io_offset
+= bytes_to_move
;
1889 while (xfer_resid
&& zero_cnt1
&& retval
== 0) {
1891 if (zero_cnt1
< (long long)xfer_resid
)
1892 bytes_to_zero
= zero_cnt1
;
1894 bytes_to_zero
= xfer_resid
;
1896 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1897 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1901 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off1
& PAGE_MASK_64
));
1902 zero_pg_index
= (int)((zero_off1
- upl_f_offset
) / PAGE_SIZE_64
);
1904 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1905 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1906 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1907 !upl_dirty_page(pl
, zero_pg_index
)) {
1908 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1911 xfer_resid
-= bytes_to_zero
;
1912 zero_cnt1
-= bytes_to_zero
;
1913 zero_off1
+= bytes_to_zero
;
1914 io_offset
+= bytes_to_zero
;
1921 io_size
+= start_offset
;
1923 if ((upl_f_offset
+ io_size
) >= newEOF
&& io_size
< upl_size
) {
1925 * if we're extending the file with this write
1926 * we'll zero fill the rest of the page so that
1927 * if the file gets extended again in such a way as to leave a
1928 * hole starting at this EOF, we'll have zero's in the correct spot
1930 cluster_zero(upl
, io_size
, upl_size
- io_size
, NULL
);
1932 if (flags
& IO_SYNC
)
1934 * if the IO_SYNC flag is set than we need to
1935 * bypass any clusters and immediately issue
1941 * calculate the last logical block number
1942 * that this delayed I/O encompassed
1944 last_blkno
= (upl_f_offset
+ (off_t
)upl_size
) / PAGE_SIZE_64
;
1946 if (vp
->v_flag
& VHASDIRTY
) {
1948 if ( !(vp
->v_flag
& VNOCACHE_DATA
)) {
1950 * we've fallen into the sparse
1951 * cluster method of delaying dirty pages
1952 * first, we need to release the upl if we hold one
1953 * since pages in it may be present in the sparse cluster map
1954 * and may span 2 separate buckets there... if they do and
1955 * we happen to have to flush a bucket to make room and it intersects
1956 * this upl, a deadlock may result on page BUSY
1959 ubc_upl_commit_range(upl
, 0, upl_size
,
1960 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1962 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
1967 * must have done cached writes that fell into
1968 * the sparse cluster mechanism... we've switched
1969 * to uncached writes on the file, so go ahead
1970 * and push whatever's in the sparse map
1971 * and switch back to normal clustering
1973 * see the comment above concerning a possible deadlock...
1976 ubc_upl_commit_range(upl
, 0, upl_size
,
1977 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1979 * setting upl_size to 0 keeps us from committing a
1980 * second time in the start_new_cluster path
1984 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
1987 * no clusters of either type present at this point
1988 * so just go directly to start_new_cluster since
1989 * we know we need to delay this I/O since we've
1990 * already released the pages back into the cache
1991 * to avoid the deadlock with sparse_cluster_push
1993 goto start_new_cluster
;
1997 if (vp
->v_clen
== 0)
1999 * no clusters currently present
2001 goto start_new_cluster
;
2003 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
2005 * check each cluster that we currently hold
2006 * try to merge some or all of this write into
2007 * one or more of the existing clusters... if
2008 * any portion of the write remains, start a
2011 if (start_blkno
>= vp
->v_clusters
[cl_index
].start_pg
) {
2013 * the current write starts at or after the current cluster
2015 if (last_blkno
<= (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2017 * we have a write that fits entirely
2018 * within the existing cluster limits
2020 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
)
2022 * update our idea of where the cluster ends
2024 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2027 if (start_blkno
< (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2029 * we have a write that starts in the middle of the current cluster
2030 * but extends beyond the cluster's limit... we know this because
2031 * of the previous checks
2032 * we'll extend the current cluster to the max
2033 * and update the start_blkno for the current write to reflect that
2034 * the head of it was absorbed into this cluster...
2035 * note that we'll always have a leftover tail in this case since
2036 * full absorbtion would have occurred in the clause above
2038 vp
->v_clusters
[cl_index
].last_pg
= vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
;
2041 int start_pg_in_upl
;
2043 start_pg_in_upl
= upl_f_offset
/ PAGE_SIZE_64
;
2045 if (start_pg_in_upl
< vp
->v_clusters
[cl_index
].last_pg
) {
2046 intersection
= (vp
->v_clusters
[cl_index
].last_pg
- start_pg_in_upl
) * PAGE_SIZE
;
2048 ubc_upl_commit_range(upl
, upl_offset
, intersection
,
2049 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2050 upl_f_offset
+= intersection
;
2051 upl_offset
+= intersection
;
			upl_size -= intersection;

			start_blkno = vp->v_clusters[cl_index].last_pg;

			/*
			 * we come here for the case where the current write starts beyond the limit
			 * of the existing cluster or we have a leftover tail after a partial absorption
			 *
			 * in either case, we'll check the remaining clusters before starting a new one
			 */

			/*
			 * the current write starts in front of the cluster we're currently considering
			 */
			if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
				/*
				 * we can just merge the new request into this cluster and leave it
				 * in the cache since the resulting cluster is still less than the
				 * maximum allowable size
				 */
				vp->v_clusters[cl_index].start_pg = start_blkno;

				if (last_blkno > vp->v_clusters[cl_index].last_pg) {
					/*
					 * the current write completely envelops the existing cluster and since
					 * each write is limited to at most MAX_UPL_TRANSFER bytes we can just use
					 * the start and last blocknos of the write to generate the cluster limits
					 */
					vp->v_clusters[cl_index].last_pg = last_blkno;
				}
			}
			/*
			 * if we were to combine this write with the current cluster we would exceed
			 * the cluster size limit.... so, let's see if there's any overlap of the new
			 * I/O with the cluster we're currently considering... in fact, we'll stretch
			 * the cluster out to its full limit and see if we get an intersection with
			 * the current write
			 */
			if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
				/*
				 * the current write extends into the proposed cluster...
				 * clip the length of the current write after first combining its
				 * tail with the newly shaped cluster
				 */
				vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;

				intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;

				if (intersection > upl_size)
					/*
					 * because the current write may consist of a number of pages found in
					 * the cache which are not part of the UPL, we may have an intersection
					 * that exceeds the size of the UPL that is also part of this write
					 */
					intersection = upl_size;

				ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
						     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
				upl_size -= intersection;

				last_blkno = vp->v_clusters[cl_index].start_pg;
			}
			/*
			 * if we get here, there was no way to merge any portion of this write with
			 * this cluster or we could only merge part of it which will leave a tail...
			 * we'll check the remaining clusters before starting a new one
			 */

		if (cl_index < vp->v_clen)
			/*
			 * we found an existing cluster(s) that we
			 * could entirely merge this I/O into
			 */

		if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
			/*
			 * we didn't find an existing cluster to
			 * merge into, but there's room to start a new one
			 */
			goto start_new_cluster;

		/*
		 * no existing cluster to merge with and no room to start a new one... we'll
		 * try pushing one of the existing ones... if none of them are able to be
		 * pushed, we'll switch to the sparse cluster mechanism...
		 * cluster_try_push updates v_clen to the number of remaining clusters
		 * and returns the number of currently unused clusters
		 */
		if (vp->v_flag & VNOCACHE_DATA)

		if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
			/*
			 * no more room in the normal cluster mechanism so let's switch to the
			 * more expansive but expensive sparse mechanism....
			 * first, we need to release the upl if we hold one since pages in it may
			 * be present in the sparse cluster map (after the cluster_switch) and may
			 * span 2 separate buckets there... if they do and we happen to have to
			 * flush a bucket to make room and it intersects this upl, a deadlock may
			 * result on page BUSY
			 */
			ubc_upl_commit_range(upl, upl_offset, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

			sparse_cluster_switch(vp, newEOF);
			sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
		}
		/*
		 * we pushed one cluster successfully, so we must be sequentially writing this
		 * file, otherwise we would have failed and fallen into the sparse cluster support...
		 * so let's take the opportunity to push out additional clusters as long as we
		 * remain below the throttle... this will give us better I/O locality if we're
		 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
		 * however, we don't want to push so much out that the write throttle kicks in and
		 * hangs this thread up until some of the I/O completes...
		 */
		while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
			cluster_try_push(vp, newEOF, 0, 0);

start_new_cluster:
		if (vp->v_clen == 0)
			vp->v_ciosiz = devblocksize;

		vp->v_clusters[vp->v_clen].start_pg = start_blkno;
		vp->v_clusters[vp->v_clen].last_pg  = last_blkno;

		ubc_upl_commit_range(upl, upl_offset, upl_size,
				     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

		/*
		 * in order to maintain some semblance of coherency with mapped writes
		 * we need to write the cluster back out as a multiple of the PAGESIZE
		 * unless the cluster encompasses the last page of the file... in this
		 * case we'll round out to the nearest device block boundary
		 */
		if ((upl_f_offset + io_size) > newEOF) {
			io_size = newEOF - upl_f_offset;
			io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
		}

		if (flags & IO_SYNC)
			io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
		else
			io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;

		if (vp->v_flag & VNOCACHE_DATA)
			io_flags |= CL_DUMP;

		retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
				    io_flags, (struct buf *)0, (struct clios *)0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
		     retval, 0, uio_resid, 0, 0);
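/*
 * The rounding above assumes devblocksize is a power of two: adding
 * (devblocksize - 1) and masking off the low-order bits rounds io_size up to
 * the next device block boundary.  For example, with a (hypothetical)
 * 512-byte device block, io_size = 1000 becomes (1000 + 511) & ~511 == 1024,
 * i.e. two full device blocks.
 */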
cluster_read(vp, uio, filesize, devblocksize, flags)

	if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
		/*
		 * go do a read through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   the uio request doesn't target USERSPACE
		 */
		return (cluster_read_x(vp, uio, filesize, devblocksize, flags));

	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)

		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		while (iov->iov_len == 0) {

		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
			/*
			 * the user app must have passed in an invalid address
			 */

		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG)
			retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
		else if (uio->uio_resid < PAGE_SIZE)
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a read through the cache if
			 * the total xfer size is less than a page...
			 */
			return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
		else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))

			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
				/*
				 * Bring the file offset read up to a pagesize boundary; this will also
				 * bring the base address to a page boundary since they both are currently
				 * on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_read_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

				/*
				 * can't get both the file offset and the buffer offset aligned to a
				 * page boundary, so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary
			 */
			max_io_size = filesize - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = (int)max_io_size;

			if (clip_size < PAGE_SIZE)
				/*
				 * Take care of the tail end of the read in this vector.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
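/*
 * A condensed sketch of the alignment decision made above (illustrative only;
 * the helper and return codes below are assumptions, not part of this file).
 * The nocopy path is only worth taking when both the file offset and the
 * user buffer can be brought onto page boundaries together.
 */
#if 0
static int
read_strategy_example(off_t uio_offset, long iov_base, int resid)
{
	int foff = (int)uio_offset & PAGE_MASK;	/* offset within the file's page */
	int boff = (int)iov_base   & PAGE_MASK;	/* offset within the user buffer's page */

	if (resid < PAGE_SIZE)
		return (READ_THROUGH_CACHE);	/* too small to bother with nocopy */
	if (foff == 0 && boff == 0)
		return (READ_NOCOPY);		/* fully aligned: go straight to the user's pages */
	if (foff == boff)
		return (READ_CLIP_TO_PAGE);	/* one clipped cached read re-aligns both offsets */
	return (READ_WHOLE_VECTOR_CACHED);	/* can never align: keep this vector in the cache path */
}
#endif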
cluster_read_x(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	off_t            last_ioread_offset;
	off_t            last_request_offset;
	u_int            size_of_prefetch;
	struct clios     iostate;
	u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	u_int            rd_ahead_enabled = 1;
	u_int            prefetch_enabled = 1;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

	if (cluster_hard_throttle_on(vp)) {
		rd_ahead_enabled = 0;
		prefetch_enabled = 0;

		max_rd_size = HARD_THROTTLE_MAXSIZE;
	}
	if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
		rd_ahead_enabled = 0;

	last_request_offset = uio->uio_offset + uio->uio_resid;

	if (last_request_offset > filesize)
		last_request_offset = filesize;
	b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
	e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);

	if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read system call that was issued... if so, pick up its extent to
		 * determine where we should start with respect to any read-ahead that might
		 * be necessary to garner all the data needed to complete this read system call
		 */
		last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset)
			last_ioread_offset = (off_t)0;
		else if (last_ioread_offset > last_request_offset)
			last_ioread_offset = last_request_offset;
	} else
		last_ioread_offset = (off_t)0;

	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass the requested read...
		 * limit each call to cluster_io to the maximum UPL size... cluster_io will
		 * clip if this exceeds the maximum io_size for the device,
		 * make sure to account for a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;
		max_size     = filesize - uio->uio_offset;

		if ((off_t)((unsigned int)uio->uio_resid) < max_size)
			io_size = uio->uio_resid;

		if (!(vp->v_flag & VNOCACHE_DATA)) {
			/*
			 * if we keep finding the pages we need already in the cache, then don't
			 * bother to call cluster_rd_prefetch since it costs CPU cycles to determine
			 * that we have all the pages we need... once we miss in the cache and have
			 * issued an I/O, then we'll assume that we're likely to continue to miss in
			 * the cache and it's to our advantage to try and prefetch
			 */
			if (last_request_offset && last_ioread_offset &&
			    (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
				if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
					/*
					 * we've already issued I/O for this request and there's still work
					 * to do and our prefetch stream is running dry, so issue a pre-fetch
					 * I/O... the I/O latency will overlap with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}
			}
			/*
			 * limit the size of the copy we're about to do so that we can notice that
			 * our I/O pipe is running dry and get the next I/O issued before it does go dry
			 */
			if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
				io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);

			io_requested = io_resid;

			retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);

			io_size -= (io_requested - io_resid);

			if (retval || io_resid)
				/*
				 * if we run into a real error or a page that is not in the cache
				 * we need to leave streaming mode
				 */
				break;

			if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
				/*
				 * we're already finished the I/O for this read request
				 * let's see if we should do a read-ahead
				 */
				cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
			}
			if (e_lblkno < vp->v_lastr)
				vp->v_lastr = e_lblkno;
		}
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;
		max_size     = filesize - uio->uio_offset;

		if (io_size > max_rd_size)
			io_size = max_rd_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
			upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,

		if (kret != KERN_SUCCESS)
			panic("cluster_read: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first non-valid page....
		 * this will become the first page in the request we're going to make to
		 * 'cluster_io'... if all of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}
		/*
		 * scan from the starting invalid page looking for a valid page before the
		 * end of the upl is reached, if we find one, then it will be the last page
		 * of the request to 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}
		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled...
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
		}
		/*
		 * if the read completed successfully, or there was no I/O request issued,
		 * then copy the data into user land via 'cluster_upl_copy_data'...
		 * we'll first add on any 'valid' pages that were present in the upl when we acquired it.
		 */
		for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
			if (!upl_valid_page(pl, uio_last))
				break;
		}
		/*
		 * compute size to transfer this round, if uio->uio_resid is
		 * still non-zero after this attempt, we'll loop around and
		 * set up for another I/O.
		 */
		val_size = (uio_last * PAGE_SIZE) - start_offset;

		if (val_size > max_size)
			val_size = max_size;

		if (val_size > uio->uio_resid)
			val_size = uio->uio_resid;

		if (last_ioread_offset == 0)
			last_ioread_offset = uio->uio_offset + val_size;

		if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
			/*
			 * if there's still I/O left to do for this request, and...
			 * we're not in hard throttle mode, then issue a pre-fetch I/O...
			 * the I/O latency will overlap with the copying of the data
			 */
			size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);

			last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

			if (last_ioread_offset > last_request_offset)
				last_ioread_offset = last_request_offset;

		} else if ((uio->uio_offset + val_size) == last_request_offset) {
			/*
			 * this transfer will finish this request, so...
			 * let's try to read ahead if we're in a sequential access pattern
			 * and we haven't explicitly disabled it
			 */
			if (rd_ahead_enabled)
				cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);

			if (e_lblkno < vp->v_lastr)
				vp->v_lastr = e_lblkno;
		}
		while (iostate.io_issued != iostate.io_completed) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
		}
		if (iostate.io_error)
			error = iostate.io_error;

		retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);

		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				     (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (vp->v_flag & VNOCACHE_DATA))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else
				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
						     UPL_COMMIT_CLEAR_DIRTY |
						     UPL_COMMIT_FREE_ON_EMPTY |
						     UPL_COMMIT_INACTIVATE);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
				     (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying their state
			 */
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				     (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

			/*
			 * we found some already valid pages at the beginning of the upl...
			 * commit these back to the inactive list with reference cleared
			 */
			for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
				commit_flags = UPL_COMMIT_FREE_ON_EMPTY
					     | UPL_COMMIT_INACTIVATE;

				if (upl_dirty_page(pl, cur_pg))
					commit_flags |= UPL_COMMIT_SET_DIRTY;

				if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
					ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
							    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
				else
					ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
							     PAGE_SIZE, commit_flags);
			}
			if (last_pg < uio_last) {
				/*
				 * we found some already valid pages immediately after the
				 * pages we issued I/O for, commit these back to the
				 * inactive list with reference cleared
				 */
				for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
					commit_flags = UPL_COMMIT_FREE_ON_EMPTY
						     | UPL_COMMIT_INACTIVATE;

					if (upl_dirty_page(pl, cur_pg))
						commit_flags |= UPL_COMMIT_SET_DIRTY;

					if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
						ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
								    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
					else
						ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
								     PAGE_SIZE, commit_flags);
				}
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
				     (int)upl, -1, -1, 0, 0);
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
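/*
 * The iostate/tsleep sequence used above is the common pattern in this file
 * for draining async cluster_io requests: the issuer sets io_wanted and
 * sleeps until the completion side has advanced io_completed to match
 * io_issued, then picks up any error out of io_error.  A minimal sketch of
 * that wait loop (field names come from struct clios; the helper itself and
 * its name are illustrative only):
 */
#if 0
static int
wait_for_cluster_io(struct clios *iostate)
{
	while (iostate->io_issued != iostate->io_completed) {
		iostate->io_wanted = 1;
		tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "cluster_io_wait", 0);
	}
	return (iostate->io_error);
}
#endif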
cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	int              force_data_sync;
	struct clios     iostate;
	u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
	u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	if (cluster_hard_throttle_on(vp)) {
		max_rd_size  = HARD_THROTTLE_MAXSIZE;
		max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
	}
	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {

		max_io_size = filesize - uio->uio_offset;

		if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
			io_size = max_io_size;
		else
			io_size = uio->uio_resid;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.
		 */
		retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		/*
		 * If we are already finished with this read, then return
		 */
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		max_io_size = io_size;

		if (max_io_size > max_rd_size)
			max_io_size = max_rd_size;

		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * cluster_nocopy_read: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_reads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_reads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			goto wait_for_reads;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O...
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error...
			 * don't issue any more reads, cleanup the UPL that was just created
			 * but not used, then go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_reads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				    io_size, devblocksize,
				    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
				    (struct buf *)0, &iostate);

		/*
		 * update the uio structure
		 */
		iov->iov_base   += io_size;
		iov->iov_len    -= io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
	}

wait_for_reads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
	}
	if (iostate.io_error)
		retval = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
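/*
 * upl_offset/upl_needed_size above follow the usual sub-page bookkeeping:
 * upl_offset is where the user buffer starts within its first page, and
 * upl_needed_size rounds (upl_offset + io_size) up to whole pages so the UPL
 * covers every page the transfer touches.  For example (illustrative values,
 * 4K pages): an iov_base ending in 0x600 with io_size 0x2000 gives
 * upl_offset 0x600 and upl_needed_size (0x600 + 0x2000 + 0xfff) & ~0xfff == 0x3000,
 * i.e. three pages.
 */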
cluster_phys_read(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	struct clios     iostate;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	max_size = filesize - uio->uio_offset;

	if (max_size > (off_t)((unsigned int)iov->iov_len))
		io_size = iov->iov_len;

	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_read: failed to get pagelist
		 */
	}
	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	}
	pl = ubc_upl_pageinfo(upl);

	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);

			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

		upl_offset += head_size;
		dst_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	while (io_size && error == 0) {

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			xsize = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since it's all issued against the same UPL...
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
		}

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
				   CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
				   (struct buf *)0, &iostate);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		uio->uio_resid  -= xsize;
		iov->iov_len    -= xsize;
		iov->iov_base   += xsize;
		uio->uio_offset += xsize;

		upl_offset += xsize;
	}
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
	}
	if (iostate.io_error) {
		error = iostate.io_error;
	}
	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);

	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
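/*
 * The head/tail handling above splits a physically contiguous transfer into
 * (1) an unaligned head up to the next devblocksize boundary, handled by
 * cluster_align_phys_io, (2) a block-aligned middle issued directly as
 * CL_DEV_MEMORY I/O, and (3) an unaligned tail, again via cluster_align_phys_io.
 * With a (hypothetical) 512-byte device block, an offset of 0x2ff and a
 * length of 0x1000 would give head_size 0x101 (reaching offset 0x400),
 * tail_size 0xff, and a 0xe00-byte aligned middle issued as device-memory I/O.
 */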
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
advisory_read(vp, filesize, f_offset, resid, devblocksize)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	if (!UBCINFOEXISTS(vp))

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
		     (int)f_offset, resid, (int)filesize, devblocksize, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass the requested read...
		 * limit each call to cluster_io to the maximum UPL size... cluster_io will
		 * clip if this exceeds the maximum io_size for the device,
		 * make sure to account for a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size     = filesize - f_offset;

		if (resid < max_size)

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;

			if (skip_range == upl_size)
				continue;
			/*
			 * have to issue some real I/O...
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			upl_f_offset += skip_range;
			upl_size     -= skip_range;

		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,
				      UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
		if (kret != KERN_SUCCESS)

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg))
				break;
		}
		pages_in_upl = last_pg + 1;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		for (last_pg = 0; last_pg < pages_in_upl; ) {
			/*
			 * scan from the beginning of the upl looking for the first page that is
			 * present.... this will become the first page in the request we're going
			 * to make to 'cluster_io'... if all of the pages are absent, we won't call
			 * through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg))
					break;
			}
			/*
			 * scan from the starting present page looking for an absent page before
			 * the end of the upl is reached, if we find one, then it will terminate
			 * the range of pages being presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg))
					break;
			}
			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled...
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size    = (last_pg - start_pg) * PAGE_SIZE;

				if ((upl_f_offset + upl_offset + io_size) > filesize)
					io_size = filesize - (upl_f_offset + upl_offset);

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
						    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
			}
		}
			ubc_upl_abort(upl, 0);

		io_size = upl_size - start_offset;

		if (io_size > resid)

		f_offset += io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
		     (int)f_offset, resid, retval, 0, 0);
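/*
 * Note on the loop above: ubc_range_op(UPL_ROP_PRESENT) is used purely as an
 * optimization so that advisory reads never build a UPL for ranges that are
 * already resident; only the absent remainder of each window is turned into
 * CL_READ | CL_ASYNC | CL_COMMIT | CL_AGE requests, and those pages are
 * committed straight back to the VM cache when the I/O completes.
 */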
	if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		sparse_cluster_push(vp, ubc_getsize(vp), 1);
	}
		retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		vfs_drt_control(&(vp->v_scmap), 0);

		vp->v_flag &= ~VHASDIRTY;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
cluster_try_push(vp, EOF, can_delay, push_all)

	struct v_cluster l_clusters[MAX_CLUSTERS];

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear vp->v_clen so that new clusters can be developed
	 */
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
			if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
				continue;
			if (min_index == -1)
				min_index = cl_index1;
			else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
				min_index = cl_index1;
		}
		if (min_index == -1)
			break;
		l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
		l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;

		vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
	}
	if (can_delay && cl_len == MAX_CLUSTERS) {
		/*
		 * determine if we appear to be writing the file sequentially...
		 * if not, by returning without having pushed any clusters we will cause this
		 * vnode to be pushed into the sparse cluster mechanism used for managing
		 * more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write
		 * doesn't fit into one of them... that's why we're in try_push with can_delay true...
		 *
		 * check to make sure that all the clusters except the last one are 'full'...
		 * and that each cluster is adjacent to the next (i.e. we're looking for
		 * sequential writes); they were sorted above so we can just make a simple
		 * pass through, up to but not including the last one...
		 * note that last_pg is not inclusive, so it will be equal to the start_pg
		 * of the next cluster if they are adjacent
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an
		 * I/O or 2 out of order... if this occurs at the tail of the last cluster,
		 * we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
				break;
			if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
				break;
		}
	}
	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		/*
		 * try to push each cluster in turn... cluster_push_x may not
		 * push the cluster if can_delay is TRUE and the cluster doesn't
		 * meet the criteria for an immediate push
		 */
		if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
			l_clusters[cl_index].start_pg = 0;
			l_clusters[cl_index].last_pg  = 0;
		}
	}
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * let's try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to push the old
			 * ones (I don't think this can happen because I'm holding the lock, but
			 * just in case)... the sum of the leftovers plus the new cluster count
			 * exceeds our ability to represent them, so switch to the sparse cluster mechanism
			 *
			 * first collect the new clusters sitting in the vp
			 */
			sparse_cluster_switch(vp, EOF);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;
				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(vp, EOF);

			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by vp->v_clen
			 */
			for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;

				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;
	}
	return(MAX_CLUSTERS - vp->v_clen);
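/*
 * The nested loops at the top of cluster_try_push perform a simple selection
 * sort: on each pass the remaining cluster with the smallest start_pg is
 * copied into l_clusters[] and its slot in vp->v_clusters[] is emptied by
 * setting start_pg == last_pg.  A stand-alone sketch of the same idea
 * (the helper name and array arguments are illustrative):
 */
#if 0
static void
selection_copy_example(struct v_cluster *src, struct v_cluster *dst, int n)
{
	int i, j, min;

	for (i = 0; i < n; i++) {
		for (min = -1, j = 0; j < n; j++) {
			if (src[j].start_pg == src[j].last_pg)
				continue;			/* empty slot */
			if (min == -1 || src[j].start_pg < src[min].start_pg)
				min = j;
		}
		if (min == -1)
			break;					/* nothing left to copy */
		dst[i] = src[min];
		src[min].start_pg = src[min].last_pg;		/* mark the source slot empty */
	}
}
#endif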
cluster_push_x(vp, EOF, first, last, can_delay)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
		     vp->v_clen, first, last, EOF, 0);

	if ((pages_in_upl = last - first) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = ((off_t)first) * PAGE_SIZE_64;

	if (upl_f_offset + upl_size >= EOF) {

		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
		}
		size = EOF - upl_f_offset;

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	if (vp->v_flag & VNOCACHE_DATA)
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	else
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

	kret = ubc_create_upl(vp,

	if (kret != KERN_SUCCESS)
		panic("cluster_push: failed to get pagelist");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg))
			break;
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
	}
	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))
				break;
			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
		}
		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */
			break;
		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))
				break;
		}
		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		if (vp->v_flag & VNOCACHE_DATA)
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
		else
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;

		cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
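/*
 * The scan above walks the UPL in two alternating passes: advance start_pg to
 * the next dirty page (aborting any present-but-clean "precious" pages that
 * UPL_RET_ONLY_DIRTY handed back), then extend last_pg across the contiguous
 * dirty run and hand that run to cluster_io.  Repeating until pages_in_upl is
 * exhausted pushes every dirty range in the cluster with at most one I/O per
 * contiguous run.
 */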
sparse_cluster_switch(struct vnode *vp, off_t EOF)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if ( !(vp->v_flag & VHASDIRTY)) {
		vp->v_flag |= VHASDIRTY;
	}
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {

		for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {

			if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY)
					sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
			}
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);

		vfs_drt_control(&(vp->v_scmap), 1);

		if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
			vp->v_flag &= ~VHASDIRTY;
		}
		first = (daddr_t)(offset / PAGE_SIZE_64);
		last  = (daddr_t)((offset + length) / PAGE_SIZE_64);

		cluster_push_x(vp, EOF, first, last, 0);

		vp->v_scdirty -= (last - first);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);

	offset = (off_t)first * PAGE_SIZE_64;
	length = (last - first) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		vp->v_scdirty += new_dirty;

		sparse_cluster_push(vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	vp->v_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
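/*
 * sparse_cluster_add works in bytes while its callers work in pages:
 * offset = first * PAGE_SIZE_64 and length = (last - first) * PAGE_SIZE.
 * When vfs_drt_mark_pages can only record a prefix of the range it reports
 * how many pages it did take in new_dirty, so advancing offset by
 * new_dirty * PAGE_SIZE_64 and shrinking length by new_dirty * PAGE_SIZE
 * retries exactly the unrecorded suffix after a push.
 */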
cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)

	upl_page_info_t *pl;

	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,

	if (kret != KERN_SUCCESS)

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   CL_READ, (struct buf *)0, (struct clios *)0);

			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE: There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);		/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);		/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   0, (struct buf *)0, (struct clios *)0);
	}
	uio->uio_offset += xsize;
	iov->iov_base   += xsize;
	iov->iov_len    -= xsize;
	uio->uio_resid  -= xsize;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)

	upl_page_info_t *pl;
	boolean_t        funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);

	if (xsize >= (16 * 1024))
		funnel_state = thread_funnel_set(kernel_flock, FALSE);

	segflg = uio->uio_segflg;

	switch (segflg) {

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		csize = min(PAGE_SIZE, xsize);
	}
	uio->uio_segflg = segflg;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)

	memory_object_control_t control;
	int       op_flags = UPL_POP_SET | UPL_POP_BUSY;
	boolean_t funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
	}
		op_flags |= UPL_POP_DIRTY;

	segflg = uio->uio_segflg;

	switch (segflg) {

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;
	}
	io_size      = *io_resid;
	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	f_offset     = uio->uio_offset - start_offset;
	xsize        = min(PAGE_SIZE - start_offset, io_size);

	while (io_size && retval == 0) {

		if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
			break;

		if (funnel_state == FALSE && io_size >= (16 * 1024))
			funnel_state = thread_funnel_set(kernel_flock, FALSE);

		retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);

		ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

		f_offset = uio->uio_offset;
		xsize    = min(PAGE_SIZE, io_size);
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
is_file_clean(struct vnode *vp, off_t filesize)

	int total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control =					\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control =					\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)								\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control = 0;				\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);					\
	} while (0)

/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	      &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	      (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};

#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length */
								/* 1 (clean, no map) */
								/* 2 (map alloc fail) */
								/* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets,
								 * lastclean, iskips */

static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
					     u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
static kern_return_t	vfs_drt_do_mark_pages(
static void		vfs_drt_trace(
			    struct vfs_drt_clustermap *cmap,
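/*
 * dhe_control packs two fields into one 64-bit word: the aligned file offset
 * of the bucket in the high bits (DRT_ADDRESS_MASK) and the dirty-page count
 * in the low bits (DRT_HASH_COUNT_MASK).  A small sketch of how the macros
 * above compose, using an assumed offset of 0x12345678 (the helper and its
 * values are illustrative only):
 */
#if 0
static void
drt_control_example(struct vfs_drt_clustermap *scm)
{
	u_int64_t offset = 0x12345678;			/* arbitrary file offset */

	DRT_HASH_SET_ADDRESS(scm, 0, offset);		/* stores DRT_ALIGN_ADDRESS(offset): 0x12300000 */
	DRT_HASH_SET_COUNT(scm, 0, 3);			/* low-order bits now hold the page count */

	/* DRT_HASH_GET_ADDRESS(scm, 0) == 0x12300000, DRT_HASH_GET_COUNT(scm, 0) == 3 */
}
#endif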
4197 * Allocate and initialise a sparse cluster map.
4199 * Will allocate a new map, resize or compact an existing map.
4201 * XXX we should probably have at least one intermediate map size,
4202 * as the 1:16 ratio seems a bit drastic.
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
    struct vfs_drt_clustermap *cmap, *ocmap;
    kern_return_t kret;
    u_int64_t offset;
    int nsize, i, active_buckets, index, copycount;

    ocmap = NULL;
    if (cmapp != NULL)
        ocmap = *cmapp;

    /*
     * Decide on the size of the new map.
     */
    if (ocmap == NULL) {
        nsize = DRT_HASH_SMALL_MODULUS;
    } else {
        /* count the number of active buckets in the old map */
        active_buckets = 0;
        for (i = 0; i < ocmap->scm_modulus; i++) {
            if (!DRT_HASH_VACANT(ocmap, i) &&
                (DRT_HASH_GET_COUNT(ocmap, i) != 0))
                active_buckets++;
        }
        /*
         * If we're currently using the small allocation, check to
         * see whether we should grow to the large one.
         */
        if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
            /* if the ring is nearly full */
            if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
                nsize = DRT_HASH_LARGE_MODULUS;
            } else {
                nsize = DRT_HASH_SMALL_MODULUS;
            }
        } else {
            /* already using the large modulus */
            nsize = DRT_HASH_LARGE_MODULUS;
            /*
             * If the ring is completely full, there's
             * nothing useful for us to do.  Behave as
             * though we had compacted into the new
             * array and return.
             */
            if (active_buckets >= DRT_HASH_LARGE_MODULUS)
                return(KERN_SUCCESS);
        }
    }

    /*
     * Allocate and initialise the new map.
     */
    kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
        (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
    if (kret != KERN_SUCCESS)
        return(kret);
    cmap->scm_magic = DRT_SCM_MAGIC;
    cmap->scm_modulus = nsize;
    cmap->scm_buckets = 0;
    cmap->scm_lastclean = 0;
    cmap->scm_iskips = 0;
    for (i = 0; i < cmap->scm_modulus; i++) {
        DRT_HASH_CLEAR(cmap, i);
        DRT_HASH_VACATE(cmap, i);
        DRT_BITVECTOR_CLEAR(cmap, i);
    }

    /*
     * If there's an old map, re-hash entries from it into the new map.
     */
    copycount = 0;
    if (ocmap != NULL) {
        for (i = 0; i < ocmap->scm_modulus; i++) {
            /* skip empty buckets */
            if (DRT_HASH_VACANT(ocmap, i) ||
                (DRT_HASH_GET_COUNT(ocmap, i) == 0))
                continue;
            /* find a slot for this entry in the new map */
            offset = DRT_HASH_GET_ADDRESS(ocmap, i);
            kret = vfs_drt_get_index(&cmap, offset, &index, 1);
            if (kret != KERN_SUCCESS) {
                /* XXX need to bail out gracefully here */
                panic("vfs_drt: new cluster map mysteriously too small");
            }
            /* copy the bucket across */
            DRT_HASH_COPY(ocmap, i, cmap, index);
            copycount++;
        }
    }

    /* log what we've done */
    vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

    /*
     * It's important to ensure that *cmapp always points to
     * a valid map, so we must overwrite it before freeing
     * the old map.
     */
    *cmapp = cmap;
    if (ocmap != NULL) {
        /* emit stats into trace buffer */
        vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
                      ocmap->scm_modulus,
                      ocmap->scm_buckets,
                      ocmap->scm_lastclean,
                      ocmap->scm_iskips);

        vfs_drt_free_map(ocmap);
    }
    return(KERN_SUCCESS);
}

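/*
 * Illustrative sketch (not compiled): the sizing decision made above, in
 * isolation.  It uses the DRT_HASH_*_MODULUS macros already referenced in
 * this file; the helper function itself is invented purely for the example.
 */
#if 0
static int
example_pick_modulus(struct vfs_drt_clustermap *ocmap, int active_buckets)
{
    if (ocmap == NULL)
        return (DRT_HASH_SMALL_MODULUS);        /* a fresh map starts small */

    if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
        /* grow only once the small ring is nearly full */
        return ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) ?
            DRT_HASH_LARGE_MODULUS : DRT_HASH_SMALL_MODULUS);
    }
    return (DRT_HASH_LARGE_MODULUS);            /* already large; stay large */
}
#endif
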
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
    kmem_free(kernel_map, (vm_offset_t)cmap,
        (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
    return(KERN_SUCCESS);
}

/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
    int index, i, tries;

    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* traverse the hashtable */
    for (i = 0; i < cmap->scm_modulus; i++) {

        /*
         * If the slot is vacant, we can stop.
         */
        if (DRT_HASH_VACANT(cmap, index))
            break;

        /*
         * If the address matches our offset, we have success.
         */
        if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
            *indexp = index;
            return(KERN_SUCCESS);
        }

        /*
         * Move to the next slot, try again.
         */
        index = DRT_HASH_NEXT(cmap, index);
    }
    return(KERN_FAILURE);
}

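/*
 * Illustrative sketch (not compiled): the lookup above is plain open
 * addressing with linear probing.  The model below uses stand-in types and
 * a caller-supplied table instead of the DRT_HASH_* macros, purely to show
 * the probe-and-wrap pattern in isolation.
 */
#if 0
struct example_slot {
    int       es_vacant;        /* stand-in for DRT_HASH_VACANT() */
    u_int64_t es_address;       /* stand-in for the slot address */
};

static int
example_search(struct example_slot *table, int modulus, u_int64_t key, int *indexp)
{
    int index, i;

    index = (int)(key % modulus);               /* stand-in for DRT_HASH() */
    for (i = 0; i < modulus; i++) {
        if (table[index].es_vacant)
            break;                              /* a hole ends the probe chain */
        if (table[index].es_address == key) {
            *indexp = index;
            return (1);                         /* found */
        }
        index = (index + 1) % modulus;          /* stand-in for DRT_HASH_NEXT() */
    }
    return (0);                                 /* not present */
}
#endif
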
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
    struct vfs_drt_clustermap *cmap;
    kern_return_t kret;
    int index, i;

    cmap = *cmapp;

    /* look for an existing entry */
    kret = vfs_drt_search_index(cmap, offset, indexp);
    if (kret == KERN_SUCCESS)
        return(kret);

    /* need to allocate an entry */
    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* scan from the index forwards looking for a vacant slot */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /* slot vacant? */
        if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
            cmap->scm_buckets++;
            if (index < cmap->scm_lastclean)
                cmap->scm_lastclean = index;
            DRT_HASH_SET_ADDRESS(cmap, index, offset);
            DRT_HASH_SET_COUNT(cmap, index, 0);
            DRT_BITVECTOR_CLEAR(cmap, index);
            *indexp = index;
            vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
            return(KERN_SUCCESS);
        }
        cmap->scm_iskips += i;
        index = DRT_HASH_NEXT(cmap, index);
    }

    /*
     * We haven't found a vacant slot, so the map is full.  If we're not
     * already recursed, try reallocating/compacting it.
     */
    if (recursed)
        return(KERN_FAILURE);
    kret = vfs_drt_alloc_map(cmapp);
    if (kret == KERN_SUCCESS) {
        /* now try to insert again */
        kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
    }
    return(kret);
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
    void      **private,
    u_int64_t   offset,
    u_int       length,
    int        *setcountp,
    int         dirty)
{
    struct vfs_drt_clustermap *cmap, **cmapp;
    kern_return_t kret;
    int i, index, pgoff, pgcount, setcount, ecount;

    cmapp = (struct vfs_drt_clustermap **)private;
    cmap = *cmapp;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

    if (setcountp != NULL)
        *setcountp = 0;

    /* allocate a cluster map if we don't already have one */
    if (cmap == NULL) {
        /* no cluster map, nothing to clean */
        if (!dirty) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
            return(KERN_SUCCESS);
        }
        kret = vfs_drt_alloc_map(cmapp);
        if (kret != KERN_SUCCESS) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
            return(kret);
        }
    }
    setcount = 0;

    /*
     * Iterate over the length of the region.
     */
    while (length > 0) {
        /*
         * Get the hashtable index for this offset.
         *
         * XXX this will add blank entries if we are clearing a range
         * that hasn't been dirtied.
         */
        kret = vfs_drt_get_index(cmapp, offset, &index, 0);
        cmap = *cmapp;  /* may have changed! */
        /* this may be a partial-success return */
        if (kret != KERN_SUCCESS) {
            if (setcountp != NULL)
                *setcountp = setcount;
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

            return(kret);
        }

        /*
         * Work out how many pages we're modifying in this
         * hashtable entry.
         */
        pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
        pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

        /*
         * Iterate over pages, dirty/clearing as we go.
         */
        ecount = DRT_HASH_GET_COUNT(cmap, index);
        for (i = 0; i < pgcount; i++) {
            if (dirty) {
                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                    ecount++;
                    setcount++;
                }
            } else {
                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                    ecount--;
                    setcount++;
                }
            }
        }
        DRT_HASH_SET_COUNT(cmap, index, ecount);

        offset += pgcount * PAGE_SIZE;
        length -= pgcount * PAGE_SIZE;
    }
    if (setcountp != NULL)
        *setcountp = setcount;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

    return(KERN_SUCCESS);
}

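/*
 * Illustrative sketch (not compiled): how the loop above splits a byte range
 * across hashtable entries.  Each entry covers DRT_BITVECTOR_PAGES pages
 * starting at a DRT_ALIGN_ADDRESS()-aligned offset; the page size, bitvector
 * size and alignment macro below are stand-in values for the example only.
 */
#if 0
#define EXAMPLE_PAGE_SIZE        4096   /* stand-in for PAGE_SIZE */
#define EXAMPLE_BITVECTOR_PAGES  256    /* stand-in for DRT_BITVECTOR_PAGES */
#define EXAMPLE_ALIGN(o) \
    ((o) & ~((u_int64_t)EXAMPLE_BITVECTOR_PAGES * EXAMPLE_PAGE_SIZE - 1))

static void
example_split_range(u_int64_t offset, u_int length)
{
    int pgoff, pgcount;

    while (length > 0) {
        /* page offset of 'offset' within its entry's bitvector */
        pgoff = (offset - EXAMPLE_ALIGN(offset)) / EXAMPLE_PAGE_SIZE;
        /* pages we can mark before running off the end of this entry */
        pgcount = min((length / EXAMPLE_PAGE_SIZE),
            (EXAMPLE_BITVECTOR_PAGES - pgoff));
        /* ... mark bits [pgoff, pgoff + pgcount) in this entry here ... */
        offset += pgcount * EXAMPLE_PAGE_SIZE;
        length -= pgcount * EXAMPLE_PAGE_SIZE;
    }
}
#endif
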
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
    /* XXX size unused, drop from interface */
    return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
    return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}

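/*
 * Illustrative sketch (not compiled): marking and unmarking pages with the
 * two entry points above.  The offsets and lengths are arbitrary example
 * values; 'private' is the opaque per-file map pointer, which must start
 * out NULL.
 */
#if 0
static void
example_mark_unmark(void)
{
    void *private = NULL;
    int   newly_dirty = 0;

    /* mark four pages dirty starting at an (example) 1MB file offset */
    (void) vfs_drt_mark_pages(&private, (off_t)1024 * 1024, 4 * PAGE_SIZE, &newly_dirty);

    /* later, mark the first two of those pages clean again */
    (void) vfs_drt_unmark_pages(&private, (off_t)1024 * 1024, 2 * PAGE_SIZE);
}
#endif
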
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
    struct vfs_drt_clustermap *cmap;
    u_int64_t offset;
    u_int     length;
    int       index, i, j, fs, ls;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return(KERN_FAILURE);
    cmap = *cmapp;

    /* walk the hashtable */
    for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
        index = DRT_HASH(cmap, offset);

        if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
            continue;

        /* scan the bitfield for a string of bits */
        fs = -1;

        for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                fs = i;
                break;
            }
        }
        if (fs == -1) {
            /* didn't find any bits set */
            panic("vfs_drt: entry summary count > 0 but no bits set in map");
        }
        for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
            if (!DRT_HASH_TEST_BIT(cmap, index, i))
                break;
        }

        /* compute offset and length, mark pages clean */
        offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
        length = ls * PAGE_SIZE;
        vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        cmap->scm_lastclean = index;

        /* return successful */
        *offsetp = (off_t)offset;
        *lengthp = length;

        vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
        return(KERN_SUCCESS);
    }
    /*
     * We didn't find anything... hashtable is empty
     * emit stats into trace buffer and
     * then free it
     */
    vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                  cmap->scm_modulus,
                  cmap->scm_buckets,
                  cmap->scm_lastclean,
                  cmap->scm_iskips);

    vfs_drt_free_map(cmap);
    *cmapp = NULL;

    return(KERN_FAILURE);
}

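/*
 * Illustrative sketch (not compiled): draining dirty clusters with
 * vfs_drt_get_cluster().  'privatep' points at the opaque map pointer
 * previously populated by vfs_drt_mark_pages(); the function name and loop
 * body are invented for the example.  Each successful call hands back one
 * contiguous run of dirty pages and marks it clean; once the map is empty
 * the call returns KERN_FAILURE and the private storage has been released.
 */
#if 0
static void
example_drain_clusters(void **privatep)
{
    off_t offset;
    u_int length;

    while (vfs_drt_get_cluster(privatep, &offset, &length) == KERN_SUCCESS) {
        /* ... issue the write for pages [offset, offset + length) here ... */
    }
    /* *privatep is NULL here; the map was freed when it became empty */
}
#endif
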
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return(KERN_FAILURE);
    cmap = *cmapp;

    switch (op_type) {
    case 0:
        /* emit stats into trace buffer */
        vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                      cmap->scm_modulus,
                      cmap->scm_buckets,
                      cmap->scm_lastclean,
                      cmap->scm_iskips);

        vfs_drt_free_map(cmap);
        *cmapp = NULL;
        break;

    case 1:
        cmap->scm_lastclean = 0;
        break;
    }
    return(KERN_SUCCESS);
}

/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
    KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}

/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
    int index, i;
    int bits_on;

    for (index = 0; index < cmap->scm_modulus; index++) {
        if (DRT_HASH_VACANT(cmap, index))
            continue;

        for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i))
                bits_on++;
        }
        if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
            panic("bits_on = %d,  index = %d\n", bits_on, index);
    }
}
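
/*
 * Illustrative sketch (not compiled): the invariant vfs_drt_sanity() checks.
 * Each hash entry carries a summary count that must equal the number of set
 * bits in its page bitvector; the model below states the same check for a
 * single stand-in 64-bit bitvector word rather than the DRT_HASH_* macros.
 */
#if 0
static int
example_count_matches(u_int64_t bitvector, int summary_count)
{
    int bits_on;

    for (bits_on = 0; bitvector != 0; bitvector >>= 1) {
        if (bitvector & 1)
            bits_on++;
    }
    return (bits_on == summary_count);
}
#endif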