/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
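/*
 * A 'struct clios' is supplied by callers that stream multiple async
 * cluster_io requests (see cluster_nocopy_write below): the issuer adds to
 * io_issued for each I/O it hands off, cluster_iodone adds to io_completed
 * and wakes anyone sleeping on io_wanted, and the issuer tsleeps until
 * io_issued == io_completed before reporting io_error back to its caller.
 */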
static void cluster_zero(upl_t upl, vm_offset_t   upl_offset,
		int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
		off_t oldEOF, off_t newEOF, off_t headOff,
		off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_phys_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
		addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE		18
#define HARD_THROTTLE_MAXCNT	1
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
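/*
 * hard_throttle_on_root and priority_IO_timestamp_for_root drive the
 * "hard throttle" below: while the root device has seen priority I/O
 * within hard_throttle_maxelapsed (300 msec), competing cluster I/O is
 * clamped to HARD_THROTTLE_MAXSIZE bytes and HARD_THROTTLE_MAXCNT
 * outstanding requests.  (The timestamp is presumably refreshed by the
 * priority-I/O issuer elsewhere in the kernel.)
 */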
static int
cluster_hard_throttle_on(vp)
	struct vnode *vp;
{
	static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
static int
cluster_iodone(bp)
	struct buf *bp;
{
	struct buf	*cbp_head;
	struct buf	*cbp_next;
	struct clios	*iostate;

	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	if (iostate) {
		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			int upl_abort_code;

			if ((b_flags & B_PAGEOUT) && (error != ENXIO))	/* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if (b_flags & B_PHYS) {
				if (b_flags & B_READ)
					upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
			} else if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

			if (b_flags & B_PAGEOUT)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);
	}
	return (error);
}
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t         upl;
	vm_offset_t   upl_offset;
	int           size;
	struct buf   *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {
		pl = ubc_upl_pageinfo(upl);

		while (size) {
			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
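/*
 * cluster_zero takes two paths: if the buf carries a kernel mapping
 * (b_data), the range is simply bzero'd; otherwise the UPL's page list is
 * walked and each physical page is zeroed with bzero_phys, so no mapping
 * of the pages is ever required.
 */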
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           non_rounded_size;
	int           devblocksize;
	int           flags;
	struct buf   *real_bp;
	struct clios *iostate;
{
	struct buf   *cbp_head = 0;
	struct buf   *cbp_tail = 0;

	if (devblocksize)
		size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
	else
		size = non_rounded_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	/*
	 * make sure the maximum iosize is at least the size of a page
	 * and that it is a multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = ASYNC_THROTTLE;
	}
	if (flags & CL_DUMP)
		io_flags |= B_NOCACHE;
	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;
	if (flags & CL_PAGEOUT)
		io_flags |= B_PAGEOUT;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file)
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
			if (error == EOPNOTSUPP)
				panic("VOP_CMAP Unimplemented");
			break;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
			if (flags & CL_PAGEOUT) {
				error = EINVAL;
				break;
			}
			/*
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (it could
			 * be mapped in a "hole" and require allocation
			 * before the I/O)
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
				error = EINVAL;
				break;
			}
			f_offset   += PAGE_SIZE_64;
			upl_offset += PAGE_SIZE;
			size       -= PAGE_SIZE;
			continue;
		}
		lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if ((long)blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && (long)blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * than 'zero_offset' will be non-zero
				 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * than we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so, we have to make sure it gets committed
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
				pg_offset = 0;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							(upl_offset + pg_resid) & ~PAGE_MASK,
							pg_count * PAGE_SIZE,
							UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			if (cbp_head && pg_count)
				goto start_io;
			continue;

		} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}
		if (pg_count > max_vectors) {
			io_size -= (pg_count - max_vectors) * PAGE_SIZE;

			if (io_size < 0) {
				io_size  = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else
				pg_count = max_vectors;
		}
		if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			for (i = 0; i < pg_count; i++) {
				if (bp = incore(vp, lblkno + i)) {
					if (!ISSET(bp->b_flags, B_BUSY)) {
						bremfree(bp);
						SET(bp->b_flags, (B_BUSY | B_INVAL));
						brelse(bp);
					} else
						panic("BUSY bp found in cluster_io");
				}
			}
		}
		if (flags & CL_ASYNC) {
			cbp->b_flags |= (B_CALL | B_ASYNC);
			cbp->b_iodone = (void *)cluster_iodone;
		}
		cbp->b_flags |= io_flags;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;
		cbp->b_pagelist  = upl;
		cbp->b_uploffset = upl_offset;
		cbp->b_trans_next = (struct buf *)0;

		if (cbp->b_iostate = (void *)iostate)
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ)
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
		else
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(struct buf *)(cbp->b_trans_head) = cbp_head;
		buf_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;

		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
start_io:
			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (struct buf *)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, than we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			if (flags & CL_THROTTLE) {
				while (vp->v_numoutput >= async_throttle) {
					vp->v_flag |= VTHROTTLED;
					tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
				}
			}
			for (cbp = cbp_head; cbp;) {
				struct buf * cbp_next;

				if (io_flags & B_WRITEINPROG)
					cbp->b_vp->v_numoutput++;

				cbp_next = cbp->b_trans_next;

				(void) VOP_STRATEGY(cbp);
				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					biowait(cbp);

				if (error = cluster_iodone(cbp_head)) {
					if ((flags & CL_PAGEOUT) && (error == ENXIO))
						retval = 0;	/* drop the error */
					else
						retval = error;
					error  = 0;
				}
			}
			cbp_head = (struct buf *)0;
			cbp_tail = (struct buf *)0;

			buf_count = 0;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			upl_offset -= cbp->b_bcount;
			size       += cbp->b_bcount;
			io_size    += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}
		if (iostate) {
			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				wakeup((caddr_t)&iostate->io_wanted);
			}
		}
		pg_offset  = upl_offset & PAGE_MASK;
		abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & CL_COMMIT) {
			int upl_abort_code;

			if (flags & CL_PRESERVE) {
				ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
						     UPL_COMMIT_FREE_ON_EMPTY);
			} else {
				if ((flags & CL_PAGEOUT) && (error != ENXIO))	/* transient error */
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
				else if (flags & CL_PAGEIN)
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
				else
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

				ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
						    upl_abort_code);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
		}
		if (real_bp) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error  = error;

			biodone(real_bp);
		}
		if (retval == 0)
			retval = error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
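/*
 * Typical usage (see cluster_nocopy_write below): a caller builds a UPL
 * covering the user buffer, then issues
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
 *			   CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE,
 *			   (struct buf *)0, &iostate);
 *
 * and later drains 'iostate' (io_issued vs io_completed) before returning.
 * With CL_COMMIT set, responsibility for committing or aborting the UPL
 * pages rests with cluster_io and cluster_iodone.
 */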
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
	struct vnode *vp;
	off_t         f_offset;
	u_int         size;
	off_t         filesize;
	int           devblocksize;
{
	int           pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);

	size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;

	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
	struct vnode *vp;
	daddr_t       b_lblkno;
	daddr_t       e_lblkno;
	off_t         filesize;
	int           devblocksize;
{
	int           size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}
	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
				 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
		vp->v_ralen = 0;
		vp->v_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

		return;
	}
	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

		if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
			vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
}
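/*
 * Read-ahead bookkeeping lives in the vnode: v_lastr is the last logical
 * (page-sized) block read, v_maxra the last block already prefetched, and
 * v_ralen the current window in pages.  The window doubles on each
 * sequential hit, capped at MAX_UPL_TRANSFER, and is reset whenever the
 * access pattern stops looking sequential.
 */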
int
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           size;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	vp->v_flag |= VHASBEENPAGED;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			   local_flags, (struct buf *)0, (struct clios *)0));
}
int
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           size;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	int           local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			    local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

	if (retval == 0) {
		int b_lblkno;
		int e_lblkno;

		b_lblkno = (int)(f_offset / PAGE_SIZE_64);
		e_lblkno = (int)
			((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

		if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
			/*
			 * we haven't read the last page in of the file yet
			 * so let's try to read ahead if we're in
			 * a sequential access pattern
			 */
			cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
		}
		vp->v_lastr = e_lblkno;
	}
	return (retval);
}
int
cluster_bp(bp)
	struct buf *bp;
{
	off_t  f_offset;
	int    flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_pagelist == (upl_t) 0)
		panic("cluster_bp: can't handle NULL upl yet\n");
	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
}
int
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         oldEOF;
	off_t         newEOF;
	off_t         headOff;
	off_t         tailOff;
	int           devblocksize;
	int           flags;
{
	if (vp->v_flag & VHASBEENPAGED) {
		/*
		 * this vnode had pages cleaned to it by
		 * the pager which indicates that either
		 * it's not very 'hot', or the system is
		 * being overwhelmed by a lot of dirty
		 * data being delayed in the VM cache...
		 * in either event, we'll push our remaining
		 * delayed data at this point...  this will
		 * be more efficient than paging out 1 page at
		 * a time, and will also act as a throttle
		 * by delaying this client from writing any
		 * more data until all his delayed data has
		 * at least been queued to the underlying driver.
		 */
		cluster_push(vp);

		vp->v_flag &= ~VHASBEENPAGED;
	}
	if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
	}

	while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0) {
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		iov = uio->uio_iov;

		while (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			iov = uio->uio_iov;
		}
		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			return (EFAULT);
		}

		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {
			if (flags & IO_HEADZEROFILL) {
				flags &= ~IO_HEADZEROFILL;

				if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
					return(retval);
			}

			retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

			if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
			}
		}
		else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
		}
		else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		}
	}
	return(retval);
}
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         newEOF;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	int		 force_data_sync;
	struct clios	 iostate;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio->uio_resid,
		     (int)newEOF, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	iov = uio->uio_iov;

	while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
		io_size = uio->uio_resid;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * cluster_nocopy_write: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_writes;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_writes;
		}
		io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
	}

wait_for_writes:
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
	}
	if (iostate.io_error)
		error = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}
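/*
 * The stream above never allows more than 2 * MAX_UPL_TRANSFER * PAGE_SIZE
 * bytes of async writes to be outstanding at once: the issuer sleeps on
 * iostate.io_wanted and cluster_iodone wakes it as completions catch up,
 * which bounds wired memory while still overlapping UPL preparation with
 * the I/O already in flight.
 */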
static int
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         newEOF;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iov = uio->uio_iov;
	io_size = iov->iov_len;
	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_write: failed to get pagelist
		 * note: return kret here
		 */
		return(EINVAL);
	}
	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size) {
		kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
		return(EINVAL);
	}
	pl = ubc_upl_pageinfo(upl);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		int head_size;

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

		if (error) {
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

			return(EINVAL);
		}
		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	if (io_size) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
	}
	if (error == 0) {
		/*
		 * The cluster_io write completed successfully,
		 * update the uio structure
		 */
		uio->uio_resid  -= io_size;
		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_offset += io_size;
		src_paddr       += io_size;

		if (tail_size)
			error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
	}
	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
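/*
 * Note the split above: any portion of a physically contiguous write that
 * is not aligned to devblocksize (the head before the first full device
 * block and the tail after the last one) is pushed through
 * cluster_align_phys_io, while the device-block-aligned middle goes to
 * cluster_io as a single CL_DEV_MEMORY ("one giant page") request.
 */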
1604 cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, devblocksize
, flags
)
1614 upl_page_info_t
*pl
;
1616 vm_offset_t upl_offset
;
1630 long long total_size
;
1633 long long zero_cnt1
;
1635 daddr_t start_blkno
;
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1642 (int)uio
->uio_offset
, uio
->uio_resid
, (int)oldEOF
, (int)newEOF
, 0);
1644 uio_resid
= uio
->uio_resid
;
1646 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1647 0, 0, (int)oldEOF
, (int)newEOF
, 0);
1654 if (flags
& IO_HEADZEROFILL
) {
1656 * some filesystems (HFS is one) don't support unallocated holes within a file...
1657 * so we zero fill the intervening space between the old EOF and the offset
1658 * where the next chunk of real data begins.... ftruncate will also use this
1659 * routine to zero fill to the new EOF when growing a file... in this case, the
1660 * uio structure will not be provided
1663 if (headOff
< uio
->uio_offset
) {
1664 zero_cnt
= uio
->uio_offset
- headOff
;
1667 } else if (headOff
< newEOF
) {
1668 zero_cnt
= newEOF
- headOff
;
1672 if (flags
& IO_TAILZEROFILL
) {
1674 zero_off1
= uio
->uio_offset
+ uio
->uio_resid
;
1676 if (zero_off1
< tailOff
)
1677 zero_cnt1
= tailOff
- zero_off1
;
1680 if (zero_cnt
== 0 && uio
== (struct uio
*) 0) {
1681 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
1682 retval
, 0, 0, 0, 0);
1686 while ((total_size
= (uio_resid
+ zero_cnt
+ zero_cnt1
)) && retval
== 0) {
1688 * for this iteration of the loop, figure out where our starting point is
1691 start_offset
= (int)(zero_off
& PAGE_MASK_64
);
1692 upl_f_offset
= zero_off
- start_offset
;
1693 } else if (uio_resid
) {
1694 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1695 upl_f_offset
= uio
->uio_offset
- start_offset
;
1697 start_offset
= (int)(zero_off1
& PAGE_MASK_64
);
1698 upl_f_offset
= zero_off1
- start_offset
;
1700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
,
1701 (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0);
1703 if (total_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1704 total_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1706 start_blkno
= (daddr_t
)(upl_f_offset
/ PAGE_SIZE_64
);
1708 if (uio
&& !(vp
->v_flag
& VNOCACHE_DATA
) &&
1709 (flags
& (IO_SYNC
| IO_HEADZEROFILL
| IO_TAILZEROFILL
)) == 0) {
1711 * assumption... total_size <= uio_resid
1712 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1714 if ((start_offset
+ total_size
) > (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1715 total_size
-= start_offset
;
1716 xfer_resid
= total_size
;
1718 retval
= cluster_copy_ubc_data(vp
, uio
, &xfer_resid
, 1);
1723 uio_resid
-= (total_size
- xfer_resid
);
1724 total_size
= xfer_resid
;
1725 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1726 upl_f_offset
= uio
->uio_offset
- start_offset
;
1728 if (total_size
== 0) {
1731 * the write did not finish on a page boundary
1732 * which will leave upl_f_offset pointing to the
1733 * beginning of the last page written instead of
1734 * the page beyond it... bump it in this case
1735 * so that the cluster code records the last page
1738 upl_f_offset
+= PAGE_SIZE_64
;
1746 * compute the size of the upl needed to encompass
1747 * the requested write... limit each call to cluster_io
1748 * to the maximum UPL size... cluster_io will clip if
1749 * this exceeds the maximum io_size for the device,
1750 * make sure to account for
1751 * a starting offset that's not page aligned
1753 upl_size
= (start_offset
+ total_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1755 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1756 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1758 pages_in_upl
= upl_size
/ PAGE_SIZE
;
1759 io_size
= upl_size
- start_offset
;
1761 if ((long long)io_size
> total_size
)
1762 io_size
= total_size
;
1764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
1767 kret
= ubc_create_upl(vp
,
1773 if (kret
!= KERN_SUCCESS
)
1774 panic("cluster_write: failed to get pagelist");
1776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
1777 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
1779 if (start_offset
&& !upl_valid_page(pl
, 0)) {
1783 * we're starting in the middle of the first page of the upl
1784 * and the page isn't currently valid, so we're going to have
1785 * to read it in first... this is a synchronous operation
1787 read_size
= PAGE_SIZE
;
1789 if ((upl_f_offset
+ read_size
) > newEOF
)
1790 read_size
= newEOF
- upl_f_offset
;
1792 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
, devblocksize
,
1793 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1796 * we had an error during the read which causes us to abort
1797 * the current cluster_write request... before we do, we need
1798 * to release the rest of the pages in the upl without modifying
1799 * there state and mark the failed page in error
1801 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1802 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1805 (int)upl
, 0, 0, retval
, 0);
1809 if ((start_offset
== 0 || upl_size
> PAGE_SIZE
) && ((start_offset
+ io_size
) & PAGE_MASK
)) {
1811 * the last offset we're writing to in this upl does not end on a page
1812 * boundary... if it's not beyond the old EOF, then we'll also need to
1813 * pre-read this page in if it isn't already valid
1815 upl_offset
= upl_size
- PAGE_SIZE
;
1817 if ((upl_f_offset
+ start_offset
+ io_size
) < oldEOF
&&
1818 !upl_valid_page(pl
, upl_offset
/ PAGE_SIZE
)) {
1821 read_size
= PAGE_SIZE
;
1823 if ((upl_f_offset
+ upl_offset
+ read_size
) > newEOF
)
1824 read_size
= newEOF
- (upl_f_offset
+ upl_offset
);
1826 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, read_size
, devblocksize
,
1827 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1830 * we had an error during the read which causes us to abort
1831 * the current cluster_write request... before we do, we
1832 * need to release the rest of the pages in the upl without
1833 * modifying there state and mark the failed page in error
1835 ubc_upl_abort_range(upl
, upl_offset
, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1836 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1838 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1839 (int)upl
, 0, 0, retval
, 0);
1844 xfer_resid
= io_size
;
1845 io_offset
= start_offset
;
1847 while (zero_cnt
&& xfer_resid
) {
1849 if (zero_cnt
< (long long)xfer_resid
)
1850 bytes_to_zero
= zero_cnt
;
1852 bytes_to_zero
= xfer_resid
;
1854 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1855 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1859 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off
& PAGE_MASK_64
));
1860 zero_pg_index
= (int)((zero_off
- upl_f_offset
) / PAGE_SIZE_64
);
1862 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1863 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1865 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1866 !upl_dirty_page(pl
, zero_pg_index
)) {
1867 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1870 xfer_resid
-= bytes_to_zero
;
1871 zero_cnt
-= bytes_to_zero
;
1872 zero_off
+= bytes_to_zero
;
1873 io_offset
+= bytes_to_zero
;
1875 if (xfer_resid
&& uio_resid
) {
1876 bytes_to_move
= min(uio_resid
, xfer_resid
);
1878 retval
= cluster_copy_upl_data(uio
, upl
, io_offset
, bytes_to_move
);
1882 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
1884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1885 (int)upl
, 0, 0, retval
, 0);
1887 uio_resid
-= bytes_to_move
;
1888 xfer_resid
-= bytes_to_move
;
1889 io_offset
+= bytes_to_move
;
1892 while (xfer_resid
&& zero_cnt1
&& retval
== 0) {
1894 if (zero_cnt1
< (long long)xfer_resid
)
1895 bytes_to_zero
= zero_cnt1
;
1897 bytes_to_zero
= xfer_resid
;
1899 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1900 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1904 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off1
& PAGE_MASK_64
));
1905 zero_pg_index
= (int)((zero_off1
- upl_f_offset
) / PAGE_SIZE_64
);
1907 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1908 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1909 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1910 !upl_dirty_page(pl
, zero_pg_index
)) {
1911 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1914 xfer_resid
-= bytes_to_zero
;
1915 zero_cnt1
-= bytes_to_zero
;
1916 zero_off1
+= bytes_to_zero
;
1917 io_offset
+= bytes_to_zero
;
1924 io_size
+= start_offset
;
1926 if ((upl_f_offset
+ io_size
) >= newEOF
&& io_size
< upl_size
) {
1928 * if we're extending the file with this write
1929 * we'll zero fill the rest of the page so that
1930 * if the file gets extended again in such a way as to leave a
1931 * hole starting at this EOF, we'll have zero's in the correct spot
1933 cluster_zero(upl
, io_size
, upl_size
- io_size
, NULL
);
1935 if (flags
& IO_SYNC
)
1937 * if the IO_SYNC flag is set than we need to
1938 * bypass any clusters and immediately issue
1944 * calculate the last logical block number
1945 * that this delayed I/O encompassed
1947 last_blkno
= (upl_f_offset
+ (off_t
)upl_size
) / PAGE_SIZE_64
;
1949 if (vp
->v_flag
& VHASDIRTY
) {
1951 if ( !(vp
->v_flag
& VNOCACHE_DATA
)) {
1953 * we've fallen into the sparse
1954 * cluster method of delaying dirty pages
1955 * first, we need to release the upl if we hold one
1956 * since pages in it may be present in the sparse cluster map
1957 * and may span 2 separate buckets there... if they do and
1958 * we happen to have to flush a bucket to make room and it intersects
1959 * this upl, a deadlock may result on page BUSY
1962 ubc_upl_commit_range(upl
, 0, upl_size
,
1963 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1965 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
1970 * must have done cached writes that fell into
1971 * the sparse cluster mechanism... we've switched
1972 * to uncached writes on the file, so go ahead
1973 * and push whatever's in the sparse map
1974 * and switch back to normal clustering
1976 * see the comment above concerning a possible deadlock...
1979 ubc_upl_commit_range(upl
, 0, upl_size
,
1980 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1982 * setting upl_size to 0 keeps us from committing a
1983 * second time in the start_new_cluster path
1987 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
1990 * no clusters of either type present at this point
1991 * so just go directly to start_new_cluster since
1992 * we know we need to delay this I/O since we've
1993 * already released the pages back into the cache
1994 * to avoid the deadlock with sparse_cluster_push
1996 goto start_new_cluster
;
2000 if (vp
->v_clen
== 0)
2002 * no clusters currently present
2004 goto start_new_cluster
;
2006 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
2008 * check each cluster that we currently hold
2009 * try to merge some or all of this write into
2010 * one or more of the existing clusters... if
2011 * any portion of the write remains, start a
2014 if (start_blkno
>= vp
->v_clusters
[cl_index
].start_pg
) {
2016 * the current write starts at or after the current cluster
2018 if (last_blkno
<= (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2020 * we have a write that fits entirely
2021 * within the existing cluster limits
2023 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
)
2025 * update our idea of where the cluster ends
2027 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2030 if (start_blkno
< (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2032 * we have a write that starts in the middle of the current cluster
2033 * but extends beyond the cluster's limit... we know this because
2034 * of the previous checks
2035 * we'll extend the current cluster to the max
2036 * and update the start_blkno for the current write to reflect that
2037 * the head of it was absorbed into this cluster...
2038 * note that we'll always have a leftover tail in this case since
2039 * full absorbtion would have occurred in the clause above
2041 vp
->v_clusters
[cl_index
].last_pg
= vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
;
2044 int start_pg_in_upl
;
2046 start_pg_in_upl
= upl_f_offset
/ PAGE_SIZE_64
;
2048 if (start_pg_in_upl
< vp
->v_clusters
[cl_index
].last_pg
) {
2049 intersection
= (vp
->v_clusters
[cl_index
].last_pg
- start_pg_in_upl
) * PAGE_SIZE
;
2051 ubc_upl_commit_range(upl
, upl_offset
, intersection
,
2052 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2053 upl_f_offset
+= intersection
;
2054 upl_offset
+= intersection
;
2055 upl_size
-= intersection
;
2058 start_blkno
= vp
->v_clusters
[cl_index
].last_pg
;
2061 * we come here for the case where the current write starts
2062 * beyond the limit of the existing cluster or we have a leftover
2063 * tail after a partial absorbtion
2065 * in either case, we'll check the remaining clusters before
2066 * starting a new one
2070 * the current write starts in front of the cluster we're currently considering
2072 if ((vp
->v_clusters
[cl_index
].last_pg
- start_blkno
) <= MAX_UPL_TRANSFER
) {
2074 * we can just merge the new request into
2075 * this cluster and leave it in the cache
2076 * since the resulting cluster is still
2077 * less than the maximum allowable size
2079 vp
->v_clusters
[cl_index
].start_pg
= start_blkno
;
2081 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
) {
2083 * the current write completely
2084 * envelops the existing cluster and since
2085 * each write is limited to at most MAX_UPL_TRANSFER bytes
2086 * we can just use the start and last blocknos of the write
2087 * to generate the cluster limits
2089 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2095 * if we were to combine this write with the current cluster
2096 * we would exceed the cluster size limit.... so,
2097 * let's see if there's any overlap of the new I/O with
2098 * the cluster we're currently considering... in fact, we'll
2099 * stretch the cluster out to it's full limit and see if we
2100 * get an intersection with the current write
2103 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
- MAX_UPL_TRANSFER
) {
2105 * the current write extends into the proposed cluster
2106 * clip the length of the current write after first combining it's
2107 * tail with the newly shaped cluster
2109 vp
->v_clusters
[cl_index
].start_pg
= vp
->v_clusters
[cl_index
].last_pg
- MAX_UPL_TRANSFER
;
2112 intersection
= (last_blkno
- vp
->v_clusters
[cl_index
].start_pg
) * PAGE_SIZE
;
2114 if (intersection
> upl_size
)
2116 * because the current write may consist of a number of pages found in the cache
2117 * which are not part of the UPL, we may have an intersection that exceeds
2118 * the size of the UPL that is also part of this write
2120 intersection
= upl_size
;
2122 ubc_upl_commit_range(upl
, upl_offset
+ (upl_size
- intersection
), intersection
,
2123 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2124 upl_size
-= intersection
;
2126 last_blkno
= vp
->v_clusters
[cl_index
].start_pg
;
2129 * if we get here, there was no way to merge
2130 * any portion of this write with this cluster
2131 * or we could only merge part of it which
2132 * will leave a tail...
2133 * we'll check the remaining clusters before starting a new one
2137 if (cl_index
< vp
->v_clen
)
2139 * we found an existing cluster(s) that we
2140 * could entirely merge this I/O into
2144 if (vp
->v_clen
< MAX_CLUSTERS
&& !(vp
->v_flag
& VNOCACHE_DATA
))
2146 * we didn't find an existing cluster to
2147 * merge into, but there's room to start
2150 goto start_new_cluster
;
2153 * no exisitng cluster to merge with and no
2154 * room to start a new one... we'll try
2155 * pushing one of the existing ones... if none of
2156 * them are able to be pushed, we'll switch
2157 * to the sparse cluster mechanism
2158 * cluster_try_push updates v_clen to the
2159 * number of remaining clusters... and
2160 * returns the number of currently unused clusters
2162 if (vp
->v_flag
& VNOCACHE_DATA
)
2167 if (cluster_try_push(vp
, newEOF
, can_delay
, 0) == 0) {
			/*
			 * no more room in the normal cluster mechanism
			 * so let's switch to the more expansive but expensive
			 * sparse mechanism....
			 * first, we need to release the upl if we hold one
			 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
			 * and may span 2 separate buckets there... if they do and
			 * we happen to have to flush a bucket to make room and it intersects
			 * this upl, a deadlock may result on page BUSY
			 */
2179 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2180 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2182 sparse_cluster_switch(vp
, newEOF
);
2183 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
			/*
			 * we pushed one cluster successfully, so we must be sequentially writing this file
			 * otherwise, we would have failed and fallen into the sparse cluster support
			 * so let's take the opportunity to push out additional clusters as long as we
			 * remain below the throttle... this will give us better I/O locality if we're
			 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
			 * however, we don't want to push so much out that the write throttle kicks in and
			 * hangs this thread up until some of the I/O completes...
			 */
2196 while (vp
->v_clen
&& (vp
->v_numoutput
<= (ASYNC_THROTTLE
/ 2)))
2197 cluster_try_push(vp
, newEOF
, 0, 0);
2200 if (vp
->v_clen
== 0)
2201 vp
->v_ciosiz
= devblocksize
;
2203 vp
->v_clusters
[vp
->v_clen
].start_pg
= start_blkno
;
2204 vp
->v_clusters
[vp
->v_clen
].last_pg
= last_blkno
;
2209 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2210 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
		/*
		 * in order to maintain some semblance of coherency with mapped writes
		 * we need to write the cluster back out as a multiple of the PAGESIZE
		 * unless the cluster encompasses the last page of the file... in this
		 * case we'll round out to the nearest device block boundary
		 */
		if ((upl_f_offset + io_size) > newEOF) {
			io_size = newEOF - upl_f_offset;
			io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
		}
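		/*
		 * e.g. with a 512-byte devblocksize, an io_size of 0x1234 becomes
		 * (0x1234 + 0x1ff) & ~0x1ff == 0x1400, so the tail of the file is
		 * still written out in whole device blocks rather than a partial one
		 */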
2226 if (flags
& IO_SYNC
)
2227 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
;
2229 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
| CL_ASYNC
;
2231 if (vp
->v_flag
& VNOCACHE_DATA
)
2232 io_flags
|= CL_DUMP
;
2234 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, io_size
, devblocksize
,
2235 io_flags
, (struct buf
*)0, (struct clios
*)0);
2238 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
2239 retval
, 0, uio_resid
, 0, 0);
2245 cluster_read(vp
, uio
, filesize
, devblocksize
, flags
)
2262 if (!((vp
->v_flag
& VNOCACHE_DATA
) && (uio
->uio_segflg
== UIO_USERSPACE
)))
2265 * go do a read through the cache if one of the following is true....
2266 * NOCACHE is not true
2267 * the uio request doesn't target USERSPACE
2269 return (cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
));
2272 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0)
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
2280 while (iov
->iov_len
== 0) {
2285 upl_size
= PAGE_SIZE
;
2286 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
2288 if ((vm_map_get_upl(current_map(),
2289 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2290 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, 0)) != KERN_SUCCESS
)
2293 * the user app must have passed in an invalid address
2299 * We check every vector target but if it is physically
2300 * contiguous space, we skip the sanity checks.
2302 if (upl_flags
& UPL_PHYS_CONTIG
)
2304 retval
= cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
);
2306 else if (uio
->uio_resid
< PAGE_SIZE
)
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a read through the cache if
			 * the total xfer size is less than a page...
			 */
2313 return (cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
));
2315 else if (((int)uio
->uio_offset
& PAGE_MASK
) || ((int)iov
->iov_base
& PAGE_MASK
))
2317 if (((int)uio
->uio_offset
& PAGE_MASK
) == ((int)iov
->iov_base
& PAGE_MASK
))
				/*
				 * Bring the file offset read up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
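				/*
				 * e.g. with 4K pages, a uio_offset of 0x1a200 yields
				 * clip_size == 0x1000 - 0x200 == 0xe00, which carries both
				 * the file offset and iov_base to the next page boundary
				 * on this pass
				 */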
2329 * Fake the resid going into the cluster_read_x call
2330 * and restore it on the way out.
2332 prev_resid
= uio
->uio_resid
;
2333 uio
->uio_resid
= clip_size
;
2334 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2335 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2340 * can't get both the file offset and the buffer offset aligned to a page boundary
2341 * so fire an I/O through the cache for this entire vector
2343 clip_size
= iov
->iov_len
;
2344 prev_resid
= uio
->uio_resid
;
2345 uio
->uio_resid
= clip_size
;
2346 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2347 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2353 * If we come in here, we know the offset into
2354 * the file is on a pagesize boundary
2357 max_io_size
= filesize
- uio
->uio_offset
;
2358 clip_size
= uio
->uio_resid
;
2359 if (iov
->iov_len
< clip_size
)
2360 clip_size
= iov
->iov_len
;
2361 if (max_io_size
< clip_size
)
2362 clip_size
= (int)max_io_size
;
2364 if (clip_size
< PAGE_SIZE
)
2367 * Take care of the tail end of the read in this vector.
2369 prev_resid
= uio
->uio_resid
;
2370 uio
->uio_resid
= clip_size
;
2371 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2372 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2376 /* round clip_size down to a multiple of pagesize */
2377 clip_size
= clip_size
& ~(PAGE_MASK
);
2378 prev_resid
= uio
->uio_resid
;
2379 uio
->uio_resid
= clip_size
;
2380 retval
= cluster_nocopy_read(vp
, uio
, filesize
, devblocksize
, flags
);
2381 if ((retval
==0) && uio
->uio_resid
)
2382 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2383 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2392 cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
)
2399 upl_page_info_t
*pl
;
2401 vm_offset_t upl_offset
;
2410 off_t last_ioread_offset
;
2411 off_t last_request_offset
;
2412 u_int size_of_prefetch
;
2419 struct clios iostate
;
2420 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2421 u_int rd_ahead_enabled
= 1;
2422 u_int prefetch_enabled
= 1;
2425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
2426 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
2428 if (cluster_hard_throttle_on(vp
)) {
2429 rd_ahead_enabled
= 0;
2430 prefetch_enabled
= 0;
2432 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
2434 if (vp
->v_flag
& (VRAOFF
|VNOCACHE_DATA
))
2435 rd_ahead_enabled
= 0;
2437 last_request_offset
= uio
->uio_offset
+ uio
->uio_resid
;
2439 if (last_request_offset
> filesize
)
2440 last_request_offset
= filesize
;
2441 b_lblkno
= (u_int
)(uio
->uio_offset
/ PAGE_SIZE_64
);
2442 e_lblkno
= (u_int
)((last_request_offset
- 1) / PAGE_SIZE_64
);
2444 if (vp
->v_ralen
&& (vp
->v_lastr
== b_lblkno
|| (vp
->v_lastr
+ 1) == b_lblkno
)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up its extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
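		/*
		 * v_maxra is the last page already read ahead, so the offset
		 * computed above points at the first byte that has not yet been
		 * asked for from the disk
		 */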
2454 if (last_ioread_offset
< uio
->uio_offset
)
2455 last_ioread_offset
= (off_t
)0;
2456 else if (last_ioread_offset
> last_request_offset
)
2457 last_ioread_offset
= last_request_offset
;
2459 last_ioread_offset
= (off_t
)0;
2461 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0) {
2463 * compute the size of the upl needed to encompass
2464 * the requested read... limit each call to cluster_io
2465 * to the maximum UPL size... cluster_io will clip if
2466 * this exceeds the maximum io_size for the device,
2467 * make sure to account for
2468 * a starting offset that's not page aligned
2470 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2471 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2472 max_size
= filesize
- uio
->uio_offset
;
2474 if ((off_t
)((unsigned int)uio
->uio_resid
) < max_size
)
2475 io_size
= uio
->uio_resid
;
2479 if (!(vp
->v_flag
& VNOCACHE_DATA
)) {
			/*
			 * if we keep finding the pages we need already in the cache, then
			 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
			 * to determine that we have all the pages we need... once we miss in
			 * the cache and have issued an I/O, then we'll assume that we're likely
			 * to continue to miss in the cache and it's to our advantage to try and prefetch
			 */
2492 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (last_request_offset
- last_ioread_offset
))) {
2493 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
2495 * we've already issued I/O for this request and
2496 * there's still work to do and
2497 * our prefetch stream is running dry, so issue a
2498 * pre-fetch I/O... the I/O latency will overlap
2499 * with the copying of the data
2501 if (size_of_prefetch
> max_rd_size
)
2502 size_of_prefetch
= max_rd_size
;
2504 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, devblocksize
);
2506 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2508 if (last_ioread_offset
> last_request_offset
)
2509 last_ioread_offset
= last_request_offset
;
2513 * limit the size of the copy we're about to do so that
2514 * we can notice that our I/O pipe is running dry and
2515 * get the next I/O issued before it does go dry
2517 if (last_ioread_offset
&& io_size
> ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4))
2518 io_resid
= ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4);
2522 io_requested
= io_resid
;
2524 retval
= cluster_copy_ubc_data(vp
, uio
, &io_resid
, 0);
2526 io_size
-= (io_requested
- io_resid
);
2528 if (retval
|| io_resid
)
2530 * if we run into a real error or
2531 * a page that is not in the cache
2532 * we need to leave streaming mode
2536 if ((io_size
== 0 || last_ioread_offset
== last_request_offset
) && rd_ahead_enabled
) {
			/*
			 * we've already finished the I/O for this read request
			 * let's see if we should do a read-ahead
			 */
2541 cluster_rd_ahead(vp
, b_lblkno
, e_lblkno
, filesize
, devblocksize
);
2547 if (e_lblkno
< vp
->v_lastr
)
2549 vp
->v_lastr
= e_lblkno
;
2553 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2554 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2555 max_size
= filesize
- uio
->uio_offset
;
2557 if (io_size
> max_rd_size
)
2558 io_size
= max_rd_size
;
2560 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
2562 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4)
2563 upl_size
= (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4;
2564 pages_in_upl
= upl_size
/ PAGE_SIZE
;
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_START
,
2567 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2569 kret
= ubc_create_upl(vp
,
2575 if (kret
!= KERN_SUCCESS
)
2576 panic("cluster_read: failed to get pagelist");
2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_END
,
2579 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2582 * scan from the beginning of the upl looking for the first
2583 * non-valid page.... this will become the first page in
2584 * the request we're going to make to 'cluster_io'... if all
2585 * of the pages are valid, we won't call through to 'cluster_io'
2587 for (start_pg
= 0; start_pg
< pages_in_upl
; start_pg
++) {
2588 if (!upl_valid_page(pl
, start_pg
))
2593 * scan from the starting invalid page looking for a valid
2594 * page before the end of the upl is reached, if we
2595 * find one, then it will be the last page of the request to
2598 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
2599 if (upl_valid_page(pl
, last_pg
))
2602 iostate
.io_completed
= 0;
2603 iostate
.io_issued
= 0;
2604 iostate
.io_error
= 0;
2605 iostate
.io_wanted
= 0;
2607 if (start_pg
< last_pg
) {
2609 * we found a range of 'invalid' pages that must be filled
2610 * if the last page in this range is the last page of the file
2611 * we may have to clip the size of it to keep from reading past
2612 * the end of the last physical block associated with the file
2614 upl_offset
= start_pg
* PAGE_SIZE
;
2615 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
2617 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
2618 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
2621 * issue an asynchronous read to cluster_io
2624 error
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
,
2625 io_size
, devblocksize
, CL_READ
| CL_ASYNC
, (struct buf
*)0, &iostate
);
		/*
		 * if the read completed successfully, or there was no I/O request
		 * issued, then copy the data into user land via 'cluster_copy_upl_data'
		 * we'll first add on any 'valid'
		 * pages that were present in the upl when we acquired it.
		 */
2636 for (uio_last
= last_pg
; uio_last
< pages_in_upl
; uio_last
++) {
2637 if (!upl_valid_page(pl
, uio_last
))
2641 * compute size to transfer this round, if uio->uio_resid is
2642 * still non-zero after this attempt, we'll loop around and
2643 * set up for another I/O.
2645 val_size
= (uio_last
* PAGE_SIZE
) - start_offset
;
2647 if (val_size
> max_size
)
2648 val_size
= max_size
;
2650 if (val_size
> uio
->uio_resid
)
2651 val_size
= uio
->uio_resid
;
2653 if (last_ioread_offset
== 0)
2654 last_ioread_offset
= uio
->uio_offset
+ val_size
;
2656 if ((size_of_prefetch
= (last_request_offset
- last_ioread_offset
)) && prefetch_enabled
) {
2658 * if there's still I/O left to do for this request, and...
2659 * we're not in hard throttle mode, then issue a
2660 * pre-fetch I/O... the I/O latency will overlap
2661 * with the copying of the data
2663 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, devblocksize
);
2665 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2667 if (last_ioread_offset
> last_request_offset
)
2668 last_ioread_offset
= last_request_offset
;
2670 } else if ((uio
->uio_offset
+ val_size
) == last_request_offset
) {
2672 * this transfer will finish this request, so...
2673 * let's try to read ahead if we're in
2674 * a sequential access pattern and we haven't
2675 * explicitly disabled it
2677 if (rd_ahead_enabled
)
2678 cluster_rd_ahead(vp
, b_lblkno
, e_lblkno
, filesize
, devblocksize
);
2680 if (e_lblkno
< vp
->v_lastr
)
2682 vp
->v_lastr
= e_lblkno
;
2684 while (iostate
.io_issued
!= iostate
.io_completed
) {
2685 iostate
.io_wanted
= 1;
2686 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_read_x", 0);
2688 if (iostate
.io_error
)
2689 error
= iostate
.io_error
;
2691 retval
= cluster_copy_upl_data(uio
, upl
, start_offset
, val_size
);
2693 if (start_pg
< last_pg
) {
2695 * compute the range of pages that we actually issued an I/O for
2696 * and either commit them as valid if the I/O succeeded
2697 * or abort them if the I/O failed
2699 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
2701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
2702 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
2704 if (error
|| (vp
->v_flag
& VNOCACHE_DATA
))
2705 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
2706 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2708 ubc_upl_commit_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
2709 UPL_COMMIT_CLEAR_DIRTY
|
2710 UPL_COMMIT_FREE_ON_EMPTY
|
2711 UPL_COMMIT_INACTIVATE
);
2713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
2714 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
2716 if ((last_pg
- start_pg
) < pages_in_upl
) {
2721 * the set of pages that we issued an I/O for did not encompass
2722 * the entire upl... so just release these without modifying
2726 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
2728 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
2729 (int)upl
, -1, pages_in_upl
- (last_pg
- start_pg
), 0, 0);
2733 * we found some already valid pages at the beginning of
2734 * the upl commit these back to the inactive list with
2737 for (cur_pg
= 0; cur_pg
< start_pg
; cur_pg
++) {
2738 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
2739 | UPL_COMMIT_INACTIVATE
;
2741 if (upl_dirty_page(pl
, cur_pg
))
2742 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
2744 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (vp
->v_flag
& VNOCACHE_DATA
))
2745 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
2746 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2748 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
2749 PAGE_SIZE
, commit_flags
);
2752 if (last_pg
< uio_last
) {
2754 * we found some already valid pages immediately after the
2755 * pages we issued I/O for, commit these back to the
2756 * inactive list with reference cleared
2758 for (cur_pg
= last_pg
; cur_pg
< uio_last
; cur_pg
++) {
2759 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
2760 | UPL_COMMIT_INACTIVATE
;
2762 if (upl_dirty_page(pl
, cur_pg
))
2763 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
2765 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (vp
->v_flag
& VNOCACHE_DATA
))
2766 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
2767 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2769 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
2770 PAGE_SIZE
, commit_flags
);
2773 if (uio_last
< pages_in_upl
) {
2775 * there were some invalid pages beyond the valid pages
2776 * that we didn't issue an I/O for, just release them
2779 ubc_upl_abort_range(upl
, uio_last
* PAGE_SIZE
,
2780 (pages_in_upl
- uio_last
) * PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
2783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
2784 (int)upl
, -1, -1, 0, 0);
2790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2791 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2798 cluster_nocopy_read(vp
, uio
, filesize
, devblocksize
, flags
)
2806 upl_page_info_t
*pl
;
2807 vm_offset_t upl_offset
;
2811 int upl_needed_size
;
2817 int force_data_sync
;
2819 struct clios iostate
;
2820 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2821 u_int max_rd_ahead
= MAX_UPL_TRANSFER
* PAGE_SIZE
* 2;
2824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_START
,
2825 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
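	/*
	 * expressed as assertions (illustrative only, not compiled):
	 *
	 *	assert((uio->uio_offset & PAGE_MASK_64) == 0);
	 *	assert((uio->uio_resid & PAGE_MASK) == 0);
	 *	assert(uio->uio_resid <= uio->uio_iov->iov_len);
	 */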
2834 iostate
.io_completed
= 0;
2835 iostate
.io_issued
= 0;
2836 iostate
.io_error
= 0;
2837 iostate
.io_wanted
= 0;
2841 if (cluster_hard_throttle_on(vp
)) {
2842 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
2843 max_rd_ahead
= HARD_THROTTLE_MAXSIZE
- 1;
2845 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0) {
2847 max_io_size
= filesize
- uio
->uio_offset
;
2849 if (max_io_size
< (off_t
)((unsigned int)uio
->uio_resid
))
2850 io_size
= max_io_size
;
2852 io_size
= uio
->uio_resid
;
2855 * First look for pages already in the cache
2856 * and move them to user space.
2858 retval
= cluster_copy_ubc_data(vp
, uio
, &io_size
, 0);
2862 * we may have already spun some portion of this request
2863 * off as async requests... we need to wait for the I/O
2864 * to complete before returning
2866 goto wait_for_reads
;
2869 * If we are already finished with this read, then return
2873 * we may have already spun some portion of this request
2874 * off as async requests... we need to wait for the I/O
2875 * to complete before returning
2877 goto wait_for_reads
;
2879 max_io_size
= io_size
;
2881 if (max_io_size
> max_rd_size
)
2882 max_io_size
= max_rd_size
;
2886 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ max_io_size
, UPL_ROP_ABSENT
, &io_size
);
2890 * we may have already spun some portion of this request
2891 * off as async requests... we need to wait for the I/O
2892 * to complete before returning
2894 goto wait_for_reads
;
2896 upl_offset
= (vm_offset_t
)iov
->iov_base
& PAGE_MASK
;
2897 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
-1)) & ~PAGE_MASK
;
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_START
,
2900 (int)upl_offset
, upl_needed_size
, (int)iov
->iov_base
, io_size
, 0);
2902 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
2904 upl_size
= upl_needed_size
;
2905 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
2907 kret
= vm_map_get_upl(current_map(),
2908 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2909 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, force_data_sync
);
2911 if (kret
!= KERN_SUCCESS
) {
2912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2913 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2915 * cluster_nocopy_read: failed to get pagelist
2917 * we may have already spun some portion of this request
2918 * off as async requests... we need to wait for the I/O
2919 * to complete before returning
2921 goto wait_for_reads
;
2923 pages_in_pl
= upl_size
/ PAGE_SIZE
;
2924 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
2926 for (i
= 0; i
< pages_in_pl
; i
++) {
2927 if (!upl_valid_page(pl
, i
))
2930 if (i
== pages_in_pl
)
2933 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2934 UPL_ABORT_FREE_ON_EMPTY
);
2936 if (force_data_sync
>= 3) {
2937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2938 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2940 goto wait_for_reads
;
2943 * Consider the possibility that upl_size wasn't satisfied.
2945 if (upl_size
!= upl_needed_size
)
2946 io_size
= (upl_size
- (int)upl_offset
) & ~PAGE_MASK
;
2949 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2950 UPL_ABORT_FREE_ON_EMPTY
);
2951 goto wait_for_reads
;
2953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2954 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2957 * request asynchronously so that we can overlap
2958 * the preparation of the next I/O
2959 * if there are already too many outstanding reads
2960 * wait until some have completed before issuing the next read
2962 while ((iostate
.io_issued
- iostate
.io_completed
) > max_rd_ahead
) {
2963 iostate
.io_wanted
= 1;
2964 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
2966 if (iostate
.io_error
) {
2968 * one of the earlier reads we issued ran into a hard error
2969 * don't issue any more reads, cleanup the UPL
2970 * that was just created but not used, then
2971 * go wait for any other reads to complete before
2972 * returning the error to the caller
2974 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2975 UPL_ABORT_FREE_ON_EMPTY
);
2977 goto wait_for_reads
;
2979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_START
,
2980 (int)upl
, (int)upl_offset
, (int)uio
->uio_offset
, io_size
, 0);
2982 retval
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
,
2983 io_size
, devblocksize
,
2984 CL_PRESERVE
| CL_COMMIT
| CL_READ
| CL_ASYNC
| CL_NOZERO
,
2985 (struct buf
*)0, &iostate
);
2988 * update the uio structure
2990 iov
->iov_base
+= io_size
;
2991 iov
->iov_len
-= io_size
;
2992 uio
->uio_resid
-= io_size
;
2993 uio
->uio_offset
+= io_size
;
2995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_END
,
2996 (int)upl
, (int)uio
->uio_offset
, (int)uio
->uio_resid
, retval
, 0);
3002 * make sure all async reads that are part of this stream
3003 * have completed before we return
3005 while (iostate
.io_issued
!= iostate
.io_completed
) {
3006 iostate
.io_wanted
= 1;
3007 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
3009 if (iostate
.io_error
)
3010 retval
= iostate
.io_error
;
3012 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_END
,
3013 (int)uio
->uio_offset
, (int)uio
->uio_resid
, 6, retval
, 0);
3020 cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
)
3027 upl_page_info_t
*pl
;
3029 vm_offset_t upl_offset
;
3035 int upl_needed_size
;
3040 struct clios iostate
;
3044 * When we enter this routine, we know
3045 * -- the resid will not exceed iov_len
3046 * -- the target address is physically contiguous
3051 max_size
= filesize
- uio
->uio_offset
;
3053 if (max_size
> (off_t
)((unsigned int)iov
->iov_len
))
3054 io_size
= iov
->iov_len
;
3058 upl_offset
= (vm_offset_t
)iov
->iov_base
& PAGE_MASK
;
3059 upl_needed_size
= upl_offset
+ io_size
;
3063 upl_size
= upl_needed_size
;
3064 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
3066 kret
= vm_map_get_upl(current_map(),
3067 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
3068 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0);
3070 if (kret
!= KERN_SUCCESS
) {
3072 * cluster_phys_read: failed to get pagelist
3076 if (upl_size
< upl_needed_size
) {
3078 * The upl_size wasn't satisfied.
3080 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3084 pl
= ubc_upl_pageinfo(upl
);
3086 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + ((addr64_t
)((u_int
)iov
->iov_base
& PAGE_MASK
));
3088 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
3091 head_size
= devblocksize
- (int)(uio
->uio_offset
& (devblocksize
- 1));
3093 if (head_size
> io_size
)
3094 head_size
= io_size
;
3096 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, devblocksize
, CL_READ
);
3099 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3103 upl_offset
+= head_size
;
3104 dst_paddr
+= head_size
;
3105 io_size
-= head_size
;
3107 tail_size
= io_size
& (devblocksize
- 1);
3108 io_size
-= tail_size
;
3110 iostate
.io_completed
= 0;
3111 iostate
.io_issued
= 0;
3112 iostate
.io_error
= 0;
3113 iostate
.io_wanted
= 0;
3115 while (io_size
&& error
== 0) {
3118 if (io_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3119 xsize
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3123 * request asynchronously so that we can overlap
3124 * the preparation of the next I/O... we'll do
3125 * the commit after all the I/O has completed
3126 * since its all issued against the same UPL
3127 * if there are already too many outstanding reads
3128 * wait until some have completed before issuing the next
3130 while ((iostate
.io_issued
- iostate
.io_completed
) > (2 * MAX_UPL_TRANSFER
* PAGE_SIZE
)) {
3131 iostate
.io_wanted
= 1;
3132 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_phys_read", 0);
3135 error
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, xsize
, 0,
3136 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
,
3137 (struct buf
*)0, &iostate
);
3139 * The cluster_io read was issued successfully,
3140 * update the uio structure
3143 uio
->uio_resid
-= xsize
;
3144 iov
->iov_len
-= xsize
;
3145 iov
->iov_base
+= xsize
;
3146 uio
->uio_offset
+= xsize
;
3148 upl_offset
+= xsize
;
3153 * make sure all async reads that are part of this stream
3154 * have completed before we proceed
3156 while (iostate
.io_issued
!= iostate
.io_completed
) {
3157 iostate
.io_wanted
= 1;
3158 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_phys_read", 0);
3160 if (iostate
.io_error
) {
3161 error
= iostate
.io_error
;
3163 if (error
== 0 && tail_size
)
3164 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, devblocksize
, CL_READ
);
3167 * just release our hold on the physically contiguous
3168 * region without changing any state
3170 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3177 * generate advisory I/O's in the largest chunks possible
3178 * the completed pages will be released into the VM cache
3181 advisory_read(vp
, filesize
, f_offset
, resid
, devblocksize
)
3188 upl_page_info_t
*pl
;
3190 vm_offset_t upl_offset
;
3204 if (!UBCINFOEXISTS(vp
))
3207 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
3208 (int)f_offset
, resid
, (int)filesize
, devblocksize
, 0);
3210 while (resid
&& f_offset
< filesize
&& retval
== 0) {
3212 * compute the size of the upl needed to encompass
3213 * the requested read... limit each call to cluster_io
3214 * to the maximum UPL size... cluster_io will clip if
3215 * this exceeds the maximum io_size for the device,
3216 * make sure to account for
3217 * a starting offset that's not page aligned
3219 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
3220 upl_f_offset
= f_offset
- (off_t
)start_offset
;
3221 max_size
= filesize
- f_offset
;
3223 if (resid
< max_size
)
3228 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3229 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3230 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3234 * return the number of contiguously present pages in the cache
3235 * starting at upl_f_offset within the file
3237 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
3241 * skip over pages already present in the cache
3243 io_size
= skip_range
- start_offset
;
3245 f_offset
+= io_size
;
3248 if (skip_range
== upl_size
)
3251 * have to issue some real I/O
3252 * at this point, we know it's starting on a page boundary
3253 * because we've skipped over at least the first page in the request
3256 upl_f_offset
+= skip_range
;
3257 upl_size
-= skip_range
;
3259 pages_in_upl
= upl_size
/ PAGE_SIZE
;
3261 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
3262 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3264 kret
= ubc_create_upl(vp
,
3269 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
);
3270 if (kret
!= KERN_SUCCESS
)
3275 * before we start marching forward, we must make sure we end on
3276 * a present page, otherwise we will be working with a freed
3279 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
3280 if (upl_page_present(pl
, last_pg
))
3283 pages_in_upl
= last_pg
+ 1;
3286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
3287 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3290 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
3292 * scan from the beginning of the upl looking for the first
3293 * page that is present.... this will become the first page in
3294 * the request we're going to make to 'cluster_io'... if all
3295 * of the pages are absent, we won't call through to 'cluster_io'
3297 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
3298 if (upl_page_present(pl
, start_pg
))
3303 * scan from the starting present page looking for an absent
3304 * page before the end of the upl is reached, if we
3305 * find one, then it will terminate the range of pages being
3306 * presented to 'cluster_io'
3308 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
3309 if (!upl_page_present(pl
, last_pg
))
3313 if (last_pg
> start_pg
) {
3315 * we found a range of pages that must be filled
3316 * if the last page in this range is the last page of the file
3317 * we may have to clip the size of it to keep from reading past
3318 * the end of the last physical block associated with the file
3320 upl_offset
= start_pg
* PAGE_SIZE
;
3321 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
3323 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
3324 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
3327 * issue an asynchronous read to cluster_io
3329 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
, devblocksize
,
3330 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
, (struct buf
*)0, (struct clios
*)0);
3336 ubc_upl_abort(upl
, 0);
3338 io_size
= upl_size
- start_offset
;
3340 if (io_size
> resid
)
3342 f_offset
+= io_size
;
3346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
3347 (int)f_offset
, resid
, retval
, 0, 0);
3359 if (!UBCINFOEXISTS(vp
) || (vp
->v_clen
== 0 && !(vp
->v_flag
& VHASDIRTY
)))
3362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_START
,
3363 vp
->v_flag
& VHASDIRTY
, vp
->v_clen
, 0, 0, 0);
3365 if (vp
->v_flag
& VHASDIRTY
) {
3366 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
3371 retval
= cluster_try_push(vp
, ubc_getsize(vp
), 0, 1);
3373 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_END
,
3374 vp
->v_flag
& VHASDIRTY
, vp
->v_clen
, retval
, 0, 0);
3387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 81)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3389 if (vp
->v_flag
& VHASDIRTY
) {
3390 vfs_drt_control(&(vp
->v_scmap
), 0);
3392 vp
->v_flag
&= ~VHASDIRTY
;
3394 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 81)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3399 cluster_try_push(vp
, EOF
, can_delay
, push_all
)
3411 struct v_cluster l_clusters
[MAX_CLUSTERS
];
3414 * make a local 'sorted' copy of the clusters
3415 * and clear vp->v_clen so that new clusters can
3418 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
3419 for (min_index
= -1, cl_index1
= 0; cl_index1
< vp
->v_clen
; cl_index1
++) {
3420 if (vp
->v_clusters
[cl_index1
].start_pg
== vp
->v_clusters
[cl_index1
].last_pg
)
3422 if (min_index
== -1)
3423 min_index
= cl_index1
;
3424 else if (vp
->v_clusters
[cl_index1
].start_pg
< vp
->v_clusters
[min_index
].start_pg
)
3425 min_index
= cl_index1
;
3427 if (min_index
== -1)
3429 l_clusters
[cl_index
].start_pg
= vp
->v_clusters
[min_index
].start_pg
;
3430 l_clusters
[cl_index
].last_pg
= vp
->v_clusters
[min_index
].last_pg
;
3432 vp
->v_clusters
[min_index
].start_pg
= vp
->v_clusters
[min_index
].last_pg
;
3437 if (can_delay
&& cl_len
== MAX_CLUSTERS
) {
		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with can_delay true...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above
		 * so we can just make a simple pass through, up to but not including the last one...
		 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
		 * are adjacent
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
3459 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
3460 if ((l_clusters
[i
].last_pg
- l_clusters
[i
].start_pg
) != MAX_UPL_TRANSFER
)
3462 if (l_clusters
[i
].last_pg
!= l_clusters
[i
+1].start_pg
)
3466 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
		/*
		 * try to push each cluster in turn... cluster_push_x may not
		 * push the cluster if can_delay is TRUE and the cluster doesn't
		 * meet the criteria for an immediate push
		 */
3472 if (cluster_push_x(vp
, EOF
, l_clusters
[cl_index
].start_pg
, l_clusters
[cl_index
].last_pg
, can_delay
)) {
3473 l_clusters
[cl_index
].start_pg
= 0;
3474 l_clusters
[cl_index
].last_pg
= 0;
3483 if (cl_len
> cl_pushed
) {
3485 * we didn't push all of the clusters, so
3486 * lets try to merge them back in to the vnode
3488 if ((MAX_CLUSTERS
- vp
->v_clen
) < (cl_len
- cl_pushed
)) {
3490 * we picked up some new clusters while we were trying to
3491 * push the old ones (I don't think this can happen because
3492 * I'm holding the lock, but just in case)... the sum of the
3493 * leftovers plus the new cluster count exceeds our ability
3494 * to represent them, so switch to the sparse cluster mechanism
3498 * first collect the new clusters sitting in the vp
3500 sparse_cluster_switch(vp
, EOF
);
3502 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
3503 if (l_clusters
[cl_index
].start_pg
== l_clusters
[cl_index
].last_pg
)
3505 vp
->v_clusters
[cl_index1
].start_pg
= l_clusters
[cl_index
].start_pg
;
3506 vp
->v_clusters
[cl_index1
].last_pg
= l_clusters
[cl_index
].last_pg
;
3511 * update the cluster count
3513 vp
->v_clen
= cl_index1
;
3516 * and collect the original clusters that were moved into the
3517 * local storage for sorting purposes
3519 sparse_cluster_switch(vp
, EOF
);
3523 * we've got room to merge the leftovers back in
3524 * just append them starting at the next 'hole'
3525 * represented by vp->v_clen
3527 for (cl_index
= 0, cl_index1
= vp
->v_clen
; cl_index
< cl_len
; cl_index
++) {
3528 if (l_clusters
[cl_index
].start_pg
== l_clusters
[cl_index
].last_pg
)
3531 vp
->v_clusters
[cl_index1
].start_pg
= l_clusters
[cl_index
].start_pg
;
3532 vp
->v_clusters
[cl_index1
].last_pg
= l_clusters
[cl_index
].last_pg
;
3537 * update the cluster count
3539 vp
->v_clen
= cl_index1
;
3542 return(MAX_CLUSTERS
- vp
->v_clen
);
3548 cluster_push_x(vp
, EOF
, first
, last
, can_delay
)
3555 upl_page_info_t
*pl
;
3557 vm_offset_t upl_offset
;
3570 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
3571 vp
->v_clen
, first
, last
, EOF
, 0);
3573 if ((pages_in_upl
= last
- first
) == 0) {
3574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
3578 upl_size
= pages_in_upl
* PAGE_SIZE
;
3579 upl_f_offset
= ((off_t
)first
) * PAGE_SIZE_64
;
3581 if (upl_f_offset
+ upl_size
>= EOF
) {
3583 if (upl_f_offset
>= EOF
) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
			 */
3589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
3593 size
= EOF
- upl_f_offset
;
3595 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3596 pages_in_upl
= upl_size
/ PAGE_SIZE
;
3600 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
3602 if (vp
->v_flag
& VNOCACHE_DATA
)
3603 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
3605 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
3607 kret
= ubc_create_upl(vp
,
3613 if (kret
!= KERN_SUCCESS
)
3614 panic("cluster_push: failed to get pagelist");
3616 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, (int)upl
, upl_f_offset
, 0, 0, 0);
3619 * since we only asked for the dirty pages back
3620 * it's possible that we may only get a few or even none, so...
3621 * before we start marching forward, we must make sure we know
3622 * where the last present page is in the UPL, otherwise we could
3623 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3624 * employed by commit_range and abort_range.
3626 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
3627 if (upl_page_present(pl
, last_pg
))
3630 pages_in_upl
= last_pg
+ 1;
3632 if (pages_in_upl
== 0) {
3633 ubc_upl_abort(upl
, 0);
3635 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
3639 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
3641 * find the next dirty page in the UPL
3642 * this will become the first page in the
3643 * next I/O to generate
3645 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
3646 if (upl_dirty_page(pl
, start_pg
))
3648 if (upl_page_present(pl
, start_pg
))
3650 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
3651 * just release these unchanged since we're not going
3652 * to steal them or change their state
3654 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
3656 if (start_pg
>= pages_in_upl
)
3658 * done... no more dirty pages to push
3661 if (start_pg
> last_pg
)
3663 * skipped over some non-dirty pages
3665 size
-= ((start_pg
- last_pg
) * PAGE_SIZE
);
3668 * find a range of dirty pages to write
3670 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
3671 if (!upl_dirty_page(pl
, last_pg
))
3674 upl_offset
= start_pg
* PAGE_SIZE
;
3676 io_size
= min(size
, (last_pg
- start_pg
) * PAGE_SIZE
);
3678 if (vp
->v_flag
& VNOCACHE_DATA
)
3679 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_ASYNC
| CL_DUMP
;
3681 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_ASYNC
;
3683 cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
, vp
->v_ciosiz
, io_flags
, (struct buf
*)0, (struct clios
*)0);
3687 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 3, 0, 0, 0);
3694 sparse_cluster_switch(struct vnode
*vp
, off_t EOF
)
3698 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 78)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3700 if ( !(vp
->v_flag
& VHASDIRTY
)) {
3701 vp
->v_flag
|= VHASDIRTY
;
3705 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
3710 for (start_pg
= vp
->v_clusters
[cl_index
].start_pg
; start_pg
< vp
->v_clusters
[cl_index
].last_pg
; start_pg
++) {
3712 if (ubc_page_op(vp
, (off_t
)(((off_t
)start_pg
) * PAGE_SIZE_64
), 0, 0, &flags
) == KERN_SUCCESS
) {
3713 if (flags
& UPL_POP_DIRTY
)
3714 sparse_cluster_add(vp
, EOF
, start_pg
, start_pg
+ 1);
3718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 78)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3723 sparse_cluster_push(struct vnode
*vp
, off_t EOF
, int push_all
)
3730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 79)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, push_all
, 0);
3733 vfs_drt_control(&(vp
->v_scmap
), 1);
3736 if (vfs_drt_get_cluster(&(vp
->v_scmap
), &offset
, &length
) != KERN_SUCCESS
) {
3737 vp
->v_flag
&= ~VHASDIRTY
;
3741 first
= (daddr_t
)(offset
/ PAGE_SIZE_64
);
3742 last
= (daddr_t
)((offset
+ length
) / PAGE_SIZE_64
);
3744 cluster_push_x(vp
, EOF
, first
, last
, 0);
3746 vp
->v_scdirty
-= (last
- first
);
3751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 79)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3756 sparse_cluster_add(struct vnode
*vp
, off_t EOF
, daddr_t first
, daddr_t last
)
3762 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 80)) | DBG_FUNC_START
, (int)vp
->v_scmap
, vp
->v_scdirty
, first
, last
, 0);
3764 offset
= (off_t
)first
* PAGE_SIZE_64
;
3765 length
= (last
- first
) * PAGE_SIZE
;
3767 while (vfs_drt_mark_pages(&(vp
->v_scmap
), offset
, length
, &new_dirty
) != KERN_SUCCESS
) {
3769 * no room left in the map
3770 * only a partial update was done
3771 * push out some pages and try again
3773 vp
->v_scdirty
+= new_dirty
;
3775 sparse_cluster_push(vp
, EOF
, 0);
3777 offset
+= (new_dirty
* PAGE_SIZE_64
);
3778 length
-= (new_dirty
* PAGE_SIZE
);
3780 vp
->v_scdirty
+= new_dirty
;
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 80)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3787 cluster_align_phys_io(struct vnode
*vp
, struct uio
*uio
, addr64_t usr_paddr
, int xsize
, int devblocksize
, int flags
)
3790 upl_page_info_t
*pl
;
3798 kret
= ubc_create_upl(vp
,
3799 uio
->uio_offset
& ~PAGE_MASK_64
,
3805 if (kret
!= KERN_SUCCESS
)
3808 if (!upl_valid_page(pl
, 0)) {
3810 * issue a synchronous read to cluster_io
3812 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
, devblocksize
,
3813 CL_READ
, (struct buf
*)0, (struct clios
*)0);
3815 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3820 ubc_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + (addr64_t
)(uio
->uio_offset
& PAGE_MASK_64
);
	/*
	 * NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);	/* Copy physical to physical and flush the source */
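	/*
	 * in the numeric forms above, 2 | 1 selects a physical source and a
	 * physical sink (cppvPsrc | cppvPsnk), while 4 requests a flush of the
	 * destination (cppvFsnk) and 8 a flush of the source (cppvFsrc)
	 */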
3835 if ( !(flags
& CL_READ
) || (upl_valid_page(pl
, 0) && upl_dirty_page(pl
, 0))) {
3837 * issue a synchronous write to cluster_io
3839 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
, devblocksize
,
3840 0, (struct buf
*)0, (struct clios
*)0);
3843 uio
->uio_offset
+= xsize
;
3844 iov
->iov_base
+= xsize
;
3845 iov
->iov_len
-= xsize
;
3846 uio
->uio_resid
-= xsize
;
3848 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3856 cluster_copy_upl_data(struct uio
*uio
, upl_t upl
, int upl_offset
, int xsize
)
3863 upl_page_info_t
*pl
;
3864 boolean_t funnel_state
= FALSE
;
3867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
3868 (int)uio
->uio_offset
, uio
->uio_resid
, upl_offset
, xsize
, 0);
3870 if (xsize
>= (16 * 1024))
3871 funnel_state
= thread_funnel_set(kernel_flock
, FALSE
);
3873 segflg
= uio
->uio_segflg
;
3878 case UIO_USERISPACE
:
3879 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
3883 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
3886 pl
= ubc_upl_pageinfo(upl
);
3888 pg_index
= upl_offset
/ PAGE_SIZE
;
3889 pg_offset
= upl_offset
& PAGE_MASK
;
3890 csize
= min(PAGE_SIZE
- pg_offset
, xsize
);
3892 while (xsize
&& retval
== 0) {
3895 paddr
= ((addr64_t
)upl_phys_page(pl
, pg_index
) << 12) + pg_offset
;
3897 retval
= uiomove64(paddr
, csize
, uio
);
3902 csize
= min(PAGE_SIZE
, xsize
);
3904 uio
->uio_segflg
= segflg
;
3906 if (funnel_state
== TRUE
)
3907 thread_funnel_set(kernel_flock
, TRUE
);
3909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3910 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, segflg
, 0);
3917 cluster_copy_ubc_data(struct vnode
*vp
, struct uio
*uio
, int *io_resid
, int mark_dirty
)
3925 memory_object_control_t control
;
3926 int op_flags
= UPL_POP_SET
| UPL_POP_BUSY
;
3927 boolean_t funnel_state
= FALSE
;
3930 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
3931 (int)uio
->uio_offset
, uio
->uio_resid
, 0, *io_resid
, 0);
3933 control
= ubc_getobject(vp
, UBC_FLAGS_NONE
);
3934 if (control
== MEMORY_OBJECT_CONTROL_NULL
) {
3935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3936 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, 3, 0);
3941 op_flags
|= UPL_POP_DIRTY
;
3943 segflg
= uio
->uio_segflg
;
3948 case UIO_USERISPACE
:
3949 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
3953 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
3956 io_size
= *io_resid
;
3957 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
3958 f_offset
= uio
->uio_offset
- start_offset
;
3959 xsize
= min(PAGE_SIZE
- start_offset
, io_size
);
3961 while (io_size
&& retval
== 0) {
3964 if (ubc_page_op_with_control(control
, f_offset
, op_flags
, &pgframe
, 0) != KERN_SUCCESS
)
3967 if (funnel_state
== FALSE
&& io_size
>= (16 * 1024))
3968 funnel_state
= thread_funnel_set(kernel_flock
, FALSE
);
3970 retval
= uiomove64((addr64_t
)(((addr64_t
)pgframe
<< 12) + start_offset
), xsize
, uio
);
3972 ubc_page_op_with_control(control
, f_offset
, UPL_POP_CLR
| UPL_POP_BUSY
, 0, 0);
3976 f_offset
= uio
->uio_offset
;
3977 xsize
= min(PAGE_SIZE
, io_size
);
3979 uio
->uio_segflg
= segflg
;
3980 *io_resid
= io_size
;
3982 if (funnel_state
== TRUE
)
3983 thread_funnel_set(kernel_flock
, TRUE
);
3985 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3986 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, 0x80000000 | segflg
, 0);
3993 is_file_clean(struct vnode
*vp
, off_t filesize
)
3997 int total_dirty
= 0;
3999 for (f_offset
= 0; f_offset
< filesize
; f_offset
+= PAGE_SIZE_64
) {
4000 if (ubc_page_op(vp
, f_offset
, 0, 0, &flags
) == KERN_SUCCESS
) {
4001 if (flags
& UPL_POP_DIRTY
) {
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256
/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
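/*
 * Worked example, assuming a 4096-byte PAGE_SIZE: one hashtable entry spans
 * DRT_BITVECTOR_PAGES * PAGE_SIZE == 256 * 4096 == 1 << 20 bytes, which is
 * why DRT_ADDRESS_MASK clears the low 20 bits of a file offset.
 */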
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)										\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = 0;						\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
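/*
 * Sanity arithmetic: dhe_control (8 bytes) plus a 256-bit bitvector
 * (256 / 32 words * 4 bytes == 32 bytes) gives the 40-byte entry quoted
 * above; 23 * 40 == 920 fits the 1024-byte small allocation (104 bytes
 * spare) and 401 * 40 == 16040 fits the 16384-byte large one (344 spare).
 */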
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
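/*
 * e.g. page 37 within an entry lives in word 37 / 32 == 1 at bit
 * 37 % 32 == 5, so DRT_HASH_SET_BIT ORs 0x20 into dhe_bitvector[1].
 */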
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};

#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
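/*
 * e.g. with the small modulus, a 1 MB-aligned offset of 0x2500000 hashes to
 * 0x2500000 % 23; on a collision the search walks the ring one slot at a
 * time via DRT_HASH_NEXT until a vacant or matching slot is found.
 */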
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length, ... */
								/* 1 (clean, no map) */
								/* 2 (map alloc fail) */
								/* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets,
								 * lastclean, iskips */
static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
			    u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
			    u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(void **private, u_int64_t offset,
			    u_int length, int *setcountp, int dirty);
static void		vfs_drt_trace(struct vfs_drt_clustermap *cmap,
			    int code, int arg1, int arg2, int arg3, int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
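/*
 * With the current table sizes that jump is 23 -> 401 buckets, i.e. roughly
 * a factor of 17 per resize.
 */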
4207 static kern_return_t
4208 vfs_drt_alloc_map(struct vfs_drt_clustermap
**cmapp
)
4210 struct vfs_drt_clustermap
*cmap
, *ocmap
;
4213 int nsize
, i
, active_buckets
, index
, copycount
;
4220 * Decide on the size of the new map.
4222 if (ocmap
== NULL
) {
4223 nsize
= DRT_HASH_SMALL_MODULUS
;
4225 /* count the number of active buckets in the old map */
4227 for (i
= 0; i
< ocmap
->scm_modulus
; i
++) {
4228 if (!DRT_HASH_VACANT(ocmap
, i
) &&
4229 (DRT_HASH_GET_COUNT(ocmap
, i
) != 0))
4233 * If we're currently using the small allocation, check to
4234 * see whether we should grow to the large one.
4236 if (ocmap
->scm_modulus
== DRT_HASH_SMALL_MODULUS
) {
4237 /* if the ring is nearly full */
4238 if (active_buckets
> (DRT_HASH_SMALL_MODULUS
- 5)) {
4239 nsize
= DRT_HASH_LARGE_MODULUS
;
4241 nsize
= DRT_HASH_SMALL_MODULUS
;
4244 /* already using the large modulus */
4245 nsize
= DRT_HASH_LARGE_MODULUS
;
4247 * If the ring is completely full, there's
4248 * nothing useful for us to do. Behave as
4249 * though we had compacted into the new
4252 if (active_buckets
>= DRT_HASH_LARGE_MODULUS
)
4253 return(KERN_SUCCESS
);
4258 * Allocate and initialise the new map.
4261 kret
= kmem_alloc(kernel_map
, (vm_offset_t
*)&cmap
,
4262 (nsize
== DRT_HASH_SMALL_MODULUS
) ? DRT_SMALL_ALLOCATION
: DRT_LARGE_ALLOCATION
);
4263 if (kret
!= KERN_SUCCESS
)
4265 cmap
->scm_magic
= DRT_SCM_MAGIC
;
4266 cmap
->scm_modulus
= nsize
;
4267 cmap
->scm_buckets
= 0;
4268 cmap
->scm_lastclean
= 0;
4269 cmap
->scm_iskips
= 0;
4270 for (i
= 0; i
< cmap
->scm_modulus
; i
++) {
4271 DRT_HASH_CLEAR(cmap
, i
);
4272 DRT_HASH_VACATE(cmap
, i
);
4273 DRT_BITVECTOR_CLEAR(cmap
, i
);
4277 * If there's an old map, re-hash entries from it into the new map.
4280 if (ocmap
!= NULL
) {
4281 for (i
= 0; i
< ocmap
->scm_modulus
; i
++) {
4282 /* skip empty buckets */
4283 if (DRT_HASH_VACANT(ocmap
, i
) ||
4284 (DRT_HASH_GET_COUNT(ocmap
, i
) == 0))
4287 offset
= DRT_HASH_GET_ADDRESS(ocmap
, i
);
4288 kret
= vfs_drt_get_index(&cmap
, offset
, &index
, 1);
4289 if (kret
!= KERN_SUCCESS
) {
4290 /* XXX need to bail out gracefully here */
4291 panic("vfs_drt: new cluster map mysteriously too small");
4294 DRT_HASH_COPY(ocmap
, i
, cmap
, index
);
4299 /* log what we've done */
4300 vfs_drt_trace(cmap
, DRT_DEBUG_ALLOC
, copycount
, 0, 0, 0);
4303 * It's important to ensure that *cmapp always points to
4304 * a valid map, so we must overwrite it before freeing
4308 if (ocmap
!= NULL
) {
4309 /* emit stats into trace buffer */
4310 vfs_drt_trace(ocmap
, DRT_DEBUG_SCMDATA
,
4313 ocmap
->scm_lastclean
,
4316 vfs_drt_free_map(ocmap
);
4318 return(KERN_SUCCESS
);
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
	    (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index, i, tries;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	int		index, i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant or unused? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
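
/*
 * Worked example for the per-entry arithmetic above (values illustrative
 * only): if offset sits 3 pages past its entry's aligned base and 10 pages
 * of length remain, then pgoff = 3 and
 * pgcount = min(10, DRT_BITVECTOR_PAGES - 3), so one pass through the loop
 * touches at most the remainder of that entry's bitvector before offset
 * and length advance to the next entry.
 */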
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}
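
/*
 * Minimal illustrative call from a hypothetical caller (the names 'priv',
 * 'file_offset' and 'setcount' are examples, not part of this interface):
 * dirtying one page-aligned page and reading back how many pages became
 * newly dirty would look like
 *
 *	void	*priv = NULL;
 *	int	setcount;
 *
 *	(void)vfs_drt_mark_pages(&priv, file_offset, PAGE_SIZE, &setcount);
 *
 * On first use priv starts out NULL and the map is allocated on demand.
 */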
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	int		index, i, j, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
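
/*
 * Illustrative drain loop for a hypothetical caller (the names 'priv',
 * 'offset', 'length' and the push step are examples only): dirty clusters
 * are typically consumed by calling vfs_drt_get_cluster() until it reports
 * KERN_FAILURE, at which point the private map has already been freed and
 * the stored pointer reset to NULL:
 *
 *	off_t	offset;
 *	u_int	length;
 *
 *	while (vfs_drt_get_cluster(&priv, &offset, &length) == KERN_SUCCESS) {
 *		// write back the pages in [offset, offset + length)
 *	}
 */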
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
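
/*
 * Usage note (a reading of the switch above, not a guarantee about all
 * callers): op_type 0 discards the private map entirely, as in
 * vfs_drt_control(&priv, 0) when the backing object is going away, while
 * op_type 1 only rewinds the clean-scan hint so a subsequent walk starts
 * from the beginning of the table.
 */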
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}