bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * Redistribution and use in source and binary forms, with or without
  28  * modification, are permitted provided that the following conditions
  29  * are met:
  30  * 1. Redistributions of source code must retain the above copyright
  31  *    notice, this list of conditions and the following disclaimer.
  32  * 2. Redistributions in binary form must reproduce the above copyright
  33  *    notice, this list of conditions and the following disclaimer in the
  34  *    documentation and/or other materials provided with the distribution.
  35  * 3. All advertising materials mentioning features or use of this software
  36  *    must display the following acknowledgement:
  37  *      This product includes software developed by the University of
  38  *      California, Berkeley and its contributors.
  39  * 4. Neither the name of the University nor the names of its contributors
  40  *    may be used to endorse or promote products derived from this software
  41  *    without specific prior written permission.
  42  *
  43  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  46  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53  * SUCH DAMAGE.
  54  *
  55  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  56  */
  57
  58 #include <sys/param.h>
  59 #include <sys/proc.h>
  60 #include <sys/buf.h>
  61 #include <sys/vnode.h>
  62 #include <sys/mount.h>
  63 #include <sys/trace.h>
  64 #include <sys/malloc.h>
  65 #include <sys/time.h>
  66 #include <sys/kernel.h>
  67 #include <sys/resourcevar.h>
  68 #include <libkern/libkern.h>
  69 #include <machine/machine_routines.h>
  70
  71 #include <sys/ubc.h>
  72 #include <vm/vm_pageout.h>
  73
  74 #include <mach/mach_types.h>
  75 #include <mach/memory_object_types.h>
  76
  77 #include <sys/kdebug.h>
  78
     /*
      * Request flags passed in the 'flags' argument of cluster_io().
      * The notes below describe only what the visible part of cluster_io()
      * does with each bit:
      *
      *   CL_READ        read request: bufs get B_READ and the read I/O
      *                  attributes are used; when clear the bufs get
      *                  B_WRITEINPROG and the write attributes are used
      *   CL_ASYNC       bufs get B_CALL | B_ASYNC with cluster_iodone() as
      *                  b_iodone; when clear cluster_io() biowait()s and
      *                  calls cluster_iodone() itself
      *   CL_COMMIT      bufs get B_COMMIT_UPL, which makes cluster_iodone()
      *                  commit or abort the UPL range
      *   CL_PAGEOUT     bufs get B_PAGEOUT
      *   CL_AGE         bufs get B_AGE
      *   CL_DUMP        bufs get B_NOCACHE
      *   CL_NOZERO      on a read that ends mid-page, do not record a
      *                  zero-fill offset for the tail of the last page
      *   CL_PAGEIN      bufs get B_PGIN
      *   CL_DEV_MEMORY  request is treated as one 'giant' page
      *   CL_PRESERVE    bufs get B_PHYS
      *   CL_THROTTLE    sleep before issuing while vp->v_numoutput is at or
      *                  above the throttle limit
      */
  79 #define CL_READ      0x01
  80 #define CL_ASYNC     0x02
  81 #define CL_COMMIT    0x04
  82 #define CL_PAGEOUT   0x10
  83 #define CL_AGE       0x20
  84 #define CL_DUMP      0x40
  85 #define CL_NOZERO    0x80
  86 #define CL_PAGEIN    0x100
  87 #define CL_DEV_MEMORY 0x200
  88 #define CL_PRESERVE   0x400
  89 #define CL_THROTTLE   0x800
  90
  91
     /*
      * Progress record for a stream of cluster I/Os issued on behalf of one
      * caller.  cluster_io() adds each buf's byte count to io_issued when
      * it is handed a clios pointer; cluster_iodone() adds the completed
      * transaction's total b_bcount to io_completed, latches the first
      * error in io_error, and, if io_wanted is set, clears it and issues a
      * wakeup on its address.
      */
  92 struct clios {
  93         u_int  io_completed;       /* amount of io that has currently completed */
  94         u_int  io_issued;          /* amount of io that was successfully issued */
  95         int    io_error;           /* error code of first error encountered */
  96         int    io_wanted;          /* someone is sleeping waiting for a change in state */
  97 };
  98
  99
     /*
      * Forward declarations of the file-local (static) routines, so they
      * can be referenced before their definitions appear.
      */
 100 static void cluster_zero(upl_t upl, vm_offset_t   upl_offset,
 101                 int size, struct buf *bp);
 102 static int cluster_read_x(struct vnode *vp, struct uio *uio,
 103                 off_t filesize, int devblocksize, int flags);
 104 static int cluster_write_x(struct vnode *vp, struct uio *uio,
 105                 off_t oldEOF, off_t newEOF, off_t headOff,
 106                 off_t tailOff, int devblocksize, int flags);
 107 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
 108                 off_t filesize, int devblocksize, int flags);
 109 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
 110                 off_t newEOF, int devblocksize, int flags);
 111 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
 112                 off_t filesize, int devblocksize, int flags);
 113 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
 114                 off_t newEOF, int devblocksize, int flags);
 115 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
 116                 addr64_t usr_paddr, int xsize, int devblocksize, int flags);
 117 static int cluster_push_x(struct vnode *vp, off_t EOF, unsigned int first, unsigned int last, int can_delay);
 118 static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);
 119
 120 static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
 121 static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
 122 static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);
 123
 124 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
 125 static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
 126 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 127 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 128
     /*
      * Non-static prototype declared locally rather than taken from a
      * header.  NOTE(review): presumably implemented in the UBC layer --
      * confirm where the definition lives.
      */
 129 int     ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
 130
 131
 132 /*
 133  * throttle the number of async writes that
 134  * can be outstanding on a single vnode
 135  * before we issue a synchronous write
      *
      * ASYNC_THROTTLE is the limit cluster_io() normally applies to
      * vp->v_numoutput for CL_THROTTLE requests; cluster_iodone() wakes
      * throttled writers once v_numoutput is at or below ASYNC_THROTTLE / 3.
      *
      * When cluster_hard_throttle_on() reports true for a non-pageout
      * request, cluster_io() uses HARD_THROTTLE_MAXCNT as the limit instead
      * and caps the per-I/O size at HARD_THROTTLE_MAXSIZE bytes.
 136  */
 137 #define ASYNC_THROTTLE  18
 138 #define HARD_THROTTLE_MAXCNT 1
 139 #define HARD_THROTTLE_MAXSIZE (64 * 1024)
 140
     /*
      * Inputs to cluster_hard_throttle_on(); neither is written anywhere in
      * the visible part of this file.
      *   hard_throttle_on_root            non-zero forces hard throttling
      *                                    for vnodes on the root device
      *   priority_IO_timestamp_for_root   timestamp compared against the
      *                                    current time to decide whether
      *                                    the throttle window is still open
      */
 141 int hard_throttle_on_root = 0;
 142 struct timeval priority_IO_timestamp_for_root;
 143
 144
     /*
      * cluster_hard_throttle_on
      *
      * Report whether I/O against 'vp' should use the hard throttle limits.
      * Only vnodes whose mount has MNTK_ROOTDEV set are considered.
      *
      * Returns 1 when hard_throttle_on_root is non-zero, or when the time
      * elapsed since priority_IO_timestamp_for_root is less than
      * hard_throttle_maxelapsed ({ 0, 300000 }: 0 seconds, 300000
      * microseconds).  Returns 0 in every other case, including for vnodes
      * that are not on the root device.
      */
 145 static int
 146 cluster_hard_throttle_on(vp)
 147         struct vnode *vp;
 148 {
 149         static struct timeval hard_throttle_maxelapsed = { 0, 300000 };
 150
 151         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 152                 struct timeval elapsed;
 153
 154                 if (hard_throttle_on_root)
 155                         return(1);
 156
                     /*
                      * elapsed = now - priority_IO_timestamp_for_root
                      * NOTE(review): 'time' is not declared in this block;
                      * presumably the kernel's current-time timeval -- confirm.
                      */
 157                 elapsed = time;
 158                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 159
 160                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 161                         return(1);
 162         }
 163         return(0);
 164 }
 165
 166
     /*
      * cluster_iodone
      *
      * Completion handler for a cluster I/O transaction: a chain of bufs
      * linked through b_trans_next, each with b_trans_head pointing at the
      * first buf.  cluster_io() installs it as b_iodone for CL_ASYNC
      * requests and calls it directly after biowait() for synchronous ones.
      *
      * If any buf in the chain is not yet B_DONE the routine returns 0 and
      * does nothing else.  Once every buf is done it:
      *   - collects the first error, the summed b_resid and summed b_bcount
      *   - hands every buf in the chain to free_io_buf()
      *   - zero-fills from b_validend to the end of that page, if non-zero
      *   - wakes writers sleeping on v_numoutput if the vnode is VTHROTTLED
      *   - updates the clios record attached to the head buf, if any
      *   - completes the caller's real buf when B_NEED_IODONE is set
      *   - commits or aborts the UPL range when B_COMMIT_UPL is set
      *
      * bp:      any buf belonging to the transaction
      * Returns: the first error recorded on the chain, EIO when no buf
      *          reported an error but a residual count was left, else 0.
      */
 167 static int
 168 cluster_iodone(bp)
 169         struct buf *bp;
 170 {
 171         int         b_flags;
 172         int         error;
 173         int         total_size;
 174         int         total_resid;
 175         int         upl_offset;
 176         int         zero_offset;
 177         upl_t       upl;
 178         struct buf *cbp;
 179         struct buf *cbp_head;
 180         struct buf *cbp_next;
 181         struct buf *real_bp;
 182         struct vnode *vp;
 183         struct clios *iostate;
 184         int         commit_size;
 185         int         pg_offset;
 186
 187
 188         cbp_head = (struct buf *)(bp->b_trans_head);
 189
 190         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 191                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 192
 193         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 194                 /*
 195                  * all I/O requests that are part of this transaction
 196                  * have to complete before we can process it
 197                  */
 198                 if ( !(cbp->b_flags & B_DONE)) {
 199
 200                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 201                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
 202
 203                         return 0;
 204                 }
 205         }
 206         error       = 0;
 207         total_size  = 0;
 208         total_resid = 0;
 209
             /*
              * snapshot everything needed from the head buf now; every buf
              * in the chain is handed to free_io_buf() in the loop below
              */
 210         cbp        = cbp_head;
 211         upl_offset = cbp->b_uploffset;
 212         upl        = cbp->b_pagelist;
 213         b_flags    = cbp->b_flags;
 214         real_bp    = cbp->b_real_bp;
 215         vp         = cbp->b_vp;
 216         zero_offset= cbp->b_validend;
 217         iostate    = (struct clios *)cbp->b_iostate;
 218
             /*
              * accumulate totals across the chain, keeping only the first
              * error seen, and release each buf as we go
              */
 219         while (cbp) {
 220                 if ((cbp->b_flags & B_ERROR) && error == 0)
 221                         error = cbp->b_error;
 222
 223                 total_resid += cbp->b_resid;
 224                 total_size  += cbp->b_bcount;
 225
 226                 cbp_next = cbp->b_trans_next;
 227
 228                 free_io_buf(cbp);
 229
 230                 cbp = cbp_next;
 231         }
             /*
              * cluster_io() stores in b_validend the UPL offset just past
              * the data of a read that ends mid-page; clear from there to
              * the end of that page
              */
 232         if (zero_offset)
 233                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 234
             /*
              * cluster_io() sets VTHROTTLED and sleeps on &vp->v_numoutput;
              * release those sleepers once the outstanding count is at or
              * below a third of ASYNC_THROTTLE
              */
 235         if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
 236                 vp->v_flag &= ~VTHROTTLED;
 237                 wakeup((caddr_t)&vp->v_numoutput);
 238         }
 239         if (iostate) {
 240                 /*
 241                  * someone has issued multiple I/Os asynchronously
 242                  * and is waiting for them to complete (streaming)
 243                  */
 244                 if (error && iostate->io_error == 0)
 245                         iostate->io_error = error;
 246
 247                 iostate->io_completed += total_size;
 248
 249                 if (iostate->io_wanted) {
 250                         /*
 251                          * someone is waiting for the state of
 252                          * this io stream to change
 253                          */
 254                         iostate->io_wanted = 0;
 255                         wakeup((caddr_t)&iostate->io_wanted);
 256                 }
 257         }
             /*
              * cluster_io() sets B_NEED_IODONE on the head buf when it was
              * given a real_bp; pass the error and residual on to that buf
              * and complete it
              */
 258         if ((b_flags & B_NEED_IODONE) && real_bp) {
 259                 if (error) {
 260                         real_bp->b_flags |= B_ERROR;
 261                         real_bp->b_error = error;
 262                 }
 263                 real_bp->b_resid = total_resid;
 264
 265                 biodone(real_bp);
 266         }
             /*
              * a short transfer with no explicit error is reported as EIO;
              * real_bp (if any) was already completed above, so this only
              * affects the UPL disposition below and the return value
              */
 267         if (error == 0 && total_resid)
 268                 error = EIO;
 269
 270         if (b_flags & B_COMMIT_UPL) {
                     /*
                      * widen the range to whole pages: it starts at the page
                      * containing upl_offset and its length is rounded up to
                      * a multiple of PAGE_SIZE
                      */
 271                 pg_offset   = upl_offset & PAGE_MASK;
 272                 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 273
 274                 if (error || (b_flags & B_NOCACHE)) {
 275                         int upl_abort_code;
 276
                             /*
                              * pageout whose error is not ENXIO (this includes
                              * the no-error B_NOCACHE case): FREE_ON_EMPTY only
                              * pagein:        add UPL_ABORT_ERROR
                              * anything else: add UPL_ABORT_DUMP_PAGES
                              */
 277                         if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
 278                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 279                         else if (b_flags & B_PGIN)
 280                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 281                         else
 282                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 283
 284                         ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
 285                                         upl_abort_code);
 286
                             /* 0x80000000 marks the trace record as an abort */
 287                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 288                                      (int)upl, upl_offset - pg_offset, commit_size,
 289                                      0x80000000|upl_abort_code, 0);
 290
 291                 } else {
 292                         int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 293
                             /*
                              * B_PHYS:  reads add UPL_COMMIT_SET_DIRTY, writes
                              *          leave the dirty state alone
                              * others:  everything except a pageout adds
                              *          UPL_COMMIT_CLEAR_DIRTY
                              */
 294                         if (b_flags & B_PHYS) {
 295                                 if (b_flags & B_READ)
 296                                         upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
 297                         } else if ( !(b_flags & B_PAGEOUT))
 298                                 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
 299
 300                         if (b_flags & B_AGE)
 301                                 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 302
 303                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
 304                                         upl_commit_flags);
 305
 306                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 307                                      (int)upl, upl_offset - pg_offset, commit_size,
 308                                      upl_commit_flags, 0);
 309                 }
 310         } else
                     /* the UPL is left untouched here; only the trace record is emitted */
 311                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 312                              (int)upl, upl_offset, 0, error, 0);
 313
 314         return (error);
 315 }
 316
 317
     /*
      * cluster_zero
      *
      * Zero 'size' bytes starting at byte offset 'upl_offset'.
      *
      * When 'bp' is NULL or has no b_data, the bytes are cleared through
      * the physical pages listed in the UPL's page info, one chunk per
      * page so that no chunk crosses a page boundary.  Otherwise the range
      * is cleared with bzero() at bp->b_data + upl_offset.
      *
      * upl:        UPL whose pages are zeroed on the physical path
      * upl_offset: starting byte offset
      * size:       number of bytes to clear
      *             NOTE(review): the physical-path loop ends only when
      *             size reaches exactly 0 -- assumes size >= 0, TODO confirm
      * bp:         optional buf that selects the bzero() path via b_data
      */
 318 static void
 319 cluster_zero(upl, upl_offset, size, bp)
 320         upl_t         upl;
 321         vm_offset_t   upl_offset;
 322         int           size;
 323         struct buf   *bp;
 324 {
 325         upl_page_info_t *pl;
 326
 327         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 328                      upl_offset, size, (int)bp, 0, 0);
 329
 330         if (bp == NULL || bp->b_data == NULL) {
 331
 332                 pl = ubc_upl_pageinfo(upl);
 333
 334                 while (size) {
 335                         int           page_offset;
 336                         int           page_index;
 337                         addr64_t      zero_addr;
 338                         int           zero_cnt;
 339
                             /* locate the page and the offset within it */
 340                         page_index  = upl_offset / PAGE_SIZE;
 341                         page_offset = upl_offset & PAGE_MASK;
 342
                             /*
                              * NOTE(review): the shift by 12 hard-codes a 4 KB
                              * page while the rest of this loop uses PAGE_SIZE
                              * and PAGE_MASK -- confirm they always agree.
                              */
 343                         zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
                             /* clear no further than the end of this page */
 344                         zero_cnt  = min(PAGE_SIZE - page_offset, size);
 345
 346                         bzero_phys(zero_addr, zero_cnt);
 347
 348                         size       -= zero_cnt;
 349                         upl_offset += zero_cnt;
 350                 }
 351         } else
 352                 bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);
 353
             /*
              * on the physical path upl_offset and size have been updated by
              * the loop, so the values traced here are the final ones
              */
 354         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 355                      upl_offset, size, 0, 0, 0);
 356 }
 357
 358 static int
 359 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
 360         struct vnode *vp;
 361         upl_t         upl;
 362         vm_offset_t   upl_offset;
 363         off_t         f_offset;
 364         int           non_rounded_size;
 365         int           devblocksize;
 366         int           flags;
 367         struct buf   *real_bp;
 368         struct clios *iostate;
 369 {
 370         struct buf   *cbp;
 371         u_int         size;
 372         u_int         io_size;
 373         int           io_flags;
 374         int           error = 0;
 375         int           retval = 0;
 376         struct buf   *cbp_head = 0;
 377         struct buf   *cbp_tail = 0;
 378         int buf_count = 0;
 379         int pg_count;
 380         int pg_offset;
 381         u_int max_iosize;
 382         u_int max_vectors;
 383         int priv;
 384         int zero_offset = 0;
 385         int async_throttle;
 386
 387         if (devblocksize)
 388                 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
 389         else
 390                 size = non_rounded_size;
 391
 392         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
 393                      (int)f_offset, size, upl_offset, flags, 0);
 394
 395
 396         if (flags & CL_READ) {
 397                 io_flags = (B_VECTORLIST | B_READ);
 398
 399                 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
 400         } else {
 401                 io_flags = (B_VECTORLIST | B_WRITEINPROG);
 402
 403                 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
 404         }
 405         /*
 406          * make sure the maximum iosize are at least the size of a page
 407          * and that they are multiples of the page size
 408          */
 409         max_iosize  &= ~PAGE_MASK;
 410
 411         if (flags & CL_THROTTLE) {
 412                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 413                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 414                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 415                         async_throttle = HARD_THROTTLE_MAXCNT;
 416                 } else
 417                         async_throttle = ASYNC_THROTTLE;
 418         }
 419         if (flags & CL_AGE)
 420                 io_flags |= B_AGE;
 421         if (flags & CL_DUMP)
 422                 io_flags |= B_NOCACHE;
 423         if (flags & CL_PAGEIN)
 424                 io_flags |= B_PGIN;
 425         if (flags & CL_PAGEOUT)
 426                 io_flags |= B_PAGEOUT;
 427         if (flags & CL_COMMIT)
 428                 io_flags |= B_COMMIT_UPL;
 429         if (flags & CL_PRESERVE)
 430                 io_flags |= B_PHYS;
 431
 432         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 433                 /*
 434                  * then we are going to end up
 435                  * with a page that we can't complete (the file size wasn't a multiple
 436                  * of PAGE_SIZE and we're trying to read to the end of the file
 437                  * so we'll go ahead and zero out the portion of the page we can't
 438                  * read in from the file
 439                  */
 440                 zero_offset = upl_offset + non_rounded_size;
 441         }
 442         while (size) {
 443                 int vsize;
 444                 int i;
 445                 int pg_resid;
 446                 int num_contig;
 447                 daddr_t lblkno;
 448                 daddr_t blkno;
 449
 450                 if (size > max_iosize)
 451                         io_size = max_iosize;
 452                 else
 453                         io_size = size;
 454
 455                 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
 456                         if (error == EOPNOTSUPP)
 457                                 panic("VOP_CMAP Unimplemented");
 458                         break;
 459                 }
 460
 461                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 462                              (int)f_offset, (int)blkno, io_size, zero_offset, 0);
 463
 464                 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
 465                         if (flags & CL_PAGEOUT) {
 466                                 error = EINVAL;
 467                                 break;
 468                         };
 469
 470                         /* Try paging out the page individually before
 471                            giving up entirely and dumping it (it could
 472                            be mapped in a "hole" and require allocation
 473                            before the I/O:
 474                          */
 475                          ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
 476                          if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
 477                                 error = EINVAL;
 478                                 break;
 479                          };
 480
 481                         f_offset   += PAGE_SIZE_64;
 482                         upl_offset += PAGE_SIZE;
 483                         size       -= PAGE_SIZE;
 484                         continue;
 485                 }
 486                 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
 487                 /*
 488                  * we have now figured out how much I/O we can do - this is in 'io_size'
 489                  * pg_offset is the starting point in the first page for the I/O
 490                  * pg_count is the number of full and partial pages that 'io_size' encompasses
 491                  */
 492                 pg_offset = upl_offset & PAGE_MASK;
 493
 494                 if (flags & CL_DEV_MEMORY) {
 495                         /*
 496                          * currently, can't deal with reading 'holes' in file
 497                          */
 498                         if ((long)blkno == -1) {
 499                                 error = EINVAL;
 500                                 break;
 501                         }
 502                         /*
 503                          * treat physical requests as one 'giant' page
 504                          */
 505                         pg_count = 1;
 506                 } else
 507                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
 508
 509                 if ((flags & CL_READ) && (long)blkno == -1) {
 510                         int bytes_to_zero;
 511
 512                         /*
 513                          * if we're reading and blkno == -1, then we've got a
 514                          * 'hole' in the file that we need to deal with by zeroing
 515                          * out the affected area in the upl
 516                          */
 517                         if (zero_offset && io_size == size) {
 518                                 /*
 519                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 520                                  * than 'zero_offset' will be non-zero
 521                                  * if the 'hole' returned by VOP_CMAP extends all the way to the eof
 522                                  * (indicated by the io_size finishing off the I/O request for this UPL)
 523                                  * than we're not going to issue an I/O for the
 524                                  * last page in this upl... we need to zero both the hole and the tail
 525                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
 526                                  */
 527                                 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
 528
 529                                 zero_offset = 0;
 530                         } else
 531                                 bytes_to_zero = io_size;
 532
 533                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
 534
 535                         if (cbp_head)
 536                                 /*
 537                                  * if there is a current I/O chain pending
 538                                  * then the first page of the group we just zero'd
 539                                  * will be handled by the I/O completion if the zero
 540                                  * fill started in the middle of the page
 541                                  */
 542                                 pg_count = (io_size - pg_offset) / PAGE_SIZE;
 543                         else {
 544                                 /*
 545                                  * no pending I/O to pick up that first page
 546                                  * so, we have to make sure it gets committed
 547                                  * here.
 548                                  * set the pg_offset to 0 so that the upl_commit_range
 549                                  * starts with this page
 550                                  */
 551                                 pg_count = (io_size + pg_offset) / PAGE_SIZE;
 552                                 pg_offset = 0;
 553                         }
 554                         if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
 555                                 /*
 556                                  * if we're done with the request for this UPL
 557                                  * then we have to make sure to commit the last page
 558                                  * even if we only partially zero-filled it
 559                                  */
 560                                 pg_count++;
 561
 562                         if (pg_count) {
 563                                 if (pg_offset)
 564                                         pg_resid = PAGE_SIZE - pg_offset;
 565                                 else
 566                                         pg_resid = 0;
 567
 568                                 if (flags & CL_COMMIT)
 569                                         ubc_upl_commit_range(upl,
 570                                                         (upl_offset + pg_resid) & ~PAGE_MASK,
 571                                                         pg_count * PAGE_SIZE,
 572                                                         UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
 573                         }
 574                         upl_offset += io_size;
 575                         f_offset   += io_size;
 576                         size       -= io_size;
 577
 578                         if (cbp_head && pg_count)
 579                                 goto start_io;
 580                         continue;
 581
 582                 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
 583                         real_bp->b_blkno = blkno;
 584                 }
 585
 586                 if (pg_count > max_vectors) {
 587                         io_size -= (pg_count - max_vectors) * PAGE_SIZE;
 588
 589                         if (io_size < 0) {
 590                                 io_size = PAGE_SIZE - pg_offset;
 591                                 pg_count = 1;
 592                         } else
 593                                 pg_count = max_vectors;
 594                 }
 595
 596                 if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
 597                         /*
 598                          * if we're not targeting a virtual device i.e. a disk image
 599                          * it's safe to dip into the reserve pool since real devices
 600                          * can complete this I/O request without requiring additional
 601                          * bufs from the alloc_io_buf pool
 602                          */
 603                         priv = 1;
 604                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
 605                         /*
 606                          * Throttle the speculative IO
 607                          */
 608                         priv = 0;
 609                 else
 610                         priv = 1;
 611
 612                 cbp = alloc_io_buf(vp, priv);
 613
 614
 615                 if (flags & CL_PAGEOUT) {
 616                         for (i = 0; i < pg_count; i++) {
 617                                 int         s;
 618                                 struct buf *bp;
 619
 620                                 s = splbio();
 621                                 if (bp = incore(vp, lblkno + i)) {
 622                                         if (!ISSET(bp->b_flags, B_BUSY)) {
 623                                                 bremfree(bp);
 624                                                 SET(bp->b_flags, (B_BUSY | B_INVAL));
 625                                                 splx(s);
 626                                                 brelse(bp);
 627                                         } else
 628                                                 panic("BUSY bp found in cluster_io");
 629                                 }
 630                                 splx(s);
 631                         }
 632                 }
 633                 if (flags & CL_ASYNC) {
 634                         cbp->b_flags |= (B_CALL | B_ASYNC);
 635                         cbp->b_iodone = (void *)cluster_iodone;
 636                 }
 637                 cbp->b_flags |= io_flags;
 638
 639                 cbp->b_lblkno = lblkno;
 640                 cbp->b_blkno  = blkno;
 641                 cbp->b_bcount = io_size;
 642                 cbp->b_pagelist  = upl;
 643                 cbp->b_uploffset = upl_offset;
 644                 cbp->b_trans_next = (struct buf *)0;
 645
 646                 if (cbp->b_iostate = (void *)iostate)
 647                         /*
 648                          * caller wants to track the state of this
 649                          * io... bump the amount issued against this stream
 650                          */
 651                         iostate->io_issued += io_size;
 652
 653                 if (flags & CL_READ)
 654                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
 655                                      cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
 656                 else
 657                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
 658                                      cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
 659
 660                 if (cbp_head) {
 661                         cbp_tail->b_trans_next = cbp;
 662                         cbp_tail = cbp;
 663                 } else {
 664                         cbp_head = cbp;
 665                         cbp_tail = cbp;
 666                 }
 667                 (struct buf *)(cbp->b_trans_head) = cbp_head;
 668                 buf_count++;
 669
 670                 upl_offset += io_size;
 671                 f_offset   += io_size;
 672                 size       -= io_size;
 673
 674                 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
 675                         /*
 676                          * if we have no more I/O to issue or
 677                          * the current I/O we've prepared fully
 678                          * completes the last page in this request
 679                          * and it's either an ASYNC request or
 680                          * we've already accumulated more than 8 I/O's into
 681                          * this transaction and it's not an I/O directed to
 682                          * special DEVICE memory
 683                          * then go ahead and issue the I/O
 684                          */
 685 start_io:
 686                         if (real_bp) {
 687                                 cbp_head->b_flags |= B_NEED_IODONE;
 688                                 cbp_head->b_real_bp = real_bp;
 689                         } else
 690                                 cbp_head->b_real_bp = (struct buf *)NULL;
 691
 692                         if (size == 0) {
 693                                 /*
 694                                  * we're about to issue the last I/O for this upl
 695                                  * if this was a read to the eof and the eof doesn't
 696                                  * finish on a page boundary, then we need to zero-fill
 697                                  * the rest of the page....
 698                                  */
 699                                 cbp_head->b_validend = zero_offset;
 700                         } else
 701                                 cbp_head->b_validend = 0;
 702
 703                         if (flags & CL_THROTTLE) {
 704                                 while (vp->v_numoutput >= async_throttle) {
 705                                         vp->v_flag |= VTHROTTLED;
 706                                         tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
 707                                 }
 708                         }
 709                         for (cbp = cbp_head; cbp;) {
 710                                 struct buf * cbp_next;
 711
 712                                 if (io_flags & B_WRITEINPROG)
 713                                         cbp->b_vp->v_numoutput++;
 714
 715                                 cbp_next = cbp->b_trans_next;
 716
 717                                 (void) VOP_STRATEGY(cbp);
 718                                 cbp = cbp_next;
 719                         }
 720                         if ( !(flags & CL_ASYNC)) {
 721                                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 722                                         biowait(cbp);
 723
 724                                 if (error = cluster_iodone(cbp_head)) {
 725                                         if ((flags & CL_PAGEOUT) && (error == ENXIO))
 726                                                 retval = 0;     /* drop the error */
 727                                         else
 728                                                 retval = error;
 729                                         error  = 0;
 730                                 }
 731                         }
 732                         cbp_head = (struct buf *)0;
 733                         cbp_tail = (struct buf *)0;
 734
 735                         buf_count = 0;
 736                 }
 737         }
 738         if (error) {
 739                 int abort_size;
 740
 741                 io_size = 0;
 742
 743                 for (cbp = cbp_head; cbp;) {
 744                         struct buf * cbp_next;
 745
 746                         upl_offset -= cbp->b_bcount;
 747                         size       += cbp->b_bcount;
 748                         io_size    += cbp->b_bcount;
 749
 750                         cbp_next = cbp->b_trans_next;
 751                         free_io_buf(cbp);
 752                         cbp = cbp_next;
 753                 }
 754                 if (iostate) {
 755                         /*
 756                          * update the error condition for this stream
 757                          * since we never really issued the io
 758                          * just go ahead and adjust it back
 759                          */
 760                         if (iostate->io_error == 0)
 761                                 iostate->io_error = error;
 762                         iostate->io_issued -= io_size;
 763
 764                         if (iostate->io_wanted) {
 765                                 /*
 766                                  * someone is waiting for the state of
 767                                  * this io stream to change
 768                                  */
 769                                 iostate->io_wanted = 0;
 770                                 wakeup((caddr_t)&iostate->io_wanted);
 771                         }
 772                 }
 773                 pg_offset  = upl_offset & PAGE_MASK;
 774                 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 775
 776                 if (flags & CL_COMMIT) {
 777                         int upl_abort_code;
 778
 779                         if (flags & CL_PRESERVE) {
 780                                 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
 781                                                      UPL_COMMIT_FREE_ON_EMPTY);
 782                         } else {
 783                                 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
 784                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 785                                 else if (flags & CL_PAGEIN)
 786                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 787                                 else
 788                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 789
 790                                 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
 791                                                 upl_abort_code);
 792                         }
 793                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
 794                                      (int)upl, upl_offset - pg_offset, abort_size, error, 0);
 795                 }
 796                 if (real_bp) {
 797                         real_bp->b_flags |= B_ERROR;
 798                         real_bp->b_error  = error;
 799
 800                         biodone(real_bp);
 801                 }
 802                 if (retval == 0)
 803                         retval = error;
 804         }
 805         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
 806                      (int)f_offset, size, upl_offset, retval, 0);
 807
 808         return (retval);
 809 }
 810
 811
     /*
      * cluster_rd_prefetch: hand an advisory read-ahead request to advisory_read().
      *
      * vp            vnode to prefetch from
      * f_offset      file offset at which the prefetch starts
      * size          requested length in bytes; capped at MAX_UPL_TRANSFER pages,
      *               otherwise rounded up to a page multiple, then clipped so the
      *               request does not run past filesize
      * filesize      current size of the file
      * devblocksize  passed through to advisory_read() unchanged
      *
      * Returns the number of pages spanned by the (clipped) request, or 0 when
      * f_offset is at or beyond filesize, in which case nothing is issued.
      * The return value of advisory_read() is not examined.
      */
 812 static int
 813 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
 814         struct vnode *vp;
 815         off_t         f_offset;
 816         u_int         size;
 817         off_t         filesize;
 818         int           devblocksize;
 819 {
 820         int           pages_in_prefetch;
 821
 822         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
 823                      (int)f_offset, size, (int)filesize, 0, 0);
 824
             /* nothing to prefetch when starting at or past the end of the file */
 825         if (f_offset >= filesize) {
 826                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
 827                              (int)f_offset, 0, 0, 0, 0);
 828                 return(0);
 829         }
             /* cap the request at MAX_UPL_TRANSFER pages, else round it up to whole pages */
 830         if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
 831                 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
 832         else
 833                 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 834
             /* don't ask for bytes beyond the end of the file */
 835         if ((off_t)size > (filesize - f_offset))
 836                 size = filesize - f_offset;
             /* after the EOF clip size may end mid-page; that partial page still counts */
 837         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
 838
 839         advisory_read(vp, filesize, f_offset, size, devblocksize);
 840
 841         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
 842                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
 843
 844         return (pages_in_prefetch);
 845 }
 846
 847
 848
     /*
      * cluster_rd_ahead: decide whether the read of blocks b_lblkno..e_lblkno
      * should trigger a read-ahead and, if so, issue it via cluster_rd_prefetch().
      *
      * b_lblkno / e_lblkno are in units of pages (the read-ahead file offset
      * below is computed as block * PAGE_SIZE_64).  Per-vnode state involved:
      *   v_lastr  - only read here (cluster_pagein, for one, stores the last
      *              page of a successful pagein in it)
      *   v_ralen  - current read-ahead length in pages; reset or grown here
      *   v_maxra  - last block covered by an issued read-ahead; updated here
      *
      * The fourth argument of each DBG_FUNC_END trace (0..4) identifies which
      * exit path was taken.
      */
 849 static void
 850 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
 851         struct vnode *vp;
 852         daddr_t       b_lblkno;
 853         daddr_t       e_lblkno;
 854         off_t         filesize;
 855         int           devblocksize;
 856 {
 857         daddr_t       r_lblkno;
 858         off_t         f_offset;
 859         int           size_of_prefetch;
 860
 861         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
 862                      b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
 863
             /* single-block read of the same block as v_lastr: leave the read-ahead state alone */
 864         if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
 865                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 866                              vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
 867                 return;
 868         }
             /*
              * v_lastr is -1 (presumably "nothing read yet" - confirm), or this
              * read starts neither at v_lastr nor at v_lastr + 1 and does not
              * pick up immediately after an active read-ahead window
              * (v_maxra + 1 with a non-zero v_ralen): reset the read-ahead state
              */
 869         if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
 870                                  (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
 871                 vp->v_ralen = 0;
 872                 vp->v_maxra = 0;
 873
 874                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 875                              vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
 876
 877                 return;
 878         }
             /*
              * this read ends more than MAX_UPL_TRANSFER / 4 blocks short of
              * v_maxra: enough has been read ahead already, issue nothing now
              */
 879         if (e_lblkno < vp->v_maxra) {
 880                 if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {
 881
 882                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 883                                      vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
 884                         return;
 885                 }
 886         }
             /* first block beyond both this read and any earlier read-ahead */
 887         r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
 888         f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
 889
 890         size_of_prefetch = 0;
 891
             /*
              * probe just that one page with UPL_ROP_PRESENT; a non-zero result
              * makes us skip the read-ahead (presumably the page is already
              * resident - confirm against ubc_range_op)
              */
 892         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
 893
 894         if (size_of_prefetch) {
 895                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 896                              vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
 897                 return;
 898         }
 899         if (f_offset < filesize) {
                     /* grow the window: start at 1 page, then double, capped at MAX_UPL_TRANSFER */
 900                 vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;
 901
                     /* never read ahead less than the size of the request that got us here (same cap) */
 902                 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
 903                         vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);
 904
 905                 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
 906
                     /* cluster_rd_prefetch returns a page count; record the last block it covered */
 907                 if (size_of_prefetch)
 908                         vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
 909         }
 910         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 911                      vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
 912 }
 913
     /*
      * cluster_pageout: validate a pageout request, clip it to the end of the
      * file and pass it to cluster_io().
      *
      * vp            vnode being paged out to
      * upl           upl handed on to cluster_io() (and aborted on the error paths)
      * upl_offset    offset into the upl at which the request starts
      * f_offset      file offset of the request; must be >= 0, page aligned and
      *               below filesize
      * size          length of the request in bytes; must be a positive multiple
      *               of PAGE_SIZE
      * filesize      current size of the file; the I/O is clipped to it
      * devblocksize  passed through to cluster_io()
      * flags         UPL_IOSYNC clear  -> CL_ASYNC is added
      *               UPL_NOCOMMIT clear -> CL_COMMIT is added; without CL_COMMIT
      *               this routine never aborts any part of the upl itself
      *
      * Returns EINVAL for a bad size or offset, EROFS when the mount is
      * read-only, otherwise the value returned by cluster_io().
      * Side effect: sets VHASBEENPAGED in vp->v_flag once the request has
      * passed validation.
      *
      * NOTE(review): unlike cluster_pagein, upl_offset is not checked for page
      * alignment here - confirm that is intentional.
      */
 914 int
 915 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
 916         struct vnode *vp;
 917         upl_t         upl;
 918         vm_offset_t   upl_offset;
 919         off_t         f_offset;
 920         int           size;
 921         off_t         filesize;
 922         int           devblocksize;
 923         int           flags;
 924 {
 925         int           io_size;
 926         int           rounded_size;
 927         off_t         max_size;
 928         int           local_flags;
 929
 930         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
 931                 /*
 932                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
 933                  * then we don't want to enforce this throttle... if we do, we can
 934                  * potentially deadlock since we're stalling the pageout thread at a time
 935                  * when the disk image might need additional memory (which won't be available
 936                  * if the pageout thread can't run)... instead we'll just depend on the throttle
 937                  * that the pageout thread now has in place to deal with external files
 938                  */
 939                 local_flags = CL_PAGEOUT;
 940         else
 941                 local_flags = CL_PAGEOUT | CL_THROTTLE;
 942
             /* translate the caller's UPL_* flags into cluster_io's CL_* flags */
 943         if ((flags & UPL_IOSYNC) == 0)
 944                 local_flags |= CL_ASYNC;
 945         if ((flags & UPL_NOCOMMIT) == 0)
 946                 local_flags |= CL_COMMIT;
 947
 948
 949         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
 950                      (int)f_offset, size, (int)filesize, local_flags, 0);
 951
 952         /*
 953          * If they didn't specify any I/O, then we are done...
 954          * we can't issue an abort because we don't know how
 955          * big the upl really is
 956          */
 957         if (size <= 0)
 958                 return (EINVAL);
 959
             /* refuse to write to a read-only mount; release the pages if we own the commit */
 960         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
 961                 if (local_flags & CL_COMMIT)
 962                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
 963                 return (EROFS);
 964         }
 965         /*
 966          * can't page-out to a negative offset
 967          * or if we're starting beyond the EOF
 968          * or if the file offset isn't page aligned
 969          * or the size requested isn't a multiple of PAGE_SIZE
 970          */
 971         if (f_offset < 0 || f_offset >= filesize ||
 972            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
 973                 if (local_flags & CL_COMMIT)
 974                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
 975                 return (EINVAL);
 976         }
             /* never issue I/O past the end of the file; io_size may therefore end mid-page */
 977         max_size = filesize - f_offset;
 978
 979         if (size < max_size)
 980                 io_size = size;
 981         else
 982                 io_size = max_size;
 983
 984         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 985
             /* pages of the request lying wholly beyond the page-rounded I/O size are aborted here */
 986         if (size > rounded_size) {
 987                 if (local_flags & CL_COMMIT)
 988                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
 989                                         UPL_ABORT_FREE_ON_EMPTY);
 990         }
             /* cluster_write tests this flag and pushes its delayed data when it is set */
 991         vp->v_flag |= VHASBEENPAGED;
 992
 993         return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
 994                            local_flags, (struct buf *)0, (struct clios *)0));
 995 }
 996
     /*
      * cluster_pagein: validate a pagein request, clip it to the end of the
      * file, pass it to cluster_io() as a read and, on success, update the
      * vnode's sequential-read state.
      *
      * vp            vnode being paged in from
      * upl           upl handed on to cluster_io(); must not be NULL (panics)
      * upl_offset    offset into the upl at which the request starts; must be
      *               page aligned
      * f_offset      file offset of the request; must be >= 0, page aligned and
      *               below filesize
      * size          length of the request in bytes; must be a multiple of
      *               PAGE_SIZE (a negative size panics)
      * filesize      current size of the file; the I/O is clipped to it
      * devblocksize  passed through to cluster_io() and cluster_rd_ahead()
      * flags         UPL_IOSYNC clear   -> CL_ASYNC is added
      *               UPL_NOCOMMIT clear -> CL_COMMIT is added
      *               UPL_NORDAHEAD      -> cluster_rd_ahead() is not called
      *
      * Returns EINVAL for a bad offset or size, otherwise the value returned by
      * cluster_io().  When that value is 0, vp->v_lastr is set to the last page
      * of the request.
      *
      * NOTE(review): size == 0 is not rejected here (cluster_pageout returns
      * EINVAL for it); confirm cluster_io tolerates a zero-length request.
      * NOTE(review): the panic below also fires for size < 0 although its
      * message only mentions a NULL upl.
      */
 997 int
 998 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
 999         struct vnode *vp;
1000         upl_t         upl;
1001         vm_offset_t   upl_offset;
1002         off_t         f_offset;
1003         int           size;
1004         off_t         filesize;
1005         int           devblocksize;
1006         int           flags;
1007 {
1008         u_int         io_size;
1009         int           rounded_size;
1010         off_t         max_size;
1011         int           retval;
1012         int           local_flags = 0;
1013
1014         if (upl == NULL || size < 0)
1015                 panic("cluster_pagein: NULL upl passed in");
1016
             /* translate the caller's UPL_* flags into cluster_io's CL_* flags */
1017         if ((flags & UPL_IOSYNC) == 0)
1018                 local_flags |= CL_ASYNC;
1019         if ((flags & UPL_NOCOMMIT) == 0)
1020                 local_flags |= CL_COMMIT;
1021
1022
1023         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1024                      (int)f_offset, size, (int)filesize, local_flags, 0);
1025
1026         /*
1027          * can't page-in from a negative offset
1028          * or if we're starting beyond the EOF
1029          * or if the file offset isn't page aligned
1030          * or the size requested isn't a multiple of PAGE_SIZE
1031          * or the offset into the upl isn't page aligned
1032          */
1032         if (f_offset < 0 || f_offset >= filesize ||
1033            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1034                 if (local_flags & CL_COMMIT)
1035                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1036                 return (EINVAL);
1037         }
             /* never read past the end of the file; io_size may therefore end mid-page */
1038         max_size = filesize - f_offset;
1039
1040         if (size < max_size)
1041                 io_size = size;
1042         else
1043                 io_size = max_size;
1044
1045         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1046
             /* pages of the request lying wholly beyond the page-rounded I/O size are aborted with an error */
1047         if (size > rounded_size && (local_flags & CL_COMMIT))
1048                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1049                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1050
1051         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1052                            local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1053
1054         if (retval == 0) {
1055                 int b_lblkno;
1056                 int e_lblkno;
1057
                     /* first and last page (in PAGE_SIZE_64 units) touched by this pagein */
1058                 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1059                 e_lblkno = (int)
1060                         ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1061
1062                 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1063                         /*
1064                          * single-page pagein with read-ahead not disabled:
1065                          * let cluster_rd_ahead decide whether we're in
1066                          * a sequential access pattern and should prefetch
1067                          */
1068                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1069                 }
                     /* recorded after the call so cluster_rd_ahead compares against the previous read */
1070                 vp->v_lastr = e_lblkno;
1071         }
1072         return (retval);
1073 }
1074
     /*
      * cluster_bp: issue the I/O described by a struct buf through cluster_io().
      *
      * bp  buffer describing the transfer; it must carry a upl in b_pagelist
      *     (a NULL upl panics).  B_READ in b_flags adds CL_READ to the request.
      *
      * The request is always issued with CL_ASYNC, starts at upl offset 0,
      * covers b_bcount bytes at the file offset ubc_blktooff() reports for
      * b_lblkno, and passes 0 for the device block size.  bp itself is handed
      * to cluster_io() as its struct buf argument.
      *
      * Returns the value returned by cluster_io().
      */
1075 int
1076 cluster_bp(bp)
1077         struct buf *bp;
1078 {
1079         off_t  f_offset;
1080         int    flags;
1081
1082         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1083                      (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1084
1085         if (bp->b_pagelist == (upl_t) 0)
1086                 panic("cluster_bp: can't handle NULL upl yet\n");
1087         if (bp->b_flags & B_READ)
1088                 flags = CL_ASYNC | CL_READ;
1089         else
1090                 flags = CL_ASYNC;
1091
             /* convert the buffer's logical block number into a byte offset within the file */
1092         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1093
1094         return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1095 }
1096
     /*
      * cluster_write: top-level write entry point.  Chooses, piece by piece,
      * between the cached path (cluster_write_x), the physically contiguous
      * path (cluster_phys_write) and the direct path (cluster_nocopy_write).
      *
      * vp            vnode being written
      * uio           describes the data to write; may be NULL, in which case
      *               the request goes straight to cluster_write_x
      * oldEOF, newEOF, headOff, tailOff, devblocksize, flags
      *               passed through to the routines named above;
      *               IO_HEADZEROFILL / IO_TAILZEROFILL in flags are also
      *               examined here
      *
      * Everything goes through cluster_write_x unless the vnode has
      * VNOCACHE_DATA set and the uio targets UIO_USERSPACE.  In that case
      * each pass of the loop looks at the current iovec:
      *   - target reported as UPL_PHYS_CONTIG      -> cluster_phys_write
      *   - resid below PAGE_SIZE or zero-fill asked -> cluster_write_x for
      *                                                 the whole request
      *   - file offset or buffer not page aligned   -> cluster_write_x for
      *                                                 a clipped piece
      *   - both page aligned                        -> cluster_nocopy_write
      *                                                 for whole pages,
      *                                                 cluster_write_x for
      *                                                 a sub-page tail
      *
      * Returns 0 or the first error from the routines above; EFAULT if the
      * user buffer address cannot be queried with vm_map_get_upl().
      */
1097 int
1098 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1099         struct vnode *vp;
1100         struct uio   *uio;
1101         off_t         oldEOF;
1102         off_t         newEOF;
1103         off_t         headOff;
1104         off_t         tailOff;
1105         int           devblocksize;
1106         int           flags;
1107 {
1108         int           prev_resid;
1109         int           clip_size;
1110         off_t         max_io_size;
1111         struct iovec  *iov;
1112         int           upl_size;
1113         int           upl_flags;
1114         upl_t         upl;
1115         int           retval = 0;
1116
1117
1118         if (vp->v_flag & VHASBEENPAGED)
1119           {
1120             /*
1121              * this vnode had pages cleaned to it by
1122              * the pager which indicates that either
1123              * it's not very 'hot', or the system is
1124              * being overwhelmed by a lot of dirty
1125              * data being delayed in the VM cache...
1126              * in either event, we'll push our remaining
1127              * delayed data at this point...  this will
1128              * be more efficient than paging out 1 page at
1129              * a time, and will also act as a throttle
1130              * by delaying this client from writing any
1131              * more data until all his delayed data has
1132              * at least been queued to the underlying driver.
1133              */
1134             cluster_push(vp);
1135
1136             vp->v_flag &= ~VHASBEENPAGED;
1137           }
1138
1139         if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1140           {
1141             /*
1142              * go do a write through the cache if one of the following is true....
1143              *   NOCACHE is not true
1144              *   there is no uio structure or it doesn't target USERSPACE
1145              */
1146             return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1147           }
1148
1149         while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1150           {
1151             /*
1152              * we know we have a resid, so this is safe
1153              * skip over any empty vectors
1154              */
1155             iov = uio->uio_iov;
1156
1157             while (iov->iov_len == 0) {
1158               uio->uio_iov++;
1159               uio->uio_iovcnt--;
1160               iov = uio->uio_iov;
1161             }
                 /*
                  * query the page holding the start of the user buffer; only the
                  * returned upl_flags are used below (the upl variable itself is
                  * not referenced again in this routine)
                  */
1162             upl_size  = PAGE_SIZE;
1163             upl_flags = UPL_QUERY_OBJECT_TYPE;
1164
1165             if ((vm_map_get_upl(current_map(),
1166                                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1167                                &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
1168               {
1169                 /*
1170                  * the user app must have passed in an invalid address
1171                  */
1172                 return (EFAULT);
1173               }
1174
1175             /*
1176              * We check every vector target but if it is physically
1177              * contiguous space, we skip the sanity checks.
1178              */
1179             if (upl_flags & UPL_PHYS_CONTIG)
1180               {
                     /*
                      * a requested head zero-fill is done first, once, through
                      * cluster_write_x with no uio; the flag is then cleared so
                      * later passes of the loop don't repeat it
                      */
1181                 if (flags & IO_HEADZEROFILL)
1182                   {
1183                     flags &= ~IO_HEADZEROFILL;
1184
1185                     if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1186                         return(retval);
1187                   }
1188
1189                 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1190
                     /*
                      * once all the data has been consumed, a requested tail
                      * zero-fill is issued as a head zero-fill with the current
                      * uio_offset in the headOff position and tailOff in the
                      * newEOF position (presumably zeroing that range - confirm
                      * against cluster_write_x)
                      */
1191                 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1192                   {
1193                     return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
1194                   }
1195               }
1196             else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1197               {
1198                 /*
1199                  * we're here because we don't have a physically contiguous target buffer
1200                  * go do a write through the cache if one of the following is true....
1201                  *   the total xfer size is less than a page...
1202                  *   we're being asked to ZEROFILL either the head or the tail of the I/O...
1203                  */
1204                 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1205               }
1206             else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1207               {
1208                 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
1209                   {
1210                     /*
1211                      * Bring the file offset of the write up to a pagesize boundary
1212                      * this will also bring the base address to a page boundary
1213                      * since they both are currently on the same offset within a page
1214                      * note: if we get here, uio->uio_resid is at least PAGE_SIZE
1215                      * so the computed clip_size must always be less than the current uio_resid
1216                      */
1217                     clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1218
1219                     /*
1220                      * Fake the resid going into the cluster_write_x call
1221                      * and restore it on the way out.
1222                      * (clip_size - uio->uio_resid) is what the call consumed;
1223                      * that amount is subtracted from the original resid
1224                      */
1223                     prev_resid = uio->uio_resid;
1224                     uio->uio_resid = clip_size;
1225                     retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1226                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1227                   }
1228                 else
1229                   {
1230                     /*
1231                      * can't get both the file offset and the buffer offset aligned to a page boundary
1232                      * so fire an I/O through the cache for this entire vector
1233                      */
1234                     clip_size = iov->iov_len;
1235                     prev_resid = uio->uio_resid;
1236                     uio->uio_resid = clip_size;
1237                     retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1238                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1239                   }
1240               }
1241             else
1242               {
1243                 /*
1244                  * If we come in here, we know the offset into
1245                  * the file is on a pagesize boundary and the
1246                  * target buffer address is also on a page boundary
1247                  */
                     /* this piece is limited by the resid, the current vector and the distance to newEOF */
1248                 max_io_size = newEOF - uio->uio_offset;
1249                 clip_size = uio->uio_resid;
1250                 if (iov->iov_len < clip_size)
1251                   clip_size = iov->iov_len;
1252                 if (max_io_size < clip_size)
1253                   clip_size = max_io_size;
1254
1255                 if (clip_size < PAGE_SIZE)
1256                   {
1257                     /*
1258                      * Take care of tail end of write in this vector
1259                      */
1260                     prev_resid = uio->uio_resid;
1261                     uio->uio_resid = clip_size;
1262                     retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1263                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1264                   }
1265                 else
1266                   {
1267                     /* round clip_size down to a multiple of pagesize */
1268                     clip_size = clip_size & ~(PAGE_MASK);
1269                     prev_resid = uio->uio_resid;
1270                     uio->uio_resid = clip_size;
1271                     retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
                         /* whatever the direct path left unwritten (without an error) goes through the cache */
1272                     if ((retval == 0) && uio->uio_resid)
1273                       retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1274                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1275                   }
1276               } /* end else */
1277           } /* end while */
1278         return(retval);
1279 }
1280
1281
1282 static int
1283 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1284         struct vnode *vp;
1285         struct uio   *uio;
1286         off_t         newEOF;
1287         int           devblocksize;
1288         int           flags;
1289 {
1290         upl_t            upl;
1291         upl_page_info_t  *pl;
1292         off_t            upl_f_offset;
1293         vm_offset_t      upl_offset;
1294         off_t            max_io_size;
1295         int              io_size;
1296         int              io_flag;
1297         int              upl_size;
1298         int              upl_needed_size;
1299         int              pages_in_pl;
1300         int              upl_flags;
1301         kern_return_t    kret;
1302         struct iovec     *iov;
1303         int              i;
1304         int              force_data_sync;
1305         int              error  = 0;
1306         struct clios     iostate;
1307
1308         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1309                      (int)uio->uio_offset, (int)uio->uio_resid,
1310                      (int)newEOF, devblocksize, 0);
1311
1312         /*
1313          * When we enter this routine, we know
1314          *  -- the offset into the file is on a pagesize boundary
1315          *  -- the resid is a page multiple
1316          *  -- the resid will not exceed iov_len
1317          */
1318         cluster_try_push(vp, newEOF, 0, 1);
1319
1320         iostate.io_completed = 0;
1321         iostate.io_issued = 0;
1322         iostate.io_error = 0;
1323         iostate.io_wanted = 0;
1324
1325         iov = uio->uio_iov;
1326
1327         while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1328                 io_size = uio->uio_resid;
1329
1330                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1331                         io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1332
1333                 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
1334                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1335
1336                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1337                              (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1338
1339                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1340                         pages_in_pl = 0;
1341                         upl_size = upl_needed_size;
1342                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1343                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1344
1345                         kret = vm_map_get_upl(current_map(),
1346                                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1347                                               &upl_size,
1348                                               &upl,
1349                                               NULL,
1350                                               &pages_in_pl,
1351                                               &upl_flags,
1352                                               force_data_sync);
1353
1354                         if (kret != KERN_SUCCESS) {
1355                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1356                                              0, 0, 0, kret, 0);
1357                                 /*
1358                                  * cluster_nocopy_write: failed to get pagelist
1359                                  *
1360                                  * we may have already spun some portion of this request
1361                                  * off as async requests... we need to wait for the I/O
1362                                  * to complete before returning
1363                                  */
1364                                 goto wait_for_writes;
1365                         }
1366                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1367                         pages_in_pl = upl_size / PAGE_SIZE;
1368
1369                         for (i = 0; i < pages_in_pl; i++) {
1370                                 if (!upl_valid_page(pl, i))
1371                                         break;
1372                         }
1373                         if (i == pages_in_pl)
1374                                 break;
1375
1376                         /*
1377                          * didn't get all the pages back that we
1378                          * needed... release this upl and try again
1379                          */
1380                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1381                                             UPL_ABORT_FREE_ON_EMPTY);
1382                 }
1383                 if (force_data_sync >= 3) {
1384                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1385                                      i, pages_in_pl, upl_size, kret, 0);
1386                         /*
1387                          * for some reason, we couldn't acquire a hold on all
1388                          * the pages needed in the user's address space
1389                          *
1390                          * we may have already spun some portion of this request
1391                          * off as async requests... we need to wait for the I/O
1392                          * to complete before returning
1393                          */
1394                         goto wait_for_writes;
1395                 }
1396
1397                 /*
1398                  * Consider the possibility that upl_size wasn't satisfied.
1399                  */
1400                 if (upl_size != upl_needed_size)
1401                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1402
1403                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1404                              (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1405
1406                 if (io_size == 0) {
1407                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1408                                             UPL_ABORT_FREE_ON_EMPTY);
1409                         /*
1410                          * we may have already spun some portion of this request
1411                          * off as async requests... we need to wait for the I/O
1412                          * to complete before returning
1413                          */
1414                         goto wait_for_writes;
1415                 }
1416                 /*
1417                  * Now look for pages already in the cache
1418                  * and throw them away.
1419                  * uio->uio_offset is page aligned within the file
1420                  * io_size is a multiple of PAGE_SIZE
1421                  */
1422                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1423
1424                 /*
1425                  * we want to push out these writes asynchronously so that we can overlap
1426                  * the preparation of the next I/O
1427                  * if there are already too many outstanding writes
1428                  * wait until some complete before issuing the next
1429                  */
1430                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1431                         iostate.io_wanted = 1;
1432                         tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1433                 }
1434                 if (iostate.io_error) {
1435                         /*
1436                          * one of the earlier writes we issued ran into a hard error
1437                          * don't issue any more writes, cleanup the UPL
1438                          * that was just created but not used, then
1439                          * go wait for all writes that are part of this stream
1440                          * to complete before returning the error to the caller
1441                          */
1442                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1443                                             UPL_ABORT_FREE_ON_EMPTY);
1444
1445                         goto wait_for_writes;
1446                 }
1447                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1448
1449                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1450                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1451
1452                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1453                                    io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1454
1455                 iov->iov_len    -= io_size;
1456                 iov->iov_base   += io_size;
1457                 uio->uio_resid  -= io_size;
1458                 uio->uio_offset += io_size;
1459
1460                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1461                              (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1462
1463         } /* end while */
1464
1465 wait_for_writes:
1466         /*
1467          * make sure all async writes issued as part of this stream
1468          * have completed before we return
1469          */
1470         while (iostate.io_issued != iostate.io_completed) {
1471                 iostate.io_wanted = 1;
1472                 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1473         }
1474         if (iostate.io_error)
1475                 error = iostate.io_error;
1476
1477         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1478                      (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1479
1480         return (error);
1481 }
1482
1483
1484 static int
1485 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1486         struct vnode *vp;
1487         struct uio   *uio;
1488         off_t        newEOF;
1489         int          devblocksize;
1490         int          flags;
1491 {
1492         upl_page_info_t *pl;
1493         addr64_t         src_paddr;
1494         upl_t            upl;
1495         vm_offset_t      upl_offset;
1496         int              tail_size;
1497         int              io_size;
1498         int              upl_size;
1499         int              upl_needed_size;
1500         int              pages_in_pl;
1501         int              upl_flags;
1502         kern_return_t    kret;
1503         struct iovec     *iov;
1504         int              error  = 0;
1505
1506         /*
1507          * When we enter this routine, we know
1508          *  -- the resid will not exceed iov_len
1509          *  -- the vector target address is physcially contiguous
1510          */
1511         cluster_try_push(vp, newEOF, 0, 1);
1512
1513         iov = uio->uio_iov;
1514         io_size = iov->iov_len;
1515         upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
1516         upl_needed_size = upl_offset + io_size;
1517
1518         pages_in_pl = 0;
1519         upl_size = upl_needed_size;
1520         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1521                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1522
1523         kret = vm_map_get_upl(current_map(),
1524                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1525                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1526
1527         if (kret != KERN_SUCCESS) {
1528                 /*
1529                  * cluster_phys_write: failed to get pagelist
1530                  * note: return kret here
1531                  */
1532               return(EINVAL);
1533         }
1534         /*
1535          * Consider the possibility that upl_size wasn't satisfied.
1536          * This is a failure in the physical memory case.
1537          */
1538         if (upl_size < upl_needed_size) {
1539                 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1540                 return(EINVAL);
1541         }
1542         pl = ubc_upl_pageinfo(upl);
1543
1544         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
1545
1546         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1547                 int   head_size;
1548
1549                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1550
1551                 if (head_size > io_size)
1552                         head_size = io_size;
1553
1554                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1555
1556                 if (error) {
1557                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1558
1559                         return(EINVAL);
1560                 }
1561                 upl_offset += head_size;
1562                 src_paddr  += head_size;
1563                 io_size    -= head_size;
1564         }
1565         tail_size = io_size & (devblocksize - 1);
1566         io_size  -= tail_size;
1567
1568         if (io_size) {
1569                 /*
1570                  * issue a synchronous write to cluster_io
1571                  */
1572                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1573                                    io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1574         }
1575         if (error == 0) {
1576                 /*
1577                  * The cluster_io write completed successfully,
1578                  * update the uio structure
1579                  */
1580                 uio->uio_resid  -= io_size;
1581                 iov->iov_len    -= io_size;
1582                 iov->iov_base   += io_size;
1583                 uio->uio_offset += io_size;
1584                 src_paddr       += io_size;
1585
1586                 if (tail_size)
1587                         error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1588         }
1589         /*
1590          * just release our hold on the physically contiguous
1591          * region without changing any state
1592          */
1593         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1594
1595         return (error);
1596 }
1597
1598
1599 static int
1600 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1601         struct vnode *vp;
1602         struct uio   *uio;
1603         off_t         oldEOF;
1604         off_t         newEOF;
1605         off_t         headOff;
1606         off_t         tailOff;
1607         int           devblocksize;
1608         int           flags;
1609 {
1610         upl_page_info_t *pl;
1611         upl_t            upl;
1612         vm_offset_t      upl_offset;
1613         int              upl_size;
1614         off_t            upl_f_offset;
1615         int              pages_in_upl;
1616         int              start_offset;
1617         int              xfer_resid;
1618         int              io_size;
1619         int              io_flags;
1620         int              io_offset;
1621         int              bytes_to_zero;
1622         int              bytes_to_move;
1623         kern_return_t    kret;
1624         int              retval = 0;
1625         int              uio_resid;
1626         long long        total_size;
1627         long long        zero_cnt;
1628         off_t            zero_off;
1629         long long        zero_cnt1;
1630         off_t            zero_off1;
1631         daddr_t          start_blkno;
1632         daddr_t          last_blkno;
1633         int              intersection;
1634
1635
1636         if (uio) {
1637                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1638                              (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1639
1640                 uio_resid = uio->uio_resid;
1641         } else {
1642                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1643                              0, 0, (int)oldEOF, (int)newEOF, 0);
1644
1645                 uio_resid = 0;
1646         }
1647         zero_cnt  = 0;
1648         zero_cnt1 = 0;
1649
1650         if (flags & IO_HEADZEROFILL) {
1651                 /*
1652                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1653                  * so we zero fill the intervening space between the old EOF and the offset
1654                  * where the next chunk of real data begins.... ftruncate will also use this
1655                  * routine to zero fill to the new EOF when growing a file... in this case, the
1656                  * uio structure will not be provided
1657                  */
1658                 if (uio) {
1659                         if (headOff < uio->uio_offset) {
1660                                 zero_cnt = uio->uio_offset - headOff;
1661                                 zero_off = headOff;
1662                         }
1663                 } else if (headOff < newEOF) {
1664                         zero_cnt = newEOF - headOff;
1665                         zero_off = headOff;
1666                 }
1667         }
1668         if (flags & IO_TAILZEROFILL) {
1669                 if (uio) {
1670                         zero_off1 = uio->uio_offset + uio->uio_resid;
1671
1672                         if (zero_off1 < tailOff)
1673                                 zero_cnt1 = tailOff - zero_off1;
1674                 }
1675         }
1676         if (zero_cnt == 0 && uio == (struct uio *) 0) {
1677             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1678                          retval, 0, 0, 0, 0);
1679             return (0);
1680         }
1681
1682         while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1683                 /*
1684                  * for this iteration of the loop, figure out where our starting point is
1685                  */
1686                 if (zero_cnt) {
1687                         start_offset = (int)(zero_off & PAGE_MASK_64);
1688                         upl_f_offset = zero_off - start_offset;
1689                 } else if (uio_resid) {
1690                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1691                         upl_f_offset = uio->uio_offset - start_offset;
1692                 } else {
1693                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1694                         upl_f_offset = zero_off1 - start_offset;
1695                 }
1696                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1697                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1698
1699                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1700                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1701
1702                 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1703
1704                 if (uio && !(vp->v_flag & VNOCACHE_DATA) &&
1705                    (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) {
1706                         /*
1707                          * assumption... total_size <= uio_resid
1708                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1709                          */
1710                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1711                                 total_size -= start_offset;
1712                         xfer_resid = total_size;
1713
1714                         retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1715
1716                         if (retval)
1717                                 break;
1718
1719                         uio_resid   -= (total_size - xfer_resid);
1720                         total_size   = xfer_resid;
1721                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1722                         upl_f_offset = uio->uio_offset - start_offset;
1723
1724                         if (total_size == 0) {
1725                                 if (start_offset) {
1726                                         /*
1727                                          * the write did not finish on a page boundary
1728                                          * which will leave upl_f_offset pointing to the
1729                                          * beginning of the last page written instead of
1730                                          * the page beyond it... bump it in this case
1731                                          * so that the cluster code records the last page
1732                                          * written as dirty
1733                                          */
1734                                         upl_f_offset += PAGE_SIZE_64;
1735                                 }
1736                                 upl_size = 0;
1737
1738                                 goto check_cluster;
1739                         }
1740                 }
1741                 /*
1742                  * compute the size of the upl needed to encompass
1743                  * the requested write... limit each call to cluster_io
1744                  * to the maximum UPL size... cluster_io will clip if
1745                  * this exceeds the maximum io_size for the device,
1746                  * make sure to account for
1747                  * a starting offset that's not page aligned
1748                  */
1749                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1750
1751                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1752                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1753
1754                 pages_in_upl = upl_size / PAGE_SIZE;
1755                 io_size      = upl_size - start_offset;
1756
1757                 if ((long long)io_size > total_size)
1758                         io_size = total_size;
1759
1760                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
1761
1762
1763                 kret = ubc_create_upl(vp,
1764                                                         upl_f_offset,
1765                                                         upl_size,
1766                                                         &upl,
1767                                                         &pl,
1768                                                         UPL_SET_LITE);
1769                 if (kret != KERN_SUCCESS)
1770                         panic("cluster_write: failed to get pagelist");
1771
1772                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
1773                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1774
1775                 if (start_offset && !upl_valid_page(pl, 0)) {
1776                         int   read_size;
1777
1778                         /*
1779                          * we're starting in the middle of the first page of the upl
1780                          * and the page isn't currently valid, so we're going to have
1781                          * to read it in first... this is a synchronous operation
1782                          */
1783                         read_size = PAGE_SIZE;
1784
1785                         if ((upl_f_offset + read_size) > newEOF)
1786                                 read_size = newEOF - upl_f_offset;
1787
1788                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1789                                             CL_READ, (struct buf *)0, (struct clios *)0);
1790                         if (retval) {
1791                                 /*
1792                                  * we had an error during the read which causes us to abort
1793                                  * the current cluster_write request... before we do, we need
1794                                  * to release the rest of the pages in the upl without modifying
1795                                  * their state and mark the failed page in error
1796                                  */
1797                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1798                                 ubc_upl_abort_range(upl, 0, upl_size,  UPL_ABORT_FREE_ON_EMPTY);
1799
1800                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1801                                              (int)upl, 0, 0, retval, 0);
1802                                 break;
1803                         }
1804                 }
1805                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1806                         /*
1807                          * the last offset we're writing to in this upl does not end on a page
1808                          * boundary... if it's not beyond the old EOF, then we'll also need to
1809                          * pre-read this page in if it isn't already valid
1810                          */
1811                         upl_offset = upl_size - PAGE_SIZE;
1812
1813                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1814                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1815                                 int   read_size;
1816
1817                                 read_size = PAGE_SIZE;
1818
1819                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1820                                         read_size = newEOF - (upl_f_offset + upl_offset);
1821
1822                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1823                                                     CL_READ, (struct buf *)0, (struct clios *)0);
1824                                 if (retval) {
1825                                         /*
1826                                          * we had an error during the read which causes us to abort
1827                                          * the current cluster_write request... before we do, we
1828                                          * need to release the rest of the pages in the upl without
1829                                          * modifying their state and mark the failed page in error
1830                                          */
1831                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1832                                         ubc_upl_abort_range(upl, 0,          upl_size,  UPL_ABORT_FREE_ON_EMPTY);
1833
1834                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1835                                                      (int)upl, 0, 0, retval, 0);
1836                                         break;
1837                                 }
1838                         }
1839                 }
1840                 xfer_resid = io_size;
1841                 io_offset = start_offset;
1842
1843                 while (zero_cnt && xfer_resid) {
1844
1845                         if (zero_cnt < (long long)xfer_resid)
1846                                 bytes_to_zero = zero_cnt;
1847                         else
1848                                 bytes_to_zero = xfer_resid;
1849
1850                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1851                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1852                         } else {
1853                                 int zero_pg_index;
1854
1855                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1856                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1857
1858                                 if ( !upl_valid_page(pl, zero_pg_index)) {
1859                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1860
1861                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1862                                            !upl_dirty_page(pl, zero_pg_index)) {
1863                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1864                                 }
1865                         }
1866                         xfer_resid -= bytes_to_zero;
1867                         zero_cnt   -= bytes_to_zero;
1868                         zero_off   += bytes_to_zero;
1869                         io_offset  += bytes_to_zero;
1870                 }
1871                 if (xfer_resid && uio_resid) {
1872                         bytes_to_move = min(uio_resid, xfer_resid);
1873
1874                         retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
1875
1876                         if (retval) {
1877
1878                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1879
1880                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1881                                              (int)upl, 0, 0, retval, 0);
1882                         } else {
1883                                 uio_resid  -= bytes_to_move;
1884                                 xfer_resid -= bytes_to_move;
1885                                 io_offset  += bytes_to_move;
1886                         }
1887                 }
1888                 while (xfer_resid && zero_cnt1 && retval == 0) {
1889
1890                         if (zero_cnt1 < (long long)xfer_resid)
1891                                 bytes_to_zero = zero_cnt1;
1892                         else
1893                                 bytes_to_zero = xfer_resid;
1894
1895                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1896                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1897                         } else {
1898                                 int zero_pg_index;
1899
1900                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1901                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1902
1903                                 if ( !upl_valid_page(pl, zero_pg_index)) {
1904                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1905                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1906                                            !upl_dirty_page(pl, zero_pg_index)) {
1907                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1908                                 }
1909                         }
1910                         xfer_resid -= bytes_to_zero;
1911                         zero_cnt1  -= bytes_to_zero;
1912                         zero_off1  += bytes_to_zero;
1913                         io_offset  += bytes_to_zero;
1914                 }
1915
1916                 if (retval == 0) {
1917                         int cl_index;
1918                         int can_delay;
1919
1920                         io_size += start_offset;
1921
1922                         if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1923                                 /*
1924                                  * if we're extending the file with this write
1925                                  * we'll zero fill the rest of the page so that
1926                                  * if the file gets extended again in such a way as to leave a
1927                                  * hole starting at this EOF, we'll have zero's in the correct spot
1928                                  */
1929                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1930                         }
1931                         if (flags & IO_SYNC)
1932                                 /*
1933                                  * if the IO_SYNC flag is set then we need to
1934                                  * bypass any clusters and immediately issue
1935                                  * the I/O
1936                                  */
1937                                 goto issue_io;
1938 check_cluster:
1939                         /*
1940                          * calculate the last logical block number
1941                          * that this delayed I/O encompassed
1942                          */
1943                         last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1944
1945                         if (vp->v_flag & VHASDIRTY) {
1946
1947                                 if ( !(vp->v_flag & VNOCACHE_DATA)) {
1948                                         /*
1949                                          * we've fallen into the sparse
1950                                          * cluster method of delaying dirty pages
1951                                          * first, we need to release the upl if we hold one
1952                                          * since pages in it may be present in the sparse cluster map
1953                                          * and may span 2 separate buckets there... if they do and
1954                                          * we happen to have to flush a bucket to make room and it intersects
1955                                          * this upl, a deadlock may result on page BUSY
1956                                          */
1957                                         if (upl_size)
1958                                                 ubc_upl_commit_range(upl, 0, upl_size,
1959                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1960
1961                                         sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
1962
1963                                         continue;
1964                                 }
1965                                 /*
1966                                  * must have done cached writes that fell into
1967                                  * the sparse cluster mechanism... we've switched
1968                                  * to uncached writes on the file, so go ahead
1969                                  * and push whatever's in the sparse map
1970                                  * and switch back to normal clustering
1971                                  *
1972                                  * see the comment above concerning a possible deadlock...
1973                                  */
1974                                 if (upl_size) {
1975                                         ubc_upl_commit_range(upl, 0, upl_size,
1976                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1977                                         /*
1978                                          * setting upl_size to 0 keeps us from committing a
1979                                          * second time in the start_new_cluster path
1980                                          */
1981                                         upl_size = 0;
1982                                 }
1983                                 sparse_cluster_push(vp, ubc_getsize(vp), 1);
1984
1985                                 /*
1986                                  * no clusters of either type present at this point
1987                                  * so just go directly to start_new_cluster since
1988                                  * we know we need to delay this I/O since we've
1989                                  * already released the pages back into the cache
1990                                  * to avoid the deadlock with sparse_cluster_push
1991                                  */
1992                                 goto start_new_cluster;
1993                         }
1994                         upl_offset = 0;
1995
1996                         if (vp->v_clen == 0)
1997                                 /*
1998                                  * no clusters currently present
1999                                  */
2000                                 goto start_new_cluster;
2001
2002                         for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
2003                                 /*
2004                                  * check each cluster that we currently hold
2005                                  * try to merge some or all of this write into
2006                                  * one or more of the existing clusters... if
2007                                  * any portion of the write remains, start a
2008                                  * new cluster
2009                                  */
2010                                 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
2011                                         /*
2012                                          * the current write starts at or after the current cluster
2013                                          */
2014                                         if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2015                                                 /*
2016                                                  * we have a write that fits entirely
2017                                                  * within the existing cluster limits
2018                                                  */
2019                                                 if (last_blkno > vp->v_clusters[cl_index].last_pg)
2020                                                         /*
2021                                                          * update our idea of where the cluster ends
2022                                                          */
2023                                                         vp->v_clusters[cl_index].last_pg = last_blkno;
2024                                                 break;
2025                                         }
2026                                         if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2027                                                 /*
2028                                                  * we have a write that starts in the middle of the current cluster
2029                                                  * but extends beyond the cluster's limit... we know this because
2030                                                  * of the previous checks
2031                                                  * we'll extend the current cluster to the max
2032                                                  * and update the start_blkno for the current write to reflect that
2033                                                  * the head of it was absorbed into this cluster...
2034                                                  * note that we'll always have a leftover tail in this case since
2035                                                  * full absorbtion would have occurred in the clause above
2036                                                  */
2037                                                 vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER;
2038
2039                                                 if (upl_size) {
2040                                                         int  start_pg_in_upl;
2041
2042                                                         start_pg_in_upl = upl_f_offset / PAGE_SIZE_64;
2043
2044                                                         if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) {
2045                                                                 intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE;
2046
2047                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2048                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2049                                                                 upl_f_offset += intersection;
2050                                                                 upl_offset   += intersection;
2051                                                                 upl_size     -= intersection;
2052                                                         }
2053                                                 }
2054                                                 start_blkno = vp->v_clusters[cl_index].last_pg;
2055                                         }
2056                                         /*
2057                                          * we come here for the case where the current write starts
2058                                          * beyond the limit of the existing cluster or we have a leftover
2059                                          * tail after a partial absorbtion
2060                                          *
2061                                          * in either case, we'll check the remaining clusters before
2062                                          * starting a new one
2063                                          */
2064                                 } else {
2065                                         /*
2066                                          * the current write starts in front of the cluster we're currently considering
2067                                          */
2068                                         if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
2069                                                 /*
2070                                                  * we can just merge the new request into
2071                                                  * this cluster and leave it in the cache
2072                                                  * since the resulting cluster is still
2073                                                  * less than the maximum allowable size
2074                                                  */
2075                                                 vp->v_clusters[cl_index].start_pg = start_blkno;
2076
2077                                                 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
2078                                                         /*
2079                                                          * the current write completely
2080                                                          * envelops the existing cluster and since
2081                                                          * each write is limited to at most MAX_UPL_TRANSFER bytes
2082                                                          * we can just use the start and last blocknos of the write
2083                                                          * to generate the cluster limits
2084                                                          */
2085                                                         vp->v_clusters[cl_index].last_pg = last_blkno;
2086                                                 }
2087                                                 break;
2088                                         }
2089
2090                                         /*
2091                                          * if we were to combine this write with the current cluster
2092                                          * we would exceed the cluster size limit.... so,
2093                                          * let's see if there's any overlap of the new I/O with
2094                                          * the cluster we're currently considering... in fact, we'll
2095                                          * stretch the cluster out to it's full limit and see if we
2096                                          * get an intersection with the current write
2097                                          *
2098                                          */
2099                                         if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
2100                                                 /*
2101                                                  * the current write extends into the proposed cluster
2102                                                  * clip the length of the current write after first combining it's
2103                                                  * tail with the newly shaped cluster
2104                                                  */
2105                                                 vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;
2106
2107                                                 if (upl_size) {
2108                                                         intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;
2109
2110                                                         if (intersection > upl_size)
2111                                                                 /*
2112                                                                  * because the current write may consist of a number of pages found in the cache
2113                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2114                                                                  * the size of the UPL that is also part of this write
2115                                                                  */
2116                                                                 intersection = upl_size;
2117
2118                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2119                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2120                                                         upl_size -= intersection;
2121                                                 }
2122                                                 last_blkno = vp->v_clusters[cl_index].start_pg;
2123                                         }
2124                                         /*
2125                                          * if we get here, there was no way to merge
2126                                          * any portion of this write with this cluster
2127                                          * or we could only merge part of it which
2128                                          * will leave a tail...
2129                                          * we'll check the remaining clusters before starting a new one
2130                                          */
2131                                 }
2132                         }
2133                         if (cl_index < vp->v_clen)
2134                                 /*
2135                                  * we found an existing cluster(s) that we
2136                                  * could entirely merge this I/O into
2137                                  */
2138                                 goto delay_io;
2139
2140                         if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2141                                 /*
2142                                  * we didn't find an existing cluster to
2143                                  * merge into, but there's room to start
2144                                  * a new one
2145                                  */
2146                                 goto start_new_cluster;
2147
2148                         /*
2149                          * no exisitng cluster to merge with and no
2150                          * room to start a new one... we'll try
2151                          * pushing one of the existing ones... if none of
2152                          * them are able to be pushed, we'll switch
2153                          * to the sparse cluster mechanism
2154                          * cluster_try_push updates v_clen to the
2155                          * number of remaining clusters... and
2156                          * returns the number of currently unused clusters
2157                          */
2158                         if (vp->v_flag & VNOCACHE_DATA)
2159                                 can_delay = 0;
2160                         else
2161                                 can_delay = 1;
2162
2163                         if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
2164                                 /*
2165                                  * no more room in the normal cluster mechanism
2166                                  * so let's switch to the more expansive but expensive
2167                                  * sparse mechanism....
2168                                  * first, we need to release the upl if we hold one
2169                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2170                                  * and may span 2 separate buckets there... if they do and
2171                                  * we happen to have to flush a bucket to make room and it intersects
2172                                  * this upl, a deadlock may result on page BUSY
2173                                  */
2174                                 if (upl_size)
2175                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2176                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2177
2178                                 sparse_cluster_switch(vp, newEOF);
2179                                 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
2180
2181                                 continue;
2182                         }
2183                         /*
2184                          * we pushed one cluster successfully, so we must be sequentially writing this file
2185                          * otherwise, we would have failed and fallen into the sparse cluster support
2186                          * so let's take the opportunity to push out additional clusters as long as we
2187                          * remain below the throttle... this will give us better I/O locality if we're
2188                          * in a copy loop (i.e.  we won't jump back and forth between the read and write points
2189                          * however, we don't want to push so much out that the write throttle kicks in and
2190                          * hangs this thread up until some of the I/O completes...
2191                          */
2192                         while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
2193                                 cluster_try_push(vp, newEOF, 0, 0);
2194
2195 start_new_cluster:
2196                         if (vp->v_clen == 0)
2197                                 vp->v_ciosiz = devblocksize;
2198
2199                         vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2200                         vp->v_clusters[vp->v_clen].last_pg  = last_blkno;
2201                         vp->v_clen++;
2202
2203 delay_io:
2204                         if (upl_size)
2205                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2206                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2207                         continue;
2208 issue_io:
2209                         /*
2210                          * in order to maintain some semblance of coherency with mapped writes
2211                          * we need to write the cluster back out as a multiple of the PAGESIZE
2212                          * unless the cluster encompasses the last page of the file... in this
2213                          * case we'll round out to the nearest device block boundary
2214                          */
2215                         io_size = upl_size;
2216
2217                         if ((upl_f_offset + io_size) > newEOF) {
2218                                 io_size = newEOF - upl_f_offset;
2219                                 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2220                         }
2221
2222                         if (flags & IO_SYNC)
2223                                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
2224                         else
2225                                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;
2226
2227                         if (vp->v_flag & VNOCACHE_DATA)
2228                                 io_flags |= CL_DUMP;
2229
2230                         retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2231                                             io_flags, (struct buf *)0, (struct clios *)0);
2232                 }
2233         }
2234         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2235                      retval, 0, uio_resid, 0, 0);
2236
2237         return (retval);
2238 }
2239
2240 int
2241 cluster_read(vp, uio, filesize, devblocksize, flags)
2242         struct vnode *vp;
2243         struct uio   *uio;
2244         off_t         filesize;
2245         int           devblocksize;
2246         int           flags;
2247 {
2248         int           prev_resid;
2249         int           clip_size;
2250         off_t         max_io_size;
2251         struct iovec  *iov;
2252         int           upl_size;
2253         int           upl_flags;
2254         upl_t         upl;
2255         int           retval = 0;
2256
2257
2258         if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2259           {
2260             /*
2261              * go do a read through the cache if one of the following is true....
2262              *   NOCACHE is not true
2263              *   the uio request doesn't target USERSPACE
2264              */
2265             return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
2266           }
2267
2268         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2269           {
2270             /*
2271              * we know we have a resid, so this is safe
2272              * skip over any emtpy vectors
2273              */
2274             iov = uio->uio_iov;
2275
2276             while (iov->iov_len == 0) {
2277               uio->uio_iov++;
2278               uio->uio_iovcnt--;
2279               iov = uio->uio_iov;
2280             }
2281             upl_size  = PAGE_SIZE;
2282             upl_flags = UPL_QUERY_OBJECT_TYPE;
2283
2284             if ((vm_map_get_upl(current_map(),
2285                                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2286                                &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
2287               {
2288                 /*
2289                  * the user app must have passed in an invalid address
2290                  */
2291                 return (EFAULT);
2292               }
2293
2294             /*
2295              * We check every vector target but if it is physically
2296              * contiguous space, we skip the sanity checks.
2297              */
2298             if (upl_flags & UPL_PHYS_CONTIG)
2299               {
2300                 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2301               }
2302             else if (uio->uio_resid < PAGE_SIZE)
2303               {
2304                 /*
2305                  * we're here because we're don't have a physically contiguous target buffer
2306                  * go do a read through the cache if
2307                  *   the total xfer size is less than a page...
2308                  */
2309                 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
2310               }
2311             else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
2312               {
2313                 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
2314                   {
2315                     /*
2316                      * Bring the file offset read up to a pagesize boundary
2317                      * this will also bring the base address to a page boundary
2318                      * since they both are currently on the same offset within a page
2319                      * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2320                      * so the computed clip_size must always be less than the current uio_resid
2321                      */
2322                     clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2323
2324                     /*
2325                      * Fake the resid going into the cluster_read_x call
2326                      * and restore it on the way out.
2327                      */
2328                     prev_resid = uio->uio_resid;
2329                     uio->uio_resid = clip_size;
2330                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2331                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2332                   }
2333                 else
2334                   {
2335                     /*
2336                      * can't get both the file offset and the buffer offset aligned to a page boundary
2337                      * so fire an I/O through the cache for this entire vector
2338                      */
2339                     clip_size = iov->iov_len;
2340                     prev_resid = uio->uio_resid;
2341                     uio->uio_resid = clip_size;
2342                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2343                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2344                   }
2345               }
2346             else
2347               {
2348                 /*
2349                  * If we come in here, we know the offset into
2350                  * the file is on a pagesize boundary
2351                  */
2352
2353                 max_io_size = filesize - uio->uio_offset;
2354                 clip_size = uio->uio_resid;
2355                 if (iov->iov_len < clip_size)
2356                   clip_size = iov->iov_len;
2357                 if (max_io_size < clip_size)
2358                   clip_size = (int)max_io_size;
2359
2360                 if (clip_size < PAGE_SIZE)
2361                   {
2362                     /*
2363                      * Take care of the tail end of the read in this vector.
2364                      */
2365                     prev_resid = uio->uio_resid;
2366                     uio->uio_resid = clip_size;
2367                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2368                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2369                   }
2370                 else
2371                   {
2372                     /* round clip_size down to a multiple of pagesize */
2373                     clip_size = clip_size & ~(PAGE_MASK);
2374                     prev_resid = uio->uio_resid;
2375                     uio->uio_resid = clip_size;
2376                     retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2377                     if ((retval==0) && uio->uio_resid)
2378                       retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2379                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2380                   }
2381               } /* end else */
2382           } /* end while */
2383
2384         return(retval);
2385 }
2386
2387 static int
2388 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2389         struct vnode *vp;
2390         struct uio   *uio;
2391         off_t         filesize;
2392         int           devblocksize;
2393         int           flags;
2394 {
2395         upl_page_info_t *pl;
2396         upl_t            upl;
2397         vm_offset_t      upl_offset;
2398         int              upl_size;
2399         off_t            upl_f_offset;
2400         int              start_offset;
2401         int              start_pg;
2402         int              last_pg;
2403         int              uio_last;
2404         int              pages_in_upl;
2405         off_t            max_size;
2406         off_t            last_ioread_offset;
2407         off_t            last_request_offset;
2408         u_int            size_of_prefetch;
2409         int              io_size;
2410         kern_return_t    kret;
2411         int              error  = 0;
2412         int              retval = 0;
2413         u_int            b_lblkno;
2414         u_int            e_lblkno;
2415         struct clios     iostate;
2416         u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2417         u_int            rd_ahead_enabled = 1;
2418         u_int            prefetch_enabled = 1;
2419
2420
2421         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2422                      (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2423
2424         if (cluster_hard_throttle_on(vp)) {
2425                 rd_ahead_enabled = 0;
2426                 prefetch_enabled = 0;
2427
2428                 max_rd_size = HARD_THROTTLE_MAXSIZE;
2429         }
2430         if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
2431                 rd_ahead_enabled = 0;
2432
2433         last_request_offset = uio->uio_offset + uio->uio_resid;
2434
2435         if (last_request_offset > filesize)
2436                 last_request_offset = filesize;
2437         b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
2438         e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);
2439
2440         if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
2441                 /*
2442                  * determine if we already have a read-ahead in the pipe courtesy of the
2443                  * last read systemcall that was issued...
2444                  * if so, pick up it's extent to determine where we should start
2445                  * with respect to any read-ahead that might be necessary to
2446                  * garner all the data needed to complete this read systemcall
2447                  */
2448                 last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2449
2450                 if (last_ioread_offset < uio->uio_offset)
2451                         last_ioread_offset = (off_t)0;
2452                 else if (last_ioread_offset > last_request_offset)
2453                         last_ioread_offset = last_request_offset;
2454         } else
2455                 last_ioread_offset = (off_t)0;
2456
2457         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2458                 /*
2459                  * compute the size of the upl needed to encompass
2460                  * the requested read... limit each call to cluster_io
2461                  * to the maximum UPL size... cluster_io will clip if
2462                  * this exceeds the maximum io_size for the device,
2463                  * make sure to account for
2464                  * a starting offset that's not page aligned
2465                  */
2466                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2467                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2468                 max_size     = filesize - uio->uio_offset;
2469
2470                 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2471                         io_size = uio->uio_resid;
2472                 else
2473                         io_size = max_size;
2474
2475                 if (!(vp->v_flag & VNOCACHE_DATA)) {
2476
2477                         while (io_size) {
2478                                 u_int io_resid;
2479                                 u_int io_requested;
2480
2481                                 /*
2482                                  * if we keep finding the pages we need already in the cache, then
2483                                  * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2484                                  * to determine that we have all the pages we need... once we miss in
2485                                  * the cache and have issued an I/O, than we'll assume that we're likely
2486                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
2487                                  */
2488                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2489                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2490                                                 /*
2491                                                  * we've already issued I/O for this request and
2492                                                  * there's still work to do and
2493                                                  * our prefetch stream is running dry, so issue a
2494                                                  * pre-fetch I/O... the I/O latency will overlap
2495                                                  * with the copying of the data
2496                                                  */
2497                                                 if (size_of_prefetch > max_rd_size)
2498                                                         size_of_prefetch = max_rd_size;
2499
2500                                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2501
2502                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2503
2504                                                 if (last_ioread_offset > last_request_offset)
2505                                                         last_ioread_offset = last_request_offset;
2506                                         }
2507                                 }
2508                                 /*
2509                                  * limit the size of the copy we're about to do so that
2510                                  * we can notice that our I/O pipe is running dry and
2511                                  * get the next I/O issued before it does go dry
2512                                  */
2513                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2514                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2515                                 else
2516                                         io_resid = io_size;
2517
2518                                 io_requested = io_resid;
2519
2520                                 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2521
2522                                 io_size -= (io_requested - io_resid);
2523
2524                                 if (retval || io_resid)
2525                                         /*
2526                                          * if we run into a real error or
2527                                          * a page that is not in the cache
2528                                          * we need to leave streaming mode
2529                                          */
2530                                         break;
2531
2532                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2533                                         /*
2534                                          * we're already finished the I/O for this read request
2535                                          * let's see if we should do a read-ahead
2536                                          */
2537                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2538                                 }
2539                         }
2540                         if (retval)
2541                                 break;
2542                         if (io_size == 0) {
2543                                 if (e_lblkno < vp->v_lastr)
2544                                         vp->v_maxra = 0;
2545                                 vp->v_lastr = e_lblkno;
2546
2547                                 break;
2548                         }
2549                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2550                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
2551                         max_size     = filesize - uio->uio_offset;
2552                 }
2553                 if (io_size > max_rd_size)
2554                         io_size = max_rd_size;
2555
2556                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2557
2558                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2559                         upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2560                 pages_in_upl = upl_size / PAGE_SIZE;
2561
2562                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2563                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2564
2565                 kret = ubc_create_upl(vp,
2566                                                 upl_f_offset,
2567                                                 upl_size,
2568                                                 &upl,
2569                                                 &pl,
2570                                                 UPL_SET_LITE);
2571                 if (kret != KERN_SUCCESS)
2572                         panic("cluster_read: failed to get pagelist");
2573
2574                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2575                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2576
2577                 /*
2578                  * scan from the beginning of the upl looking for the first
2579                  * non-valid page.... this will become the first page in
2580                  * the request we're going to make to 'cluster_io'... if all
2581                  * of the pages are valid, we won't call through to 'cluster_io'
2582                  */
2583                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2584                         if (!upl_valid_page(pl, start_pg))
2585                                 break;
2586                 }
2587
2588                 /*
2589                  * scan from the starting invalid page looking for a valid
2590                  * page before the end of the upl is reached, if we
2591                  * find one, then it will be the last page of the request to
2592                  * 'cluster_io'
2593                  */
2594                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2595                         if (upl_valid_page(pl, last_pg))
2596                                 break;
2597                 }
2598                 iostate.io_completed = 0;
2599                 iostate.io_issued = 0;
2600                 iostate.io_error = 0;
2601                 iostate.io_wanted = 0;
2602
2603                 if (start_pg < last_pg) {
2604                         /*
2605                          * we found a range of 'invalid' pages that must be filled
2606                          * if the last page in this range is the last page of the file
2607                          * we may have to clip the size of it to keep from reading past
2608                          * the end of the last physical block associated with the file
2609                          */
2610                         upl_offset = start_pg * PAGE_SIZE;
2611                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2612
2613                         if ((upl_f_offset + upl_offset + io_size) > filesize)
2614                                 io_size = filesize - (upl_f_offset + upl_offset);
2615
2616                         /*
2617                          * issue an asynchronous read to cluster_io
2618                          */
2619
2620                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2621                                            io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
2622                 }
2623                 if (error == 0) {
2624                         /*
2625                          * if the read completed successfully, or there was no I/O request
2626                          * issued, than copy the data into user land via 'cluster_upl_copy_data'
2627                          * we'll first add on any 'valid'
2628                          * pages that were present in the upl when we acquired it.
2629                          */
2630                         u_int  val_size;
2631
2632                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2633                                 if (!upl_valid_page(pl, uio_last))
2634                                         break;
2635                         }
2636                         /*
2637                          * compute size to transfer this round,  if uio->uio_resid is
2638                          * still non-zero after this attempt, we'll loop around and
2639                          * set up for another I/O.
2640                          */
2641                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2642
2643                         if (val_size > max_size)
2644                                 val_size = max_size;
2645
2646                         if (val_size > uio->uio_resid)
2647                                 val_size = uio->uio_resid;
2648
2649                         if (last_ioread_offset == 0)
2650                                 last_ioread_offset = uio->uio_offset + val_size;
2651
2652                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2653                                 /*
2654                                  * if there's still I/O left to do for this request, and...
2655                                  * we're not in hard throttle mode, then issue a
2656                                  * pre-fetch I/O... the I/O latency will overlap
2657                                  * with the copying of the data
2658                                  */
2659                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2660
2661                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2662
2663                                 if (last_ioread_offset > last_request_offset)
2664                                         last_ioread_offset = last_request_offset;
2665
2666                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
2667                                 /*
2668                                  * this transfer will finish this request, so...
2669                                  * let's try to read ahead if we're in
2670                                  * a sequential access pattern and we haven't
2671                                  * explicitly disabled it
2672                                  */
2673                                 if (rd_ahead_enabled)
2674                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2675
2676                                 if (e_lblkno < vp->v_lastr)
2677                                         vp->v_maxra = 0;
2678                                 vp->v_lastr = e_lblkno;
2679                         }
2680                         while (iostate.io_issued != iostate.io_completed) {
2681                                 iostate.io_wanted = 1;
2682                                 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
2683                         }
2684                         if (iostate.io_error)
2685                                 error = iostate.io_error;
2686                         else
2687                                 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
2688                 }
2689                 if (start_pg < last_pg) {
2690                         /*
2691                          * compute the range of pages that we actually issued an I/O for
2692                          * and either commit them as valid if the I/O succeeded
2693                          * or abort them if the I/O failed
2694                          */
2695                         io_size = (last_pg - start_pg) * PAGE_SIZE;
2696
2697                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2698                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2699
2700                         if (error || (vp->v_flag & VNOCACHE_DATA))
2701                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2702                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2703                         else
2704                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2705                                                      UPL_COMMIT_CLEAR_DIRTY |
2706                                                      UPL_COMMIT_FREE_ON_EMPTY |
2707                                                      UPL_COMMIT_INACTIVATE);
2708
2709                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2710                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2711                 }
2712                 if ((last_pg - start_pg) < pages_in_upl) {
2713                         int cur_pg;
2714                         int commit_flags;
2715
2716                         /*
2717                          * the set of pages that we issued an I/O for did not encompass
2718                          * the entire upl... so just release these without modifying
2719                          * their state
2720                          */
2721                         if (error)
2722                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2723                         else {
2724                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2725                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2726
2727                                 if (start_pg) {
2728                                         /*
2729                                          * we found some already valid pages at the beginning of
2730                                          * the upl commit these back to the inactive list with
2731                                          * reference cleared
2732                                          */
2733                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2734                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2735                                                                    | UPL_COMMIT_INACTIVATE;
2736
2737                                                 if (upl_dirty_page(pl, cur_pg))
2738                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2739
2740                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2741                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2742                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2743                                                 else
2744                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2745                                                                 PAGE_SIZE, commit_flags);
2746                                         }
2747                                 }
2748                                 if (last_pg < uio_last) {
2749                                         /*
2750                                          * we found some already valid pages immediately after the
2751                                          * pages we issued I/O for, commit these back to the
2752                                          * inactive list with reference cleared
2753                                          */
2754                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2755                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
2756                                                                                 | UPL_COMMIT_INACTIVATE;
2757
2758                                                 if (upl_dirty_page(pl, cur_pg))
2759                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2760
2761                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2762                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2763                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2764                                                 else
2765                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2766                                                                 PAGE_SIZE, commit_flags);
2767                                         }
2768                                 }
2769                                 if (uio_last < pages_in_upl) {
2770                                         /*
2771                                          * there were some invalid pages beyond the valid pages
2772                                          * that we didn't issue an I/O for, just release them
2773                                          * unchanged
2774                                          */
2775                                         ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2776                                                             (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2777                                 }
2778
2779                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2780                                         (int)upl, -1, -1, 0, 0);
2781                         }
2782                 }
2783                 if (retval == 0)
2784                         retval = error;
2785         }
2786         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2787                      (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2788
2789         return (retval);
2790 }
2791
2792
/*
 * cluster_nocopy_read
 *
 * Services a read by issuing asynchronous cluster_io() requests against
 * UPLs built over the caller's own buffer (vm_map_get_upl() on
 * current_map() at iov->iov_base), so the device read targets the
 * caller's pages.  Before each such request, cluster_copy_ubc_data() is
 * given the chance to satisfy the front of the request first.
 *
 * Every read issued here is tracked in a local 'struct clios'; the routine
 * does not return until iostate.io_issued == iostate.io_completed.
 *
 * Returns the error from cluster_copy_ubc_data() or cluster_io(), replaced
 * by iostate.io_error when that is set, else 0.  A 0 return does not mean
 * uio->uio_resid reached 0: several early exits leave retval at 0.
 */
2793 static int
2794 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2795         struct vnode *vp;              /* vnode to read from */
2796         struct uio   *uio;             /* request; offset, resid and iov are advanced as reads are issued */
2797         off_t         filesize;        /* reads are clipped so they do not extend past this offset */
2798         int           devblocksize;    /* handed through to cluster_io() */
2799         int           flags;           /* not referenced in this routine */
2800 {
2801         upl_t            upl;
2802         upl_page_info_t  *pl;
2803         vm_offset_t      upl_offset;
2804         off_t            max_io_size;
2805         int              io_size;
2806         int              upl_size;
2807         int              upl_needed_size;
2808         int              pages_in_pl;
2809         int              upl_flags;
2810         kern_return_t    kret;
2811         struct iovec     *iov;
2812         int              i;
2813         int              force_data_sync;
2814         int              retval = 0;
2815         struct clios     iostate;
2816         u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;       /* cap on the size of one read request */
2817         u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;   /* cap on (io_issued - io_completed); presumably bytes -- confirm */
2818
2819
2820         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2821                      (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2822
2823         /*
2824          * When we enter this routine, we know
2825          *  -- the offset into the file is on a pagesize boundary
2826          *  -- the resid is a page multiple
2827          *  -- the resid will not exceed iov_len
2828          */
2829
2830         iostate.io_completed = 0;      /* start with clean accounting for the reads issued below */
2831         iostate.io_issued = 0;
2832         iostate.io_error = 0;
2833         iostate.io_wanted = 0;
2834
2835         iov = uio->uio_iov;            /* set once; this is the only iovec the routine touches */
2836
             /*
              * under hard throttle, shrink both the per-request size and
              * the limit on outstanding reads
              */
2837         if (cluster_hard_throttle_on(vp)) {
2838                 max_rd_size  = HARD_THROTTLE_MAXSIZE;
2839                 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
2840         }
             /*
              * one pass per read request; the loop ends when resid is used
              * up, the offset reaches filesize, or retval becomes non-zero
              */
2841         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2842
2843                 max_io_size = filesize - uio->uio_offset;
2844
                     /* io_size = min(bytes left in the file, uio_resid) */
2845                 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2846                         io_size = max_io_size;
2847                 else
2848                         io_size = uio->uio_resid;
2849
2850                 /*
2851                  * First look for pages already in the cache
2852                  * and move them to user space.
2853                  * io_size is passed by reference; the checks below treat
                      * the value left in it as the amount still to be read
                      * (confirm against cluster_copy_ubc_data)
                      */
2854                 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
2855
2856                 if (retval) {
2857                         /*
2858                          * we may have already spun some portion of this request
2859                          * off as async requests... we need to wait for the I/O
2860                          * to complete before returning
2861                          */
2862                         goto wait_for_reads;
2863                 }
2864                 /*
2865                  * If we are already finished with this read, then return
2866                  */
2867                 if (io_size == 0) {
2868                         /*
2869                          * we may have already spun some portion of this request
2870                          * off as async requests... we need to wait for the I/O
2871                          * to complete before returning
2872                          */
2873                         goto wait_for_reads;
2874                 }
2875                 max_io_size = io_size;
2876
2877                 if (max_io_size > max_rd_size)
2878                         max_io_size = max_rd_size;
2879
2880                 io_size = 0;
2881
                     /*
                      * ubc_range_op() reports through io_size how much of
                      * [uio_offset, uio_offset + max_io_size) to read; with
                      * UPL_ROP_ABSENT this is presumably the leading run of
                      * pages not resident in the cache -- verify against
                      * ubc_range_op
                      */
2882                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
2883
2884                 if (io_size == 0)
2885                         /*
2886                          * we may have already spun some portion of this request
2887                          * off as async requests... we need to wait for the I/O
2888                          * to complete before returning
2889                          */
2890                         goto wait_for_reads;
2891
                     /*
                      * upl_offset is the offset of the user buffer within its
                      * first page; upl_needed_size is that offset plus io_size
                      * rounded up to a whole number of pages
                      */
2892                 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
2893                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2894
2895                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2896                              (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2897
                     /*
                      * make up to 3 attempts to get a UPL over the user buffer,
                      * passing the attempt number as the last argument of
                      * vm_map_get_upl()... a UPL is accepted only if
                      * upl_valid_page() holds for every page in it, otherwise
                      * it is released and the next attempt is made
                      */
2898                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2899                         pages_in_pl = 0;
2900                         upl_size = upl_needed_size;
2901                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2902
2903                         kret = vm_map_get_upl(current_map(),
2904                                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2905                                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2906
2907                         if (kret != KERN_SUCCESS) {
2908                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2909                                              (int)upl_offset, upl_size, io_size, kret, 0);
2910                                 /*
2911                                  * cluster_nocopy_read: failed to get pagelist
2912                                  *
2913                                  * we may have already spun some portion of this request
2914                                  * off as async requests... we need to wait for the I/O
2915                                  * to complete before returning
2916                                  */
2917                                 goto wait_for_reads;
2918                         }
2919                         pages_in_pl = upl_size / PAGE_SIZE;    /* recomputed from the size actually granted */
2920                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2921
2922                         for (i = 0; i < pages_in_pl; i++) {
2923                                 if (!upl_valid_page(pl, i))
2924                                         break;
2925                         }
2926                         if (i == pages_in_pl)
2927                                 break;
2928
                             /*
                              * upl_offset was masked with PAGE_MASK above, so
                              * (upl_offset & ~PAGE_MASK) evaluates to 0 here,
                              * assuming PAGE_MASK is PAGE_SIZE - 1
                              */
2929                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2930                                             UPL_ABORT_FREE_ON_EMPTY);
2931                 }
2932                 if (force_data_sync >= 3) {
                             /*
                              * every attempt produced a UPL containing a non-valid
                              * page; the last one was already released in the loop
                              */
2933                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2934                                      (int)upl_offset, upl_size, io_size, kret, 0);
2935
2936                         goto wait_for_reads;
2937                 }
2938                 /*
2939                  * Consider the possibility that upl_size wasn't satisfied.
2940                  */
2941                 if (upl_size != upl_needed_size)
2942                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;   /* whole pages that fit in the UPL we got */
2943
2944                 if (io_size == 0) {
2945                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2946                                             UPL_ABORT_FREE_ON_EMPTY);
2947                         goto wait_for_reads;
2948                 }
2949                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2950                              (int)upl_offset, upl_size, io_size, kret, 0);
2951
2952                 /*
2953                  * request asynchronously so that we can overlap
2954                  * the preparation of the next I/O
2955                  * if there are already too many outstanding reads
2956                  * wait until some have completed before issuing the next read
2957                  */
2958                 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
2959                         iostate.io_wanted = 1;
                             /* the matching wakeup is not visible in this routine; presumably the I/O completion path issues it */
2960                         tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2961                 }
2962                 if (iostate.io_error) {
2963                         /*
2964                          * one of the earlier reads we issued ran into a hard error
2965                          * don't issue any more reads, cleanup the UPL
2966                          * that was just created but not used, then
2967                          * go wait for any other reads to complete before
2968                          * returning the error to the caller
2969                          */
2970                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2971                                             UPL_ABORT_FREE_ON_EMPTY);
2972
2973                         goto wait_for_reads;
2974                 }
2975                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2976                              (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
2977
2978                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2979                                    io_size, devblocksize,
2980                                    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2981                                    (struct buf *)0, &iostate);
2982
2983                 /*
2984                  * update the uio structure
2985                  * NOTE(review): this happens even when cluster_io() returned
                      * an error, whereas cluster_phys_read only advances the uio
                      * on success -- confirm this is intended
                      */
2986                 iov->iov_base   += io_size;
2987                 iov->iov_len    -= io_size;
2988                 uio->uio_resid  -= io_size;
2989                 uio->uio_offset += io_size;
2990
2991                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2992                              (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2993
2994         } /* end while */
2995
2996 wait_for_reads:
2997         /*
2998          * make sure all async reads that are part of this stream
2999          * have completed before we return
3000          */
3001         while (iostate.io_issued != iostate.io_completed) {
3002                 iostate.io_wanted = 1;
3003                 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
3004         }
3005         if (iostate.io_error)
3006                 retval = iostate.io_error;     /* an error recorded in iostate overrides whatever retval held */
3007
3008         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3009                      (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
3010
3011         return (retval);
3012 }
3013
3014
/*
 * cluster_phys_read
 *
 * Read into a target buffer that the caller guarantees is physically
 * contiguous (see the entry conditions below).  One UPL is taken over the
 * caller's buffer and the transfer is split into three parts:
 *   - a head, handled by cluster_align_phys_io(), for as long as the file
 *     offset is not aligned to devblocksize or fewer than devblocksize
 *     bytes remain
 *   - a body, issued as asynchronous cluster_io() reads of at most
 *     MAX_UPL_TRANSFER * PAGE_SIZE bytes each
 *   - a tail of (io_size & (devblocksize - 1)) bytes, handled by
 *     cluster_align_phys_io() once all body reads have completed
 * The (devblocksize - 1) masks assume devblocksize is a power of 2.
 *
 * Returns 0 on success, EINVAL if the UPL could not be obtained or came
 * back smaller than needed, otherwise the error reported by
 * cluster_align_phys_io(), cluster_io() or iostate.io_error (io_error
 * overrides an error from cluster_io()).
 */
3015 static int
3016 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
3017         struct vnode *vp;             /* vnode to read from */
3018         struct uio   *uio;            /* request; advanced here for the body and passed to cluster_align_phys_io() for head/tail */
3019         off_t        filesize;        /* the read is clipped so it does not extend past this offset */
3020         int          devblocksize;    /* alignment unit used to carve out the head and tail */
3021         int          flags;           /* not referenced in this routine */
3022 {
3023         upl_page_info_t *pl;
3024         upl_t            upl;
3025         vm_offset_t      upl_offset;
3026         addr64_t         dst_paddr;
3027         off_t            max_size;
3028         int              io_size;
3029         int              tail_size;
3030         int              upl_size;
3031         int              upl_needed_size;
3032         int              pages_in_pl;
3033         int              upl_flags;
3034         kern_return_t    kret;
3035         struct iovec     *iov;
3036         struct clios     iostate;
3037         int              error;
3038
3039         /*
3040          * When we enter this routine, we know
3041          *  -- the resid will not exceed iov_len
3042          *  -- the target address is physically contiguous
3043          */
3044
3045         iov = uio->uio_iov;
3046
3047         max_size = filesize - uio->uio_offset;
3048
             /* io_size = min(iov_len, bytes left in the file) */
3049         if (max_size > (off_t)((unsigned int)iov->iov_len))
3050                 io_size = iov->iov_len;
3051         else
3052                 io_size = max_size;
3053
3054         upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
3055         upl_needed_size = upl_offset + io_size;        /* not rounded up to a page multiple here */
3056
3057         error       = 0;
3058         pages_in_pl = 0;
3059         upl_size = upl_needed_size;
3060         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3061
3062         kret = vm_map_get_upl(current_map(),
3063                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
3064                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3065
3066         if (kret != KERN_SUCCESS) {
3067                 /*
3068                  * cluster_phys_read: failed to get pagelist
3069                  */
3070                 return(EINVAL);
3071         }
3072         if (upl_size < upl_needed_size) {
3073                 /*
3074                  * The upl_size wasn't satisfied.
3075                  */
3076                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3077
3078                 return(EINVAL);
3079         }
3080         pl = ubc_upl_pageinfo(upl);
3081
             /*
              * physical destination = first page of the UPL shifted left by 12,
              * plus the buffer's offset within that page
              * NOTE(review): the literal 12 presumably stands for the page
              * shift (4K pages) -- confirm
              */
3082         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
3083
             /*
              * head: runs while the file offset is not devblocksize aligned or
              * less than one device block remains... the loop condition re-reads
              * uio->uio_offset but nothing in the loop body changes it, so
              * cluster_align_phys_io() presumably advances the uio -- confirm
              */
3084         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3085                 int   head_size;
3086
3087                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3088
3089                 if (head_size > io_size)
3090                         head_size = io_size;
3091
3092                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
3093
3094                 if (error) {
3095                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3096
                             /* hand back the helper's own error code, as the tail path at the end of this routine does */
3097                         return(error);
3098                 }
3099                 upl_offset += head_size;
3100                 dst_paddr  += head_size;
3101                 io_size    -= head_size;
3102         }
             /* peel off the trailing partial device block; it is read after the body completes */
3103         tail_size = io_size & (devblocksize - 1);
3104         io_size  -= tail_size;
3105
3106         iostate.io_completed = 0;
3107         iostate.io_issued = 0;
3108         iostate.io_error = 0;
3109         iostate.io_wanted = 0;
3110
             /* body: stops issuing as soon as cluster_io() reports an error */
3111         while (io_size && error == 0) {
3112                 int  xsize;
3113
3114                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3115                         xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3116                 else
3117                         xsize = io_size;
3118                 /*
3119                  * request asynchronously so that we can overlap
3120                  * the preparation of the next I/O... we'll do
3121                  * the commit after all the I/O has completed
3122                  * since its all issued against the same UPL
3123                  * if there are already too many outstanding reads
3124                  * wait until some have completed before issuing the next
3125                  */
3126                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3127                         iostate.io_wanted = 1;
3128                         tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3129                 }
3130
                     /* note: 0 is passed in the argument position where cluster_nocopy_read passes devblocksize */
3131                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3132                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3133                                    (struct buf *)0, &iostate);
3134                 /*
3135                  * The cluster_io read was issued successfully,
3136                  * update the uio structure
3137                  */
3138                 if (error == 0) {
3139                         uio->uio_resid  -= xsize;
3140                         iov->iov_len    -= xsize;
3141                         iov->iov_base   += xsize;
3142                         uio->uio_offset += xsize;
3143                         dst_paddr       += xsize;
3144                         upl_offset      += xsize;
3145                         io_size         -= xsize;
3146                 }
3147         }
3148         /*
3149          * make sure all async reads that are part of this stream
3150          * have completed before we proceed
3151          */
3152         while (iostate.io_issued != iostate.io_completed) {
3153                 iostate.io_wanted = 1;
3154                 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3155         }
3156         if (iostate.io_error) {
3157                 error = iostate.io_error;      /* overrides any error returned by cluster_io() above */
3158         }
             /* tail: only attempted when everything before it succeeded */
3159         if (error == 0 && tail_size)
3160                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3161
3162         /*
3163          * just release our hold on the physically contiguous
3164          * region without changing any state
3165          */
3166         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3167
3168         return (error);
3169 }
3170
3171
3172 /*
3173  * generate advisory I/O's in the largest chunks possible
3174  * the completed pages will be released into the VM cache
3175  */
3176 int
3177 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3178         struct vnode *vp;
3179         off_t         filesize;
3180         off_t         f_offset;
3181         int           resid;
3182         int           devblocksize;
3183 {
3184         upl_page_info_t *pl;
3185         upl_t            upl;
3186         vm_offset_t      upl_offset;
3187         int              upl_size;
3188         off_t            upl_f_offset;
3189         int              start_offset;
3190         int              start_pg;
3191         int              last_pg;
3192         int              pages_in_upl;
3193         off_t            max_size;
3194         int              io_size;
3195         kern_return_t    kret;
3196         int              retval = 0;
3197         int              issued_io;
3198         int              skip_range;
3199
3200         if (!UBCINFOEXISTS(vp))
3201                 return(EINVAL);
3202
3203         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3204                      (int)f_offset, resid, (int)filesize, devblocksize, 0);
3205
3206         while (resid && f_offset < filesize && retval == 0) {
3207                 /*
3208                  * compute the size of the upl needed to encompass
3209                  * the requested read... limit each call to cluster_io
3210                  * to the maximum UPL size... cluster_io will clip if
3211                  * this exceeds the maximum io_size for the device,
3212                  * make sure to account for
3213                  * a starting offset that's not page aligned
3214                  */
3215                 start_offset = (int)(f_offset & PAGE_MASK_64);
3216                 upl_f_offset = f_offset - (off_t)start_offset;
3217                 max_size     = filesize - f_offset;
3218
3219                 if (resid < max_size)
3220                         io_size = resid;
3221                 else
3222                         io_size = max_size;
3223
3224                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3225                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3226                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3227
3228                 skip_range = 0;
3229                 /*
3230                  * return the number of contiguously present pages in the cache
3231                  * starting at upl_f_offset within the file
3232                  */
3233                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3234
3235                 if (skip_range) {
3236                         /*
3237                          * skip over pages already present in the cache
3238                          */
3239                         io_size = skip_range - start_offset;
3240
3241                         f_offset += io_size;
3242                         resid    -= io_size;
3243
3244                         if (skip_range == upl_size)
3245                                 continue;
3246                         /*
3247                          * have to issue some real I/O
3248                          * at this point, we know it's starting on a page boundary
3249                          * because we've skipped over at least the first page in the request
3250                          */
3251                         start_offset = 0;
3252                         upl_f_offset += skip_range;
3253                         upl_size     -= skip_range;
3254                 }
3255                 pages_in_upl = upl_size / PAGE_SIZE;
3256
3257                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3258                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3259
3260                 kret = ubc_create_upl(vp,
3261                                                 upl_f_offset,
3262                                                 upl_size,
3263                                                 &upl,
3264                                                 &pl,
3265                                                 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3266                 if (kret != KERN_SUCCESS)
3267                         return(retval);
3268                 issued_io = 0;
3269
3270                 /*
3271                  * before we start marching forward, we must make sure we end on
3272                  * a present page, otherwise we will be working with a freed
3273                  * upl
3274                  */
3275                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3276                         if (upl_page_present(pl, last_pg))
3277                                 break;
3278                 }
3279                 pages_in_upl = last_pg + 1;
3280
3281
3282                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3283                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3284
3285
3286                 for (last_pg = 0; last_pg < pages_in_upl; ) {
3287                         /*
3288                          * scan from the beginning of the upl looking for the first
3289                          * page that is present.... this will become the first page in
3290                          * the request we're going to make to 'cluster_io'... if all
3291                          * of the pages are absent, we won't call through to 'cluster_io'
3292                          */
3293                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3294                                 if (upl_page_present(pl, start_pg))
3295                                         break;
3296                         }
3297
3298                         /*
3299                          * scan from the starting present page looking for an absent
3300                          * page before the end of the upl is reached, if we
3301                          * find one, then it will terminate the range of pages being
3302                          * presented to 'cluster_io'
3303                          */
3304                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3305                                 if (!upl_page_present(pl, last_pg))
3306                                         break;
3307                         }
3308
3309                         if (last_pg > start_pg) {
3310                                 /*
3311                                  * we found a range of pages that must be filled
3312                                  * if the last page in this range is the last page of the file
3313                                  * we may have to clip the size of it to keep from reading past
3314                                  * the end of the last physical block associated with the file
3315                                  */
3316                                 upl_offset = start_pg * PAGE_SIZE;
3317                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
3318
3319                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
3320                                         io_size = filesize - (upl_f_offset + upl_offset);
3321
3322                                 /*
3323                                  * issue an asynchronous read to cluster_io
3324                                  */
3325                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3326                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3327
3328                                 issued_io = 1;
3329                         }
3330                 }
3331                 if (issued_io == 0)
3332                         ubc_upl_abort(upl, 0);
3333
3334                 io_size = upl_size - start_offset;
3335
3336                 if (io_size > resid)
3337                         io_size = resid;
3338                 f_offset += io_size;
3339                 resid    -= io_size;
3340         }
3341
3342         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3343                      (int)f_offset, resid, retval, 0, 0);
3344
3345         return(retval);
3346 }
3347
3348
3349 int
3350 cluster_push(vp)
3351         struct vnode *vp;
3352 {
3353         int  retval;
3354
3355         if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))
3356                 return(0);
3357
3358         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3359                      vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3360
3361         if (vp->v_flag & VHASDIRTY) {
3362                 sparse_cluster_push(vp, ubc_getsize(vp), 1);
3363
3364                 vp->v_clen = 0;
3365                 retval = 1;
3366         } else
3367                 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3368
3369         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3370                      vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3371
3372         return (retval);
3373 }
3374
3375
3376 int
3377 cluster_release(vp)
3378         struct vnode *vp;
3379 {
3380         off_t offset;
3381         u_int length;
3382
3383         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3384
3385         if (vp->v_flag & VHASDIRTY) {
3386                 vfs_drt_control(&(vp->v_scmap), 0);
3387
3388                 vp->v_flag &= ~VHASDIRTY;
3389         }
3390         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3391 }
3392
3393
3394 static int
3395 cluster_try_push(vp, EOF, can_delay, push_all)
3396         struct vnode *vp;
3397         off_t  EOF;
3398         int    can_delay;
3399         int    push_all;
3400 {
3401         int cl_index;
3402         int cl_index1;
3403         int min_index;
3404         int cl_len;
3405         int cl_total;
3406         int cl_pushed = 0;
3407         struct v_cluster l_clusters[MAX_CLUSTERS];
3408
3409         /*
3410          * make a local 'sorted' copy of the clusters
3411          * and clear vp->v_clen so that new clusters can
3412          * be developed
3413          */
3414         for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3415                 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3416                         if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3417                                 continue;
3418                         if (min_index == -1)
3419                                 min_index = cl_index1;
3420                         else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3421                                 min_index = cl_index1;
3422                 }
3423                 if (min_index == -1)
3424                         break;
3425                 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3426                 l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;
3427
3428                 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3429         }
3430         cl_len     = cl_index;
3431         vp->v_clen = 0;
3432
3433         if (can_delay && cl_len == MAX_CLUSTERS) {
3434                 int   i;
3435
3436                 /*
3437                  * determine if we appear to be writing the file sequentially
3438                  * if not, by returning without having pushed any clusters
3439                  * we will cause this vnode to be pushed into the sparse cluster mechanism
3440                  * used for managing more random I/O patterns
3441                  *
3442                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3443                  * that's why we're in try_push with can_delay true...
3444                  *
3445                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3446                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3447                  * so we can just make a simple pass through up, to but not including the last one...
3448                  * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
3449                  * are sequential
3450                  *
3451                  * we let the last one be partial as long as it was adjacent to the previous one...
3452                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3453                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3454                  */
3455                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3456                         if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
3457                                 goto dont_try;
3458                         if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
3459                                 goto dont_try;
3460                 }
3461         }
3462         for (cl_index = 0; cl_index < cl_len; cl_index++) {
3463                 /*
3464                  * try to push each cluster in turn...  cluster_push_x may not
3465                  * push the cluster if can_delay is TRUE and the cluster doesn't
3466                  * meet the critera for an immediate push
3467                  */
3468                 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3469                         l_clusters[cl_index].start_pg = 0;
3470                         l_clusters[cl_index].last_pg  = 0;
3471
3472                         cl_pushed++;
3473
3474                         if (push_all == 0)
3475                                 break;
3476                 }
3477         }
3478 dont_try:
3479         if (cl_len > cl_pushed) {
3480                /*
3481                 * we didn't push all of the clusters, so
3482                 * lets try to merge them back in to the vnode
3483                 */
3484                 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3485                         /*
3486                          * we picked up some new clusters while we were trying to
3487                          * push the old ones (I don't think this can happen because
3488                          * I'm holding the lock, but just in case)... the sum of the
3489                          * leftovers plus the new cluster count exceeds our ability
3490                          * to represent them, so switch to the sparse cluster mechanism
3491                          */
3492
3493                         /*
3494                          * first collect the new clusters sitting in the vp
3495                          */
3496                         sparse_cluster_switch(vp, EOF);
3497
3498                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3499                                 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3500                                         continue;
3501                                 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3502                                 vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
3503
3504                                 cl_index1++;
3505                         }
3506                         /*
3507                          * update the cluster count
3508                          */
3509                         vp->v_clen = cl_index1;
3510
3511                         /*
3512                          * and collect the original clusters that were moved into the
3513                          * local storage for sorting purposes
3514                          */
3515                         sparse_cluster_switch(vp, EOF);
3516
3517                 } else {
3518                         /*
3519                          * we've got room to merge the leftovers back in
3520                          * just append them starting at the next 'hole'
3521                          * represented by vp->v_clen
3522                          */
3523                         for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3524                                 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3525                                         continue;
3526
3527                                 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3528                                 vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
3529
3530                                 cl_index1++;
3531                         }
3532                         /*
3533                          * update the cluster count
3534                          */
3535                         vp->v_clen = cl_index1;
3536                 }
3537         }
3538         return(MAX_CLUSTERS - vp->v_clen);
3539 }
3540
3541
3542
3543 static int
3544 cluster_push_x(vp, EOF, first, last, can_delay)
3545         struct vnode *vp;
3546         off_t  EOF;
3547         unsigned int first;
3548         unsigned int last;
3549         int    can_delay;
3550 {
3551         upl_page_info_t *pl;
3552         upl_t            upl;
3553         vm_offset_t      upl_offset;
3554         int              upl_size;
3555         off_t            upl_f_offset;
3556         int              pages_in_upl;
3557         int              start_pg;
3558         int              last_pg;
3559         int              io_size;
3560         int              io_flags;
3561         int              upl_flags;
3562         int              size;
3563         kern_return_t    kret;
3564
3565
3566         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3567                      vp->v_clen, first, last, EOF, 0);
3568
3569         if ((pages_in_upl = last - first) == 0) {
3570                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3571
3572                 return (1);
3573         }
3574         upl_size = pages_in_upl * PAGE_SIZE;
3575         upl_f_offset = (off_t)((unsigned long long)first * PAGE_SIZE_64);
3576
3577         if (upl_f_offset + upl_size >= EOF) {
3578
3579                 if (upl_f_offset >= EOF) {
3580                         /*
3581                          * must have truncated the file and missed
3582                          * clearing a dangling cluster (i.e. it's completely
3583                          * beyond the new EOF
3584                          */
3585                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3586
3587                         return(1);
3588                 }
3589                 size = EOF - upl_f_offset;
3590
3591                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3592                 pages_in_upl = upl_size / PAGE_SIZE;
3593         } else
3594                 size = upl_size;
3595
3596         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
3597
3598         if (vp->v_flag & VNOCACHE_DATA)
3599                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
3600         else
3601                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
3602
3603         kret = ubc_create_upl(vp,
3604                                 upl_f_offset,
3605                                 upl_size,
3606                                 &upl,
3607                                 &pl,
3608                                 upl_flags);
3609         if (kret != KERN_SUCCESS)
3610                 panic("cluster_push: failed to get pagelist");
3611
3612         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
3613
3614         /*
3615          * since we only asked for the dirty pages back
3616          * it's possible that we may only get a few or even none, so...
3617          * before we start marching forward, we must make sure we know
3618          * where the last present page is in the UPL, otherwise we could
3619          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3620          * employed by commit_range and abort_range.
3621          */
3622         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3623                 if (upl_page_present(pl, last_pg))
3624                         break;
3625         }
3626         pages_in_upl = last_pg + 1;
3627
3628         if (pages_in_upl == 0) {
3629                 ubc_upl_abort(upl, 0);
3630
3631                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
3632                 return(1);
3633         }
3634
3635         for (last_pg = 0; last_pg < pages_in_upl; ) {
3636                 /*
3637                  * find the next dirty page in the UPL
3638                  * this will become the first page in the
3639                  * next I/O to generate
3640                  */
3641                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3642                         if (upl_dirty_page(pl, start_pg))
3643                                 break;
3644                         if (upl_page_present(pl, start_pg))
3645                                 /*
3646                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
3647                                  * just release these unchanged since we're not going
3648                                  * to steal them or change their state
3649                                  */
3650                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3651                 }
3652                 if (start_pg >= pages_in_upl)
3653                         /*
3654                          * done... no more dirty pages to push
3655                          */
3656                         break;
3657                 if (start_pg > last_pg)
3658                         /*
3659                          * skipped over some non-dirty pages
3660                          */
3661                         size -= ((start_pg - last_pg) * PAGE_SIZE);
3662
3663                 /*
3664                  * find a range of dirty pages to write
3665                  */
3666                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3667                         if (!upl_dirty_page(pl, last_pg))
3668                                 break;
3669                 }
3670                 upl_offset = start_pg * PAGE_SIZE;
3671
3672                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3673
3674                 if (vp->v_flag & VNOCACHE_DATA)
3675                         io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
3676                 else
3677                         io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;
3678
3679                 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3680
3681                 size -= io_size;
3682         }
3683         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3684
3685         return(1);
3686 }
3687
3688
3689 static int
3690 sparse_cluster_switch(struct vnode *vp, off_t EOF)
3691 {
3692         int cl_index;
3693
3694         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3695
3696         if ( !(vp->v_flag & VHASDIRTY)) {
3697                 vp->v_flag |= VHASDIRTY;
3698                 vp->v_scdirty = 0;
3699                 vp->v_scmap   = 0;
3700         }
3701         for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3702                 int    flags;
3703                 int    start_pg;
3704                 int    last_pg;
3705
3706                 for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {
3707
3708                         if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
3709                                 if (flags & UPL_POP_DIRTY)
3710                                         sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
3711                         }
3712                 }
3713         }
3714         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3715 }
3716
3717
3718 static int
3719 sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
3720 {
3721         unsigned int first;
3722         unsigned int last;
3723         off_t offset;
3724         u_int length;
3725
3726         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);
3727
3728         if (push_all)
3729                 vfs_drt_control(&(vp->v_scmap), 1);
3730
3731         for (;;) {
3732                 if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
3733                         vp->v_flag &= ~VHASDIRTY;
3734                         vp->v_clen = 0;
3735                         break;
3736                 }
3737                 first = (unsigned int)(offset / PAGE_SIZE_64);
3738                 last  = (unsigned int)((offset + length) / PAGE_SIZE_64);
3739
3740                 cluster_push_x(vp, EOF, first, last, 0);
3741
3742                 vp->v_scdirty -= (last - first);
3743
3744                 if (push_all == 0)
3745                         break;
3746         }
3747         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3748 }
3749
3750
3751 static int
3752 sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
3753 {
3754         u_int new_dirty;
3755         u_int length;
3756         off_t offset;
3757
3758         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);
3759
3760         offset = (off_t)first * PAGE_SIZE_64;
3761         length = (last - first) * PAGE_SIZE;
3762
3763         while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
3764                 /*
3765                  * no room left in the map
3766                  * only a partial update was done
3767                  * push out some pages and try again
3768                  */
3769                 vp->v_scdirty += new_dirty;
3770
3771                 sparse_cluster_push(vp, EOF, 0);
3772
3773                 offset += (new_dirty * PAGE_SIZE_64);
3774                 length -= (new_dirty * PAGE_SIZE);
3775         }
3776         vp->v_scdirty += new_dirty;
3777
3778         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3779 }
3780
3781
3782 static int
3783 cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3784 {
3785         struct iovec     *iov;
3786         upl_page_info_t  *pl;
3787         upl_t            upl;
3788         addr64_t         ubc_paddr;
3789         kern_return_t    kret;
3790         int              error = 0;
3791
3792         iov = uio->uio_iov;
3793
3794         kret = ubc_create_upl(vp,
3795                               uio->uio_offset & ~PAGE_MASK_64,
3796                               PAGE_SIZE,
3797                               &upl,
3798                               &pl,
3799                               UPL_SET_LITE);
3800
3801         if (kret != KERN_SUCCESS)
3802                 return(EINVAL);
3803
3804         if (!upl_valid_page(pl, 0)) {
3805                 /*
3806                  * issue a synchronous read to cluster_io
3807                  */
3808                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3809                                    CL_READ, (struct buf *)0, (struct clios *)0);
3810                 if (error) {
3811                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3812
3813                           return(error);
3814                 }
3815         }
3816         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
3817
3818 /*
3819  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
3820  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
3821  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
3822  *      way to do so without exporting them to kexts as well.
3823  */
3824         if (flags & CL_READ)
3825 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
3826                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
3827         else
3828 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
3829                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
3830
3831         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
3832                 /*
3833                  * issue a synchronous write to cluster_io
3834                  */
3835                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3836                                         0, (struct buf *)0, (struct clios *)0);
3837         }
3838         if (error == 0) {
3839                 uio->uio_offset += xsize;
3840                 iov->iov_base   += xsize;
3841                 iov->iov_len    -= xsize;
3842                 uio->uio_resid  -= xsize;
3843         }
3844         ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3845
3846         return (error);
3847 }
3848
3849
3850
3851 int
3852 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
3853 {
3854         int       pg_offset;
3855         int       pg_index;
3856         int       csize;
3857         int       segflg;
3858         int       retval = 0;
3859         upl_page_info_t *pl;
3860         boolean_t funnel_state = FALSE;
3861
3862
3863         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3864                      (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);
3865
3866         if (xsize >= (16 * 1024))
3867                 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3868
3869         segflg = uio->uio_segflg;
3870
3871         switch(segflg) {
3872
3873           case UIO_USERSPACE:
3874           case UIO_USERISPACE:
3875                 uio->uio_segflg = UIO_PHYS_USERSPACE;
3876                 break;
3877
3878           case UIO_SYSSPACE:
3879                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3880                 break;
3881         }
3882         pl = ubc_upl_pageinfo(upl);
3883
3884         pg_index  = upl_offset / PAGE_SIZE;
3885         pg_offset = upl_offset & PAGE_MASK;
3886         csize     = min(PAGE_SIZE - pg_offset, xsize);
3887
3888         while (xsize && retval == 0) {
3889                 addr64_t  paddr;
3890
3891                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
3892
3893                 retval = uiomove64(paddr, csize, uio);
3894
3895                 pg_index += 1;
3896                 pg_offset = 0;
3897                 xsize    -= csize;
3898                 csize     = min(PAGE_SIZE, xsize);
3899         }
3900         uio->uio_segflg = segflg;
3901
3902         if (funnel_state == TRUE)
3903                 thread_funnel_set(kernel_flock, TRUE);
3904
3905         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3906                      (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
3907
3908         return (retval);
3909 }
3910
3911
3912 int
3913 cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
3914 {
3915         int       segflg;
3916         int       io_size;
3917         int       xsize;
3918         int       start_offset;
3919         off_t     f_offset;
3920         int       retval = 0;
3921         memory_object_control_t  control;
3922         int       op_flags = UPL_POP_SET | UPL_POP_BUSY;
3923         boolean_t funnel_state = FALSE;
3924
3925
3926         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3927                      (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);
3928
3929         control = ubc_getobject(vp, UBC_FLAGS_NONE);
3930         if (control == MEMORY_OBJECT_CONTROL_NULL) {
3931                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3932                              (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
3933
3934                 return(0);
3935         }
3936         if (mark_dirty)
3937                 op_flags |= UPL_POP_DIRTY;
3938
3939         segflg = uio->uio_segflg;
3940
3941         switch(segflg) {
3942
3943           case UIO_USERSPACE:
3944           case UIO_USERISPACE:
3945                 uio->uio_segflg = UIO_PHYS_USERSPACE;
3946                 break;
3947
3948           case UIO_SYSSPACE:
3949                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3950                 break;
3951         }
3952         io_size      = *io_resid;
3953         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3954         f_offset     = uio->uio_offset - start_offset;
3955         xsize        = min(PAGE_SIZE - start_offset, io_size);
3956
3957         while (io_size && retval == 0) {
3958                 ppnum_t pgframe;
3959
3960                 if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
3961                         break;
3962
3963                 if (funnel_state == FALSE && io_size >= (16 * 1024))
3964                         funnel_state = thread_funnel_set(kernel_flock, FALSE);
3965
3966                 retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);
3967
3968                 ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
3969
3970                 io_size     -= xsize;
3971                 start_offset = 0;
3972                 f_offset     = uio->uio_offset;
3973                 xsize        = min(PAGE_SIZE, io_size);
3974         }
3975         uio->uio_segflg = segflg;
3976         *io_resid       = io_size;
3977
3978         if (funnel_state == TRUE)
3979                 thread_funnel_set(kernel_flock, TRUE);
3980
3981         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3982                      (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
3983
3984         return(retval);
3985 }
3986
3987
3988 int
3989 is_file_clean(struct vnode *vp, off_t filesize)
3990 {
3991         off_t f_offset;
3992         int   flags;
3993         int   total_dirty = 0;
3994
3995         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
3996                 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
3997                         if (flags & UPL_POP_DIRTY) {
3998                                 total_dirty++;
3999                         }
4000                 }
4001         }
4002         if (total_dirty)
4003                 return(EINVAL);
4004
4005         return (0);
4006 }
4007
4008
4009
4010 /*
4011  * Dirty region tracking/clustering mechanism.
4012  *
4013  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4014  * dirty regions within a larger space (file).  It is primarily intended to
4015  * support clustering in large files with many dirty areas.
4016  *
4017  * The implementation assumes that the dirty regions are pages.
4018  *
4019  * To represent dirty pages within the file, we store bit vectors in a
4020  * variable-size circular hash.
4021  */
4022
/*
 * Bitvector size.  This is the number of pages grouped into a single
 * hashtable entry; each entry covers a region of the file that is
 * aligned to this many pages.
 */
#define DRT_BITVECTOR_PAGES             256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK depends on DRT_BITVECTOR_PAGES; the general formula
 * is  ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1).  The value below
 * hard-codes a region size of 0x100000 bytes (1MB).
 *
 * DRT_ALIGN_ADDRESS rounds a file offset down to the start of the
 * region that contains it.
 */
#define DRT_ADDRESS_MASK                (~(0x100000 - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
4038
/*
 * Hashtable address field handling.
 *
 * Each entry's dhe_control word holds two fields: the region address in
 * the bits selected by DRT_ADDRESS_MASK, and a page count in the
 * low-order bits selected by DRT_HASH_COUNT_MASK.  Sharing one word
 * between the two conserves space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  The all-ones
 * count (DRT_HASH_COUNT_MASK itself) is that "vacant" marker.
 *
 * The statement-like macros expand to do { ... } while (0) with no
 * trailing semicolon, so each one behaves as a single statement and the
 * caller supplies the ';' (this keeps them safe in unbraced if/else).
 */

/* region address stored in entry i */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)

/* store the (aligned) address a in entry i, leaving the count bits alone */
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
        } while (0)

#define DRT_HASH_COUNT_MASK             0x1ff

/* page count stored in entry i */
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)

/* store count c in entry i, leaving the address bits alone */
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
        } while (0)

/* zero both the address and the count of entry i */
#define DRT_HASH_CLEAR(scm, i)                                                                                          \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = 0;                                                              \
        } while (0)

/* mark entry i unoccupied / test whether it is unoccupied */
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)

/* copy entry oi of map oscm (control word and bitvector) into entry i of map scm */
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
                DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
        } while (0)
4073
4074
/*
 * Hash table moduli.
 *
 * The size of a hashtable entry follows from the size of the bitvector,
 * and the number of entries has to be prime while still fitting inside
 * the chosen allocation, so these values are worked out by hand.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 *
 * The "spare" figures below are allocation - (modulus * 40).
 */
#define DRT_HASH_SMALL_MODULUS  23
#define DRT_HASH_LARGE_MODULUS  401

#define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare */
#define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare */
4093
4094 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4095
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.  Page 'bit' of entry 'i' lives in
 * word (bit / 32) at position (bit % 32).
 *
 * The single-bit masks are built from an unsigned constant: shifting a
 * signed 1 left by 31 (needed for every 32nd page) is undefined in C.
 */

/* mark page 'bit' of entry i */
#define DRT_HASH_SET_BIT(scm, i, bit)                           \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

/* unmark page 'bit' of entry i */
#define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

/* non-zero if page 'bit' of entry i is marked */
#define DRT_HASH_TEST_BIT(scm, i, bit)                          \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* zero the whole bitvector of entry i */
#define DRT_BITVECTOR_CLEAR(scm, i)                             \
        bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* copy the bitvector of entry oi in map oscm over that of entry i in map scm */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
        bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
            &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
            (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4118
4119
4120
4121 /*
4122  * Hashtable entry.
4123  */
4124 struct vfs_drt_hashentry {
4125         u_int64_t       dhe_control;
4126         u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4127 };
4128
4129 /*
4130  * Dirty Region Tracking structure.
4131  *
4132  * The hashtable is allocated entirely inside the DRT structure.
4133  *
4134  * The hash is a simple circular prime modulus arrangement, the structure
4135  * is resized from small to large if it overflows.
4136  */
4137
4138 struct vfs_drt_clustermap {
4139         u_int32_t               scm_magic;      /* sanity/detection */
4140 #define DRT_SCM_MAGIC           0x12020003
4141         u_int32_t               scm_modulus;    /* current ring size */
4142         u_int32_t               scm_buckets;    /* number of occupied buckets */
4143         u_int32_t               scm_lastclean;  /* last entry we cleaned */
4144         u_int32_t               scm_iskips;     /* number of slot skips */
4145
4146         struct vfs_drt_hashentry scm_hashtable[0];
4147 };
4148
4149
/* Map an address onto a ring slot, and step from one slot to the next (wrapping). */
#define DRT_HASH(scm, addr)             ((addr) % ((scm)->scm_modulus))
#define DRT_HASH_NEXT(scm, addr)        DRT_HASH((scm), (addr) + 1)
4152
/*
 * Debugging codes and arguments.
 *
 * The comment attached to each code lists the arguments that are
 * emitted with it.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82))      /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83))      /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84))      /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85))      /* offset, iskip */
/*
 * DRT_DEBUG_MARK
 *      start:  offset, length, dirty
 *      end:    0, setcount
 *              1 (clean, no map)
 *              2 (map alloc fail)
 *              3, resid (partial)
 */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86))
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88))      /* modulus, buckets, lastclean, iskips */
4169
4170
4171 static void             vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
4172 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4173 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4174 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4175         u_int64_t offset, int *indexp);
4176 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4177         u_int64_t offset,
4178         int *indexp,
4179         int recursed);
4180 static kern_return_t    vfs_drt_do_mark_pages(
4181         void            **cmapp,
4182         u_int64_t       offset,
4183         u_int           length,
4184         int             *setcountp,
4185         int             dirty);
4186 static void             vfs_drt_trace(
4187         struct vfs_drt_clustermap *cmap,
4188         int code,
4189         int arg1,
4190         int arg2,
4191         int arg3,
4192         int arg4);
4193
4194
4195 /*
4196  * Allocate and initialise a sparse cluster map.
4197  *
4198  * Will allocate a new map, resize or compact an existing map.
4199  *
4200  * XXX we should probably have at least one intermediate map size,
4201  * as the 1:16 ratio seems a bit drastic.
4202  */
4203 static kern_return_t
4204 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4205 {
4206         struct vfs_drt_clustermap *cmap, *ocmap;
4207         kern_return_t   kret;
4208         u_int64_t       offset;
4209         int             nsize, i, active_buckets, index, copycount;
4210
4211         ocmap = NULL;
4212         if (cmapp != NULL)
4213                 ocmap = *cmapp;
4214
4215         /*
4216          * Decide on the size of the new map.
4217          */
4218         if (ocmap == NULL) {
4219                 nsize = DRT_HASH_SMALL_MODULUS;
4220         } else {
4221                 /* count the number of active buckets in the old map */
4222                 active_buckets = 0;
4223                 for (i = 0; i < ocmap->scm_modulus; i++) {
4224                         if (!DRT_HASH_VACANT(ocmap, i) &&
4225                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4226                                 active_buckets++;
4227                 }
4228                 /*
4229                  * If we're currently using the small allocation, check to
4230                  * see whether we should grow to the large one.
4231                  */
4232                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4233                         /* if the ring is nearly full */
4234                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4235                                 nsize = DRT_HASH_LARGE_MODULUS;
4236                         } else {
4237                                 nsize = DRT_HASH_SMALL_MODULUS;
4238                         }
4239                 } else {
4240                         /* already using the large modulus */
4241                         nsize = DRT_HASH_LARGE_MODULUS;
4242                         /*
4243                          * If the ring is completely full, there's
4244                          * nothing useful for us to do.  Behave as
4245                          * though we had compacted into the new
4246                          * array and return.
4247                          */
4248                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4249                                 return(KERN_SUCCESS);
4250                 }
4251         }
4252
4253         /*
4254          * Allocate and initialise the new map.
4255          */
4256
4257         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4258             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4259         if (kret != KERN_SUCCESS)
4260                 return(kret);
4261         cmap->scm_magic = DRT_SCM_MAGIC;
4262         cmap->scm_modulus = nsize;
4263         cmap->scm_buckets = 0;
4264         cmap->scm_lastclean = 0;
4265         cmap->scm_iskips = 0;
4266         for (i = 0; i < cmap->scm_modulus; i++) {
4267                 DRT_HASH_CLEAR(cmap, i);
4268                 DRT_HASH_VACATE(cmap, i);
4269                 DRT_BITVECTOR_CLEAR(cmap, i);
4270         }
4271
4272         /*
4273          * If there's an old map, re-hash entries from it into the new map.
4274          */
4275         copycount = 0;
4276         if (ocmap != NULL) {
4277                 for (i = 0; i < ocmap->scm_modulus; i++) {
4278                         /* skip empty buckets */
4279                         if (DRT_HASH_VACANT(ocmap, i) ||
4280                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4281                                 continue;
4282                         /* get new index */
4283                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4284                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4285                         if (kret != KERN_SUCCESS) {
4286                                 /* XXX need to bail out gracefully here */
4287                                 panic("vfs_drt: new cluster map mysteriously too small");
4288                         }
4289                         /* copy */
4290                         DRT_HASH_COPY(ocmap, i, cmap, index);
4291                         copycount++;
4292                 }
4293         }
4294
4295         /* log what we've done */
4296         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4297
4298         /*
4299          * It's important to ensure that *cmapp always points to
4300          * a valid map, so we must overwrite it before freeing
4301          * the old map.
4302          */
4303         *cmapp = cmap;
4304         if (ocmap != NULL) {
4305                 /* emit stats into trace buffer */
4306                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4307                               ocmap->scm_modulus,
4308                               ocmap->scm_buckets,
4309                               ocmap->scm_lastclean,
4310                               ocmap->scm_iskips);
4311
4312                 vfs_drt_free_map(ocmap);
4313         }
4314         return(KERN_SUCCESS);
4315 }
4316
4317
4318 /*
4319  * Free a sparse cluster map.
4320  */
4321 static kern_return_t
4322 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4323 {
4324         kern_return_t   ret;
4325
4326         kmem_free(kernel_map, (vm_offset_t)cmap,
4327                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4328         return(KERN_SUCCESS);
4329 }
4330
4331
4332 /*
4333  * Find the hashtable slot currently occupied by an entry for the supplied offset.
4334  */
4335 static kern_return_t
4336 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4337 {
4338         kern_return_t   kret;
4339         int             index, i, tries;
4340
4341         offset = DRT_ALIGN_ADDRESS(offset);
4342         index = DRT_HASH(cmap, offset);
4343
4344         /* traverse the hashtable */
4345         for (i = 0; i < cmap->scm_modulus; i++) {
4346
4347                 /*
4348                  * If the slot is vacant, we can stop.
4349                  */
4350                 if (DRT_HASH_VACANT(cmap, index))
4351                         break;
4352
4353                 /*
4354                  * If the address matches our offset, we have success.
4355                  */
4356                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4357                         *indexp = index;
4358                         return(KERN_SUCCESS);
4359                 }
4360
4361                 /*
4362                  * Move to the next slot, try again.
4363                  */
4364                 index = DRT_HASH_NEXT(cmap, index);
4365         }
4366         /*
4367          * It's not there.
4368          */
4369         return(KERN_FAILURE);
4370 }
4371
4372 /*
4373  * Find the hashtable slot for the supplied offset.  If we haven't allocated
4374  * one yet, allocate one and populate the address field.  Note that it will
4375  * not have a nonzero page count and thus will still technically be free, so
4376  * in the case where we are called to clean pages, the slot will remain free.
4377  */
4378 static kern_return_t
4379 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4380 {
4381         struct vfs_drt_clustermap *cmap;
4382         kern_return_t   kret;
4383         int             index, i;
4384
4385         cmap = *cmapp;
4386
4387         /* look for an existing entry */
4388         kret = vfs_drt_search_index(cmap, offset, indexp);
4389         if (kret == KERN_SUCCESS)
4390                 return(kret);
4391
4392         /* need to allocate an entry */
4393         offset = DRT_ALIGN_ADDRESS(offset);
4394         index = DRT_HASH(cmap, offset);
4395
4396         /* scan from the index forwards looking for a vacant slot */
4397         for (i = 0; i < cmap->scm_modulus; i++) {
4398                 /* slot vacant? */
4399                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4400                         cmap->scm_buckets++;
4401                         if (index < cmap->scm_lastclean)
4402                                 cmap->scm_lastclean = index;
4403                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
4404                         DRT_HASH_SET_COUNT(cmap, index, 0);
4405                         DRT_BITVECTOR_CLEAR(cmap, index);
4406                         *indexp = index;
4407                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4408                         return(KERN_SUCCESS);
4409                 }
4410                 cmap->scm_iskips += i;
4411                 index = DRT_HASH_NEXT(cmap, index);
4412         }
4413
4414         /*
4415          * We haven't found a vacant slot, so the map is full.  If we're not
4416          * already recursed, try reallocating/compacting it.
4417          */
4418         if (recursed)
4419                 return(KERN_FAILURE);
4420         kret = vfs_drt_alloc_map(cmapp);
4421         if (kret == KERN_SUCCESS) {
4422                 /* now try to insert again */
4423                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4424         }
4425         return(kret);
4426 }
4427
4428 /*
4429  * Implementation of set dirty/clean.
4430  *
4431  * In the 'clean' case, not finding a map is OK.
4432  */
4433 static kern_return_t
4434 vfs_drt_do_mark_pages(
4435         void            **private,
4436         u_int64_t       offset,
4437         u_int           length,
4438         int             *setcountp,
4439         int             dirty)
4440 {
4441         struct vfs_drt_clustermap *cmap, **cmapp;
4442         kern_return_t   kret;
4443         int             i, index, pgoff, pgcount, setcount, ecount;
4444
4445         cmapp = (struct vfs_drt_clustermap **)private;
4446         cmap = *cmapp;
4447
4448         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4449
4450         if (setcountp != NULL)
4451                 *setcountp = 0;
4452
4453         /* allocate a cluster map if we don't already have one */
4454         if (cmap == NULL) {
4455                 /* no cluster map, nothing to clean */
4456                 if (!dirty) {
4457                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4458                         return(KERN_SUCCESS);
4459                 }
4460                 kret = vfs_drt_alloc_map(cmapp);
4461                 if (kret != KERN_SUCCESS) {
4462                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4463                         return(kret);
4464                 }
4465         }
4466         setcount = 0;
4467
4468         /*
4469          * Iterate over the length of the region.
4470          */
4471         while (length > 0) {
4472                 /*
4473                  * Get the hashtable index for this offset.
4474                  *
4475                  * XXX this will add blank entries if we are clearing a range
4476                  * that hasn't been dirtied.
4477                  */
4478                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4479                 cmap = *cmapp;  /* may have changed! */
4480                 /* this may be a partial-success return */
4481                 if (kret != KERN_SUCCESS) {
4482                         if (setcountp != NULL)
4483                                 *setcountp = setcount;
4484                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4485
4486                         return(kret);
4487                 }
4488
4489                 /*
4490                  * Work out how many pages we're modifying in this
4491                  * hashtable entry.
4492                  */
4493                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4494                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4495
4496                 /*
4497                  * Iterate over pages, dirty/clearing as we go.
4498                  */
4499                 ecount = DRT_HASH_GET_COUNT(cmap, index);
4500                 for (i = 0; i < pgcount; i++) {
4501                         if (dirty) {
4502                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4503                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4504                                         ecount++;
4505                                         setcount++;
4506                                 }
4507                         } else {
4508                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4509                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4510                                         ecount--;
4511                                         setcount++;
4512                                 }
4513                         }
4514                 }
4515                 DRT_HASH_SET_COUNT(cmap, index, ecount);
4516 next:
4517                 offset += pgcount * PAGE_SIZE;
4518                 length -= pgcount * PAGE_SIZE;
4519         }
4520         if (setcountp != NULL)
4521                 *setcountp = setcount;
4522
4523         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
4524
4525         return(KERN_SUCCESS);
4526 }
4527
4528 /*
4529  * Mark a set of pages as dirty/clean.
4530  *
4531  * This is a public interface.
4532  *
4533  * cmapp
4534  *      Pointer to storage suitable for holding a pointer.  Note that
4535  *      this must either be NULL or a value set by this function.
4536  *
4537  * size
4538  *      Current file size in bytes.
4539  *
4540  * offset
4541  *      Offset of the first page to be marked as dirty, in bytes.  Must be
4542  *      page-aligned.
4543  *
4544  * length
4545  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
4546  *
4547  * setcountp
4548  *      Number of pages newly marked dirty by this call (optional).
4549  *
4550  * Returns KERN_SUCCESS if all the pages were successfully marked.
4551  */
4552 static kern_return_t
4553 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
4554 {
4555         /* XXX size unused, drop from interface */
4556         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
4557 }
4558
4559 static kern_return_t
4560 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
4561 {
4562         return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
4563 }
4564
4565 /*
4566  * Get a cluster of dirty pages.
4567  *
4568  * This is a public interface.
4569  *
4570  * cmapp
4571  *      Pointer to storage managed by drt_mark_pages.  Note that this must
4572  *      be NULL or a value set by drt_mark_pages.
4573  *
4574  * offsetp
4575  *      Returns the byte offset into the file of the first page in the cluster.
4576  *
4577  * lengthp
4578  *      Returns the length in bytes of the cluster of dirty pages.
4579  *
4580  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
4581  * are no dirty pages meeting the minmum size criteria.  Private storage will
4582  * be released if there are no more dirty pages left in the map
4583  *
4584  */
4585 static kern_return_t
4586 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
4587 {
4588         struct vfs_drt_clustermap *cmap;
4589         u_int64_t       offset;
4590         u_int           length;
4591         int             index, i, j, fs, ls;
4592
4593         /* sanity */
4594         if ((cmapp == NULL) || (*cmapp == NULL))
4595                 return(KERN_FAILURE);
4596         cmap = *cmapp;
4597
4598         /* walk the hashtable */
4599         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
4600                 index = DRT_HASH(cmap, offset);
4601
4602                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
4603                         continue;
4604
4605                 /* scan the bitfield for a string of bits */
4606                 fs = -1;
4607
4608                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4609                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
4610                                 fs = i;
4611                                 break;
4612                         }
4613                 }
4614                 if (fs == -1) {
4615                         /*  didn't find any bits set */
4616                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
4617                 }
4618                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
4619                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
4620                                 break;
4621                 }
4622
4623                 /* compute offset and length, mark pages clean */
4624                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
4625                 length = ls * PAGE_SIZE;
4626                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
4627                 cmap->scm_lastclean = index;
4628
4629                 /* return successful */
4630                 *offsetp = (off_t)offset;
4631                 *lengthp = length;
4632
4633                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
4634                 return(KERN_SUCCESS);
4635         }
4636         /*
4637          * We didn't find anything... hashtable is empty
4638          * emit stats into trace buffer and
4639          * then free it
4640          */
4641         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4642                       cmap->scm_modulus,
4643                       cmap->scm_buckets,
4644                       cmap->scm_lastclean,
4645                       cmap->scm_iskips);
4646
4647         vfs_drt_free_map(cmap);
4648         *cmapp = NULL;
4649
4650         return(KERN_FAILURE);
4651 }
4652
4653
4654 static kern_return_t
4655 vfs_drt_control(void **cmapp, int op_type)
4656 {
4657         struct vfs_drt_clustermap *cmap;
4658
4659         /* sanity */
4660         if ((cmapp == NULL) || (*cmapp == NULL))
4661                 return(KERN_FAILURE);
4662         cmap = *cmapp;
4663
4664         switch (op_type) {
4665         case 0:
4666                 /* emit stats into trace buffer */
4667                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4668                               cmap->scm_modulus,
4669                               cmap->scm_buckets,
4670                               cmap->scm_lastclean,
4671                               cmap->scm_iskips);
4672
4673                 vfs_drt_free_map(cmap);
4674                 *cmapp = NULL;
4675                 break;
4676
4677         case 1:
4678                 cmap->scm_lastclean = 0;
4679                 break;
4680         }
4681         return(KERN_SUCCESS);
4682 }
4683
4684
4685
/*
 * Emit a trace record carrying the given code and four caller-provided
 * arguments.
 *
 * The clustermap argument is accepted so that call sites can identify
 * the map they are reporting on, but it is not currently examined here.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
        (void)cmap;

        KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
4695
4696 /*
4697  * Perform basic sanity check on the hash entry summary count
4698  * vs. the actual bits set in the entry.
4699  */
4700 static void
4701 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
4702 {
4703         int index, i;
4704         int bits_on;
4705
4706         for (index = 0; index < cmap->scm_modulus; index++) {
4707                 if (DRT_HASH_VACANT(cmap, index))
4708                         continue;
4709
4710                 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4711                         if (DRT_HASH_TEST_BIT(cmap, index, i))
4712                                 bits_on++;
4713                 }
4714                 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
4715                         panic("bits_on = %d,  index = %d\n", bits_on, index);
4716         }
4717 }