1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*
27 * Copyright (c) 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
59 */
60
61 #include <sys/param.h>
62 #include <sys/proc.h>
63 #include <sys/buf.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/trace.h>
67 #include <sys/malloc.h>
68 #include <sys/resourcevar.h>
69 #include <libkern/libkern.h>
70
71 #include <sys/ubc.h>
72 #include <vm/vm_pageout.h>
73
74 #include <sys/kdebug.h>
75
76 #define CL_READ 0x01
77 #define CL_ASYNC 0x02
78 #define CL_COMMIT 0x04
79 #define CL_PAGEOUT 0x10
80 #define CL_AGE 0x20
81 #define CL_DUMP 0x40
82 #define CL_NOZERO 0x80
83 #define CL_PAGEIN 0x100
84 #define CL_DEV_MEMORY 0x200
85 #define CL_PRESERVE 0x400
86
87
88 struct clios {
89 u_int io_completed; /* amount of io that has currently completed */
90 u_int io_issued; /* amount of io that was successfully issued */
91 int io_error; /* error code of first error encountered */
92 int io_wanted; /* someone is sleeping waiting for a change in state */
93 };
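/*
 * the clios structure tracks a stream of async cluster I/Os...
 * cluster_io adds to io_issued for each buffer it queues against the
 * stream, cluster_iodone adds to io_completed (and records the first
 * error seen) as transactions finish, clearing io_wanted and waking any
 * thread sleeping on &io_wanted... callers compare io_issued and
 * io_completed to bound the amount of outstanding I/O
 */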
94
95
96 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
97 int size, struct buf *bp);
98 static int cluster_read_x(struct vnode *vp, struct uio *uio,
99 off_t filesize, int devblocksize, int flags);
100 static int cluster_write_x(struct vnode *vp, struct uio *uio,
101 off_t oldEOF, off_t newEOF, off_t headOff,
102 off_t tailOff, int devblocksize, int flags);
103 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
104 off_t filesize, int devblocksize, int flags);
105 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
106 off_t newEOF, int devblocksize, int flags);
107 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
108 off_t filesize, int devblocksize, int flags);
109 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
110 off_t newEOF, int devblocksize, int flags);
111 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
112 addr64_t usr_paddr, int xsize, int devblocksize, int flags);
113 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
114 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
115
116
117 /*
118 * throttle the number of async writes that
119 * can be outstanding on a single vnode
120 * before we issue a synchronous write
121 */
122 #define ASYNC_THROTTLE 9
123
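/*
 * cluster_iodone is the biodone handler for a chain of cluster buffers...
 * it returns immediately unless every buffer in the transaction has
 * completed... once the last one is in, it accumulates the error and
 * resid counts, frees the component buffers, zero-fills the tail of a
 * partial EOF page if b_validend was set, updates any associated clios
 * stream, finishes off the original buf if B_NEED_IODONE is set, and
 * then commits or aborts the underlying upl range as dictated by the
 * b_flags that were set up in cluster_io
 */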
124 static int
125 cluster_iodone(bp)
126 struct buf *bp;
127 {
128 int b_flags;
129 int error;
130 int total_size;
131 int total_resid;
132 int upl_offset;
133 int zero_offset;
134 upl_t upl;
135 struct buf *cbp;
136 struct buf *cbp_head;
137 struct buf *cbp_next;
138 struct buf *real_bp;
139 struct vnode *vp;
140 struct clios *iostate;
141 int commit_size;
142 int pg_offset;
143
144
145 cbp_head = (struct buf *)(bp->b_trans_head);
146
147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
148 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
149
150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
151 /*
152 * all I/O requests that are part of this transaction
153 * have to complete before we can process it
154 */
155 if ( !(cbp->b_flags & B_DONE)) {
156
157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
158 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
159
160 return 0;
161 }
162 }
163 error = 0;
164 total_size = 0;
165 total_resid = 0;
166
167 cbp = cbp_head;
168 upl_offset = cbp->b_uploffset;
169 upl = cbp->b_pagelist;
170 b_flags = cbp->b_flags;
171 real_bp = cbp->b_real_bp;
172 vp = cbp->b_vp;
173 zero_offset= cbp->b_validend;
174 iostate = (struct clios *)cbp->b_iostate;
175
176 while (cbp) {
177 if ((cbp->b_flags & B_ERROR) && error == 0)
178 error = cbp->b_error;
179
180 total_resid += cbp->b_resid;
181 total_size += cbp->b_bcount;
182
183 cbp_next = cbp->b_trans_next;
184
185 free_io_buf(cbp);
186
187 cbp = cbp_next;
188 }
189 if (zero_offset)
190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
191
192 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
193 vp->v_flag &= ~VTHROTTLED;
194 wakeup((caddr_t)&vp->v_numoutput);
195 }
196 if (iostate) {
197 /*
198 * someone has issued multiple I/Os asynchronously
199 * and is waiting for them to complete (streaming)
200 */
201 if (error && iostate->io_error == 0)
202 iostate->io_error = error;
203
204 iostate->io_completed += total_size;
205
206 if (iostate->io_wanted) {
207 /*
208 * someone is waiting for the state of
209 * this io stream to change
210 */
211 iostate->io_wanted = 0;
212 wakeup((caddr_t)&iostate->io_wanted);
213 }
214 }
215 if ((b_flags & B_NEED_IODONE) && real_bp) {
216 if (error) {
217 real_bp->b_flags |= B_ERROR;
218 real_bp->b_error = error;
219 }
220 real_bp->b_resid = total_resid;
221
222 biodone(real_bp);
223 }
224 if (error == 0 && total_resid)
225 error = EIO;
226
227 if (b_flags & B_COMMIT_UPL) {
228 pg_offset = upl_offset & PAGE_MASK;
229 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
230
231 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
232 int upl_abort_code;
233
234 if (b_flags & B_PHYS)
235 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
236 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
237 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
238 else if (b_flags & B_PGIN)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
240 else
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
242
243 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
244 upl_abort_code);
245
246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
247 (int)upl, upl_offset - pg_offset, commit_size,
248 0x80000000|upl_abort_code, 0);
249
250 } else {
251 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
252
253 if (b_flags & B_PHYS)
254 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
255 else if ( !(b_flags & B_PAGEOUT))
256 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
257 if (b_flags & B_AGE)
258 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
259
260 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
261 upl_commit_flags);
262
263 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
264 (int)upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags, 0);
266 }
267 } else
268 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
269 (int)upl, upl_offset, 0, error, 0);
270
271 return (error);
272 }
273
274
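/*
 * cluster_zero zeroes 'size' bytes starting at 'upl_offset' in the upl...
 * if the caller's buf doesn't already have the pages mapped (bp == NULL
 * or b_data == NULL), the upl is mapped into the kernel for the duration
 * of the bzero and unmapped again afterwards
 */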
275 static void
276 cluster_zero(upl, upl_offset, size, bp)
277 upl_t upl;
278 vm_offset_t upl_offset;
279 int size;
280 struct buf *bp;
281 {
282 vm_offset_t io_addr = 0;
283 int must_unmap = 0;
284 kern_return_t kret;
285
286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
287 upl_offset, size, (int)bp, 0, 0);
288
289 if (bp == NULL || bp->b_data == NULL) {
290 kret = ubc_upl_map(upl, &io_addr);
291
292 if (kret != KERN_SUCCESS)
293 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
294 if (io_addr == 0)
295 panic("cluster_zero: ubc_upl_map() mapped 0");
296
297 must_unmap = 1;
298 } else
299 io_addr = (vm_offset_t)bp->b_data;
300 bzero((caddr_t)(io_addr + upl_offset), size);
301
302 if (must_unmap) {
303 kret = ubc_upl_unmap(upl);
304
305 if (kret != KERN_SUCCESS)
306 panic("cluster_zero: kernel_upl_unmap failed");
307 }
308 }
309
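/*
 * cluster_io is the common I/O engine for the cluster layer...
 * it walks the requested range, using VOP_CMAP to translate file offsets
 * into device blocks and to discover how much is physically contiguous,
 * builds chains of io bufs (clipped to the device's max transfer size and
 * vector count), zero-fills read 'holes', and issues each chain through
 * VOP_STRATEGY with cluster_iodone as the completion handler... for
 * synchronous requests it waits for the chain and reaps it in line, and
 * on the way out it commits or aborts the upl according to the CL_ flags
 */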
310 static int
311 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
312 struct vnode *vp;
313 upl_t upl;
314 vm_offset_t upl_offset;
315 off_t f_offset;
316 int non_rounded_size;
317 int devblocksize;
318 int flags;
319 struct buf *real_bp;
320 struct clios *iostate;
321 {
322 struct buf *cbp;
323 u_int size;
324 u_int io_size;
325 int io_flags;
326 int error = 0;
327 int retval = 0;
328 struct buf *cbp_head = 0;
329 struct buf *cbp_tail = 0;
330 upl_page_info_t *pl;
331 int buf_count = 0;
332 int pg_count;
333 int pg_offset;
334 u_int max_iosize;
335 u_int max_vectors;
336 int priv;
337 int zero_offset = 0;
338 u_int first_lblkno;
339
340 if (flags & CL_READ) {
341 io_flags = (B_VECTORLIST | B_READ);
342
343 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
344 } else {
345 io_flags = (B_VECTORLIST | B_WRITEINPROG);
346
347 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
348 }
349 pl = ubc_upl_pageinfo(upl);
350
351 if (flags & CL_AGE)
352 io_flags |= B_AGE;
353 if (flags & CL_DUMP)
354 io_flags |= B_NOCACHE;
355 if (flags & CL_PAGEIN)
356 io_flags |= B_PGIN;
357 if (flags & CL_PAGEOUT)
358 io_flags |= B_PAGEOUT;
359 if (flags & CL_COMMIT)
360 io_flags |= B_COMMIT_UPL;
361 if (flags & CL_PRESERVE)
362 io_flags |= B_PHYS;
363
364 if (devblocksize)
365 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
366 else
367 size = non_rounded_size;
368
369
370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
371 (int)f_offset, size, upl_offset, flags, 0);
372
373 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
374 /*
375 * we're going to end up
376 * with a page that we can't complete (the file size wasn't a multiple
377 * of PAGE_SIZE and we're trying to read to the end of the file),
378 * so we'll go ahead and zero out the portion of the page we can't
379 * read in from the file
380 */
381 zero_offset = upl_offset + non_rounded_size;
382 }
383 while (size) {
384 int i;
385 int pl_index;
386 int pg_resid;
387 int num_contig;
388 daddr_t lblkno;
389 daddr_t blkno;
390
391 if (size > max_iosize)
392 io_size = max_iosize;
393 else
394 io_size = size;
395
396 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
397 if (error == EOPNOTSUPP)
398 panic("VOP_CMAP Unimplemented");
399 break;
400 }
401
402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
403 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
404
405 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
406 if (flags & CL_PAGEOUT) {
407 error = EINVAL;
408 break;
409 };
410
411 /* Try paging out the page individually before
412 giving up entirely and dumping it (it could
413 be mapped in a "hole" and require allocation
414 before the I/O)
415 */
416 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
417 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
418 error = EINVAL;
419 break;
420 };
421
422 upl_offset += PAGE_SIZE_64;
423 f_offset += PAGE_SIZE_64;
424 size -= PAGE_SIZE_64;
425 continue;
426 }
427 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
428 /*
429 * we have now figured out how much I/O we can do - this is in 'io_size'
430 * pl_index represents the first page in the 'upl' that the I/O will occur for
431 * pg_offset is the starting point in the first page for the I/O
432 * pg_count is the number of full and partial pages that 'io_size' encompasses
433 */
434 pl_index = upl_offset / PAGE_SIZE;
435 pg_offset = upl_offset & PAGE_MASK;
436 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
437
438 if (flags & CL_DEV_MEMORY) {
439 /*
440 * currently, can't deal with reading 'holes' in file
441 */
442 if ((long)blkno == -1) {
443 error = EINVAL;
444 break;
445 }
446 /*
447 * treat physical requests as one 'giant' page
448 */
449 pg_count = 1;
450 }
451 if ((flags & CL_READ) && (long)blkno == -1) {
452 int bytes_to_zero;
453
454 /*
455 * if we're reading and blkno == -1, then we've got a
456 * 'hole' in the file that we need to deal with by zeroing
457 * out the affected area in the upl
458 */
459 if (zero_offset && io_size == size) {
460 /*
461 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
462 * then 'zero_offset' will be non-zero
463 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
464 * (indicated by the io_size finishing off the I/O request for this UPL)
465 * then we're not going to issue an I/O for the
466 * last page in this upl... we need to zero both the hole and the tail
467 * of the page beyond the EOF, since the delayed zero-fill won't kick in
468 */
469 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
470
471 zero_offset = 0;
472 } else
473 bytes_to_zero = io_size;
474
475 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
476
477 if (cbp_head)
478 /*
479 * if there is a current I/O chain pending
480 * then the first page of the group we just zero'd
481 * will be handled by the I/O completion if the zero
482 * fill started in the middle of the page
483 */
484 pg_count = (io_size - pg_offset) / PAGE_SIZE;
485 else {
486 /*
487 * no pending I/O to pick up that first page
488 * so, we have to make sure it gets committed
489 * here.
490 * set the pg_offset to 0 so that the upl_commit_range
491 * starts with this page
492 */
493 pg_count = (io_size + pg_offset) / PAGE_SIZE;
494 pg_offset = 0;
495 }
496 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
497 /*
498 * if we're done with the request for this UPL
499 * then we have to make sure to commit the last page
500 * even if we only partially zero-filled it
501 */
502 pg_count++;
503
504 if (pg_count) {
505 if (pg_offset)
506 pg_resid = PAGE_SIZE - pg_offset;
507 else
508 pg_resid = 0;
509
510 if (flags & CL_COMMIT)
511 ubc_upl_commit_range(upl,
512 (upl_offset + pg_resid) & ~PAGE_MASK,
513 pg_count * PAGE_SIZE,
514 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
515 }
516 upl_offset += io_size;
517 f_offset += io_size;
518 size -= io_size;
519
520 if (cbp_head && pg_count)
521 goto start_io;
522 continue;
523
524 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
525 real_bp->b_blkno = blkno;
526 }
527
528 if (pg_count > max_vectors) {
529 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
530
531 if (io_size < 0) {
532 io_size = PAGE_SIZE - pg_offset;
533 pg_count = 1;
534 } else
535 pg_count = max_vectors;
536 }
537
538 /* Throttle the speculative IO */
539 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
540 priv = 0;
541 else
542 priv = 1;
543
544 cbp = alloc_io_buf(vp, priv);
545
546
547 if (flags & CL_PAGEOUT) {
548 for (i = 0; i < pg_count; i++) {
549 int s;
550 struct buf *bp;
551
552 s = splbio();
553 if (bp = incore(vp, lblkno + i)) {
554 if (!ISSET(bp->b_flags, B_BUSY)) {
555 bremfree(bp);
556 SET(bp->b_flags, (B_BUSY | B_INVAL));
557 splx(s);
558 brelse(bp);
559 } else
560 panic("BUSY bp found in cluster_io");
561 }
562 splx(s);
563 }
564 }
565 if (flags & CL_ASYNC) {
566 cbp->b_flags |= (B_CALL | B_ASYNC);
567 cbp->b_iodone = (void *)cluster_iodone;
568 }
569 cbp->b_flags |= io_flags;
570
571 cbp->b_lblkno = lblkno;
572 cbp->b_blkno = blkno;
573 cbp->b_bcount = io_size;
574 cbp->b_pagelist = upl;
575 cbp->b_uploffset = upl_offset;
576 cbp->b_trans_next = (struct buf *)0;
577
578 if (cbp->b_iostate = (void *)iostate)
579 /*
580 * caller wants to track the state of this
581 * io... bump the amount issued against this stream
582 */
583 iostate->io_issued += io_size;
584
585 if (flags & CL_READ)
586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
587 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
588 else
589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
590 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
591
592 if (cbp_head) {
593 cbp_tail->b_trans_next = cbp;
594 cbp_tail = cbp;
595 } else {
596 cbp_head = cbp;
597 cbp_tail = cbp;
598 }
599 (struct buf *)(cbp->b_trans_head) = cbp_head;
600 buf_count++;
601
602 upl_offset += io_size;
603 f_offset += io_size;
604 size -= io_size;
605
606 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
607 /*
608 * if we have no more I/O to issue or
609 * the current I/O we've prepared fully
610 * completes the last page in this request
611 * and it's either an ASYNC request or
612 * we've already accumulated more than 8 I/O's into
613 * this transaction and it's not an I/O directed to
614 * special DEVICE memory
615 * then go ahead and issue the I/O
616 */
617 start_io:
618 if (real_bp) {
619 cbp_head->b_flags |= B_NEED_IODONE;
620 cbp_head->b_real_bp = real_bp;
621 } else
622 cbp_head->b_real_bp = (struct buf *)NULL;
623
624 if (size == 0) {
625 /*
626 * we're about to issue the last I/O for this upl
627 * if this was a read to the eof and the eof doesn't
628 * finish on a page boundary, then we need to zero-fill
629 * the rest of the page....
630 */
631 cbp_head->b_validend = zero_offset;
632 } else
633 cbp_head->b_validend = 0;
634
635 for (cbp = cbp_head; cbp;) {
636 struct buf * cbp_next;
637
638 if (io_flags & B_WRITEINPROG)
639 cbp->b_vp->v_numoutput++;
640
641 cbp_next = cbp->b_trans_next;
642
643 (void) VOP_STRATEGY(cbp);
644 cbp = cbp_next;
645 }
646 if ( !(flags & CL_ASYNC)) {
647 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
648 biowait(cbp);
649
650 if (error = cluster_iodone(cbp_head)) {
651 if ((flags & CL_PAGEOUT) && (error == ENXIO))
652 retval = 0; /* drop the error */
653 else
654 retval = error;
655 error = 0;
656 }
657 }
658 cbp_head = (struct buf *)0;
659 cbp_tail = (struct buf *)0;
660
661 buf_count = 0;
662 }
663 }
664 if (error) {
665 int abort_size;
666
667 io_size = 0;
668
669 for (cbp = cbp_head; cbp;) {
670 struct buf * cbp_next;
671
672 upl_offset -= cbp->b_bcount;
673 size += cbp->b_bcount;
674 io_size += cbp->b_bcount;
675
676 cbp_next = cbp->b_trans_next;
677 free_io_buf(cbp);
678 cbp = cbp_next;
679 }
680 if (iostate) {
681 /*
682 * update the error condition for this stream...
683 * since we never really issued the io,
684 * just go ahead and adjust the issued count back
685 */
686 if (iostate->io_error == 0)
687 iostate->io_error = error;
688 iostate->io_issued -= io_size;
689
690 if (iostate->io_wanted) {
691 /*
692 * someone is waiting for the state of
693 * this io stream to change
694 */
695 iostate->io_wanted = 0;
696 wakeup((caddr_t)&iostate->io_wanted);
697 }
698 }
699 pg_offset = upl_offset & PAGE_MASK;
700 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
701
702 if (flags & CL_COMMIT) {
703 int upl_abort_code;
704
705 if (flags & CL_PRESERVE)
706 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
707 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
708 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
709 else if (flags & CL_PAGEIN)
710 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
711 else
712 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
713
714 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
715 upl_abort_code);
716
717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
718 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
719 }
720 if (real_bp) {
721 real_bp->b_flags |= B_ERROR;
722 real_bp->b_error = error;
723
724 biodone(real_bp);
725 }
726 if (retval == 0)
727 retval = error;
728 }
729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
730 (int)f_offset, size, upl_offset, retval, 0);
731
732 return (retval);
733 }
734
735
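/*
 * cluster_rd_prefetch issues an advisory read of up to MAX_UPL_TRANSFER
 * pages starting at f_offset, clipped to the end of the file... pages
 * already present in the cache at the front of the range are skipped,
 * and the return value is the number of pages the (clipped) request spans
 */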
736 static int
737 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
738 struct vnode *vp;
739 off_t f_offset;
740 u_int size;
741 off_t filesize;
742 int devblocksize;
743 {
744 int pages_to_fetch;
745 int skipped_pages;
746
747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
748 (int)f_offset, size, (int)filesize, 0, 0);
749
750 if (f_offset >= filesize) {
751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
752 (int)f_offset, 0, 0, 0, 0);
753 return(0);
754 }
755 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
756 size = MAX_UPL_TRANSFER * PAGE_SIZE;
757 else
758 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
759
760 if ((off_t)size > (filesize - f_offset))
761 size = filesize - f_offset;
762
763 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
764
765 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
766 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
767 break;
768 f_offset += PAGE_SIZE;
769 size -= PAGE_SIZE;
770 }
771 if (skipped_pages < pages_to_fetch)
772 advisory_read(vp, filesize, f_offset, size, devblocksize);
773
774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
775 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
776
777 return (pages_to_fetch);
778 }
779
780
781
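/*
 * cluster_rd_ahead implements the sequential read-ahead heuristic...
 * v_lastr is used to detect a sequential access pattern, v_ralen is the
 * current read-ahead window (doubled on each hit, up to MAX_UPL_TRANSFER
 * pages) and v_maxra remembers how far we've already prefetched... when
 * the pattern holds, the next window is pushed out via cluster_rd_prefetch
 */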
782 static void
783 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
784 struct vnode *vp;
785 daddr_t b_lblkno;
786 daddr_t e_lblkno;
787 off_t filesize;
788 int devblocksize;
789 {
790 daddr_t r_lblkno;
791 off_t f_offset;
792 int size_of_prefetch;
793 int max_pages;
794
795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
796 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
797
798 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
800 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
801 return;
802 }
803
804 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
805 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
806 vp->v_ralen = 0;
807 vp->v_maxra = 0;
808
809 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
810 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
811
812 return;
813 }
814 max_pages = MAX_UPL_TRANSFER;
815
816 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
817
818 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
819 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
820
821 if (e_lblkno < vp->v_maxra) {
822 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
823
824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
825 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
826 return;
827 }
828 }
829 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
830 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
831
832 if (f_offset < filesize) {
833 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
834
835 if (size_of_prefetch)
836 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
837 }
838 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
839 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
840 }
841
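/*
 * cluster_pageout is the VM pageout entry point... it validates the
 * request, clips it to the end of the file, aborts any part of the upl
 * beyond the clipped size, throttles the caller while too many async
 * writes are outstanding on the vnode, and then hands the work to
 * cluster_io with CL_PAGEOUT set
 */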
842 int
843 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
844 struct vnode *vp;
845 upl_t upl;
846 vm_offset_t upl_offset;
847 off_t f_offset;
848 int size;
849 off_t filesize;
850 int devblocksize;
851 int flags;
852 {
853 int io_size;
854 int pg_size;
855 off_t max_size;
856 int local_flags = CL_PAGEOUT;
857
858 if ((flags & UPL_IOSYNC) == 0)
859 local_flags |= CL_ASYNC;
860 if ((flags & UPL_NOCOMMIT) == 0)
861 local_flags |= CL_COMMIT;
862
863
864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
865 (int)f_offset, size, (int)filesize, local_flags, 0);
866
867 /*
868 * If they didn't specify any I/O, then we are done...
869 * we can't issue an abort because we don't know how
870 * big the upl really is
871 */
872 if (size <= 0)
873 return (EINVAL);
874
875 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
876 if (local_flags & CL_COMMIT)
877 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
878 return (EROFS);
879 }
880 /*
881 * can't page-out from a negative offset
882 * or if we're starting beyond the EOF
883 * or if the file offset isn't page aligned
884 * or the size requested isn't a multiple of PAGE_SIZE
885 */
886 if (f_offset < 0 || f_offset >= filesize ||
887 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
888 if (local_flags & CL_COMMIT)
889 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
890 return (EINVAL);
891 }
892 max_size = filesize - f_offset;
893
894 if (size < max_size)
895 io_size = size;
896 else
897 io_size = max_size;
898
899 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
900
901 if (size > pg_size) {
902 if (local_flags & CL_COMMIT)
903 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
904 UPL_ABORT_FREE_ON_EMPTY);
905 }
906 while (vp->v_numoutput >= ASYNC_THROTTLE) {
907 vp->v_flag |= VTHROTTLED;
908 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
909 }
910
911 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
912 local_flags, (struct buf *)0, (struct clios *)0));
913 }
914
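/*
 * cluster_pagein is the VM pagein entry point... it validates the
 * request, clips it to the end of the file, aborts any part of the upl
 * beyond the rounded transfer, issues the read through cluster_io with
 * CL_READ | CL_PAGEIN and, if the access pattern looks sequential,
 * kicks off read-ahead for the pages that follow
 */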
915 int
916 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
917 struct vnode *vp;
918 upl_t upl;
919 vm_offset_t upl_offset;
920 off_t f_offset;
921 int size;
922 off_t filesize;
923 int devblocksize;
924 int flags;
925 {
926 u_int io_size;
927 int rounded_size;
928 off_t max_size;
929 int retval;
930 int local_flags = 0;
931
932 if (upl == NULL || size < 0)
933 panic("cluster_pagein: NULL upl passed in");
934
935 if ((flags & UPL_IOSYNC) == 0)
936 local_flags |= CL_ASYNC;
937 if ((flags & UPL_NOCOMMIT) == 0)
938 local_flags |= CL_COMMIT;
939
940
941 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
942 (int)f_offset, size, (int)filesize, local_flags, 0);
943
944 /*
945 * can't page-in from a negative offset
946 * or if we're starting beyond the EOF
947 * or if the file offset isn't page aligned
948 * or the size requested isn't a multiple of PAGE_SIZE
949 */
950 if (f_offset < 0 || f_offset >= filesize ||
951 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
952 if (local_flags & CL_COMMIT)
953 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
954 return (EINVAL);
955 }
956 max_size = filesize - f_offset;
957
958 if (size < max_size)
959 io_size = size;
960 else
961 io_size = max_size;
962
963 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
964
965 if (size > rounded_size && (local_flags & CL_COMMIT))
966 ubc_upl_abort_range(upl, upl_offset + rounded_size,
967 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
968
969 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
970 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
971
972 if (retval == 0) {
973 int b_lblkno;
974 int e_lblkno;
975
976 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
977 e_lblkno = (int)
978 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
979
980 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
981 /*
982 * we haven't read in the last page of the file yet
983 * so let's try to read ahead if we're in
984 * a sequential access pattern
985 */
986 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
987 }
988 vp->v_lastr = e_lblkno;
989 }
990 return (retval);
991 }
992
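/*
 * cluster_bp handles a buf that already has a upl attached (b_pagelist)...
 * it converts the buf's logical block into a file offset and passes the
 * whole thing to cluster_io as a single async transfer
 */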
993 int
994 cluster_bp(bp)
995 struct buf *bp;
996 {
997 off_t f_offset;
998 int flags;
999
1000 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1001 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1002
1003 if (bp->b_pagelist == (upl_t) 0)
1004 panic("cluster_bp: can't handle NULL upl yet\n");
1005 if (bp->b_flags & B_READ)
1006 flags = CL_ASYNC | CL_READ;
1007 else
1008 flags = CL_ASYNC;
1009
1010 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1011
1012 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1013 }
1014
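/*
 * cluster_write is the top-level write entry point... for cached vnodes
 * (or non-user uios) everything goes through the buffered path in
 * cluster_write_x... otherwise each uio vector is examined and routed to
 * cluster_phys_write (physically contiguous source), cluster_nocopy_write
 * (page-aligned direct writes) or back to cluster_write_x for small or
 * unaligned leftovers, with uio_resid clipped so each helper sees a
 * correctly sized chunk
 */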
1015 int
1016 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1017 struct vnode *vp;
1018 struct uio *uio;
1019 off_t oldEOF;
1020 off_t newEOF;
1021 off_t headOff;
1022 off_t tailOff;
1023 int devblocksize;
1024 int flags;
1025 {
1026 int prev_resid;
1027 int clip_size;
1028 off_t max_io_size;
1029 struct iovec *iov;
1030 vm_offset_t upl_offset;
1031 int upl_size;
1032 int pages_in_pl;
1033 upl_page_info_t *pl;
1034 int upl_flags;
1035 upl_t upl;
1036 int retval = 0;
1037
1038
1039 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1040 {
1041 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1042 return(retval);
1043 }
1044
1045 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1046 {
1047 /* we know we have a resid, so this is safe */
1048 iov = uio->uio_iov;
1049 while (iov->iov_len == 0) {
1050 uio->uio_iov++;
1051 uio->uio_iovcnt--;
1052 iov = uio->uio_iov;
1053 }
1054
1055 /*
1056 * We check every vector target and if it is physically
1057 * contiguous space, we skip the sanity checks.
1058 */
1059
1060 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1061 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1062 pages_in_pl = 0;
1063 upl_flags = UPL_QUERY_OBJECT_TYPE;
1064 if ((vm_map_get_upl(current_map(),
1065 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1066 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1067 {
1068 /*
1069 * the user app must have passed in an invalid address
1070 */
1071 return (EFAULT);
1072 }
1073
1074 if (upl_flags & UPL_PHYS_CONTIG)
1075 {
1076 if (flags & IO_HEADZEROFILL)
1077 {
1078 flags &= ~IO_HEADZEROFILL;
1079
1080 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1081 return(retval);
1082 }
1083
1084 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1085
1086 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1087 {
1088 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1089 return(retval);
1090 }
1091 }
1092 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1093 {
1094 /*
1095 * We set a threshold of 4 pages to decide if the nocopy
1096 * write loop is worth the trouble...
1097 * we also come here if we're trying to zero the head and/or tail
1098 * of a partially written page, and the user source is not a physically contiguous region
1099 */
1100 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1101 return(retval);
1102 }
1103 else if (uio->uio_offset & PAGE_MASK_64)
1104 {
1105 /* Bring the file offset of the write up to a pagesize boundary */
1106 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1107 if (uio->uio_resid < clip_size)
1108 clip_size = uio->uio_resid;
1109 /*
1110 * Fake the resid going into the cluster_write_x call
1111 * and restore it on the way out.
1112 */
1113 prev_resid = uio->uio_resid;
1114 uio->uio_resid = clip_size;
1115 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1116 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1117 }
1118 else if ((int)iov->iov_base & PAGE_MASK_64)
1119 {
1120 clip_size = iov->iov_len;
1121 prev_resid = uio->uio_resid;
1122 uio->uio_resid = clip_size;
1123 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1124 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1125 }
1126 else
1127 {
1128 /*
1129 * If we come in here, we know the offset into
1130 * the file is on a pagesize boundary
1131 */
1132
1133 max_io_size = newEOF - uio->uio_offset;
1134 clip_size = uio->uio_resid;
1135 if (iov->iov_len < clip_size)
1136 clip_size = iov->iov_len;
1137 if (max_io_size < clip_size)
1138 clip_size = max_io_size;
1139
1140 if (clip_size < PAGE_SIZE)
1141 {
1142 /*
1143 * Take care of tail end of write in this vector
1144 */
1145 prev_resid = uio->uio_resid;
1146 uio->uio_resid = clip_size;
1147 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1148 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1149 }
1150 else
1151 {
1152 /* round clip_size down to a multiple of pagesize */
1153 clip_size = clip_size & ~(PAGE_MASK);
1154 prev_resid = uio->uio_resid;
1155 uio->uio_resid = clip_size;
1156 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1157 if ((retval == 0) && uio->uio_resid)
1158 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1159 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1160 }
1161 } /* end else */
1162 } /* end while */
1163 return(retval);
1164 }
1165
1166
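/*
 * cluster_nocopy_write is the direct (no data copy) write path...
 * it wires the user's pages with vm_map_get_upl, evicts any pages for
 * the same file range from the cache (UPL_POP_DUMP), and streams the
 * writes out asynchronously through cluster_io, using a clios to bound
 * the amount of outstanding I/O and to collect the first error... all
 * I/O issued against the stream is drained before returning
 */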
1167 static int
1168 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1169 struct vnode *vp;
1170 struct uio *uio;
1171 off_t newEOF;
1172 int devblocksize;
1173 int flags;
1174 {
1175 upl_t upl;
1176 upl_page_info_t *pl;
1177 off_t upl_f_offset;
1178 vm_offset_t upl_offset;
1179 off_t max_io_size;
1180 int io_size;
1181 int io_flag;
1182 int upl_size;
1183 int upl_needed_size;
1184 int pages_in_pl;
1185 int upl_flags;
1186 kern_return_t kret;
1187 struct iovec *iov;
1188 int i;
1189 int first = 1;
1190 int force_data_sync;
1191 int error = 0;
1192 struct clios iostate;
1193
1194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1195 (int)uio->uio_offset, (int)uio->uio_resid,
1196 (int)newEOF, devblocksize, 0);
1197
1198 /*
1199 * When we enter this routine, we know
1200 * -- the offset into the file is on a pagesize boundary
1201 * -- the resid is a page multiple
1202 * -- the resid will not exceed iov_len
1203 */
1204 cluster_try_push(vp, newEOF, 0, 1);
1205
1206 iostate.io_completed = 0;
1207 iostate.io_issued = 0;
1208 iostate.io_error = 0;
1209 iostate.io_wanted = 0;
1210
1211 iov = uio->uio_iov;
1212
1213 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1214 io_size = uio->uio_resid;
1215
1216 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1217 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1218
1219 if (first) {
1220 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1221 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1222 first = 0;
1223 }
1224 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1225 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1226
1227 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1228 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1229
1230 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1231 pages_in_pl = 0;
1232 upl_size = upl_needed_size;
1233 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1234 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1235
1236 kret = vm_map_get_upl(current_map(),
1237 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1238 &upl_size,
1239 &upl,
1240 NULL,
1241 &pages_in_pl,
1242 &upl_flags,
1243 force_data_sync);
1244
1245 if (kret != KERN_SUCCESS) {
1246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1247 0, 0, 0, kret, 0);
1248
1249 /*
1250 * cluster_nocopy_write: failed to get pagelist
1251 *
1252 * we may have already spun some portion of this request
1253 * off as async requests... we need to wait for the I/O
1254 * to complete before returning
1255 */
1256 goto wait_for_writes;
1257 }
1258 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1259 pages_in_pl = upl_size / PAGE_SIZE;
1260
1261 for (i = 0; i < pages_in_pl; i++) {
1262 if (!upl_valid_page(pl, i))
1263 break;
1264 }
1265 if (i == pages_in_pl)
1266 break;
1267
1268 /*
1269 * didn't get all the pages back that we
1270 * needed... release this upl and try again
1271 */
1272 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1273 UPL_ABORT_FREE_ON_EMPTY);
1274 }
1275 if (force_data_sync >= 3) {
1276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1277 i, pages_in_pl, upl_size, kret, 0);
1278
1279 /*
1280 * for some reason, we couldn't acquire a hold on all
1281 * the pages needed in the user's address space
1282 *
1283 * we may have already spun some portion of this request
1284 * off as async requests... we need to wait for the I/O
1285 * to complete before returning
1286 */
1287 goto wait_for_writes;
1288 }
1289
1290 /*
1291 * Consider the possibility that upl_size wasn't satisfied.
1292 */
1293 if (upl_size != upl_needed_size)
1294 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1295
1296 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1297 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1298
1299 if (io_size == 0) {
1300 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1301 UPL_ABORT_FREE_ON_EMPTY);
1302
1303 /*
1304 * we may have already spun some portion of this request
1305 * off as async requests... we need to wait for the I/O
1306 * to complete before returning
1307 */
1308 goto wait_for_writes;
1309 }
1310 /*
1311 * Now look for pages already in the cache
1312 * and throw them away.
1313 */
1314
1315 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1316 max_io_size = io_size;
1317
1318 while (max_io_size) {
1319 /*
1320 * Flag UPL_POP_DUMP says if the page is found
1321 * in the page cache it must be thrown away.
1322 */
1323 ubc_page_op(vp,
1324 upl_f_offset,
1325 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1326 0, 0);
1327 max_io_size -= PAGE_SIZE_64;
1328 upl_f_offset += PAGE_SIZE_64;
1329 }
1330 /*
1331 * we want to push out these writes asynchronously so that we can overlap
1332 * the preparation of the next I/O...
1333 * if there are already too many outstanding writes,
1334 * wait until some complete before issuing the next
1335 */
1336 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1337 iostate.io_wanted = 1;
1338 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1339 }
1340 if (iostate.io_error) {
1341 /*
1342 * one of the earlier writes we issued ran into a hard error...
1343 * don't issue any more writes, clean up the UPL
1344 * that was just created but not used, then
1345 * go wait for all writes that are part of this stream
1346 * to complete before returning the error to the caller
1347 */
1348 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1349 UPL_ABORT_FREE_ON_EMPTY);
1350
1351 goto wait_for_writes;
1352 }
1353 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1354
1355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1356 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1357
1358 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1359 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1360
1361 iov->iov_len -= io_size;
1362 iov->iov_base += io_size;
1363 uio->uio_resid -= io_size;
1364 uio->uio_offset += io_size;
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1367 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1368
1369 } /* end while */
1370
1371 wait_for_writes:
1372 /*
1373 * make sure all async writes issued as part of this stream
1374 * have completed before we return
1375 */
1376 while (iostate.io_issued != iostate.io_completed) {
1377 iostate.io_wanted = 1;
1378 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1379 }
1380 if (iostate.io_error)
1381 error = iostate.io_error;
1382
1383 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1384 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1385
1386 return (error);
1387 }
1388
1389
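/*
 * cluster_phys_write handles a uio whose source is physically contiguous
 * memory... the unaligned head and tail (relative to devblocksize) are
 * handled with cluster_align_phys_io, while the aligned middle is pushed
 * through cluster_io as a single synchronous CL_DEV_MEMORY transfer
 */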
1390 static int
1391 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1392 struct vnode *vp;
1393 struct uio *uio;
1394 off_t newEOF;
1395 int devblocksize;
1396 int flags;
1397 {
1398 upl_page_info_t *pl;
1399 addr64_t src_paddr;
1400 upl_t upl;
1401 vm_offset_t upl_offset;
1402 int tail_size;
1403 int io_size;
1404 int upl_size;
1405 int upl_needed_size;
1406 int pages_in_pl;
1407 int upl_flags;
1408 kern_return_t kret;
1409 struct iovec *iov;
1410 int error = 0;
1411
1412 /*
1413 * When we enter this routine, we know
1414 * -- the resid will not exceed iov_len
1415 * -- the vector target address is physically contiguous
1416 */
1417 cluster_try_push(vp, newEOF, 0, 1);
1418
1419 iov = uio->uio_iov;
1420 io_size = iov->iov_len;
1421 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1422 upl_needed_size = upl_offset + io_size;
1423
1424 pages_in_pl = 0;
1425 upl_size = upl_needed_size;
1426 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1427 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1428
1429 kret = vm_map_get_upl(current_map(),
1430 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1431 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1432
1433 if (kret != KERN_SUCCESS) {
1434 /*
1435 * cluster_phys_write: failed to get pagelist
1436 * note: return kret here
1437 */
1438 return(EINVAL);
1439 }
1440 /*
1441 * Consider the possibility that upl_size wasn't satisfied.
1442 * This is a failure in the physical memory case.
1443 */
1444 if (upl_size < upl_needed_size) {
1445 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1446 return(EINVAL);
1447 }
1448 pl = ubc_upl_pageinfo(upl);
1449
1450 src_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK);
1451
1452 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1453 int head_size;
1454
1455 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1456
1457 if (head_size > io_size)
1458 head_size = io_size;
1459
1460 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1461
1462 if (error) {
1463 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1464
1465 return(EINVAL);
1466 }
1467 upl_offset += head_size;
1468 src_paddr += head_size;
1469 io_size -= head_size;
1470 }
1471 tail_size = io_size & (devblocksize - 1);
1472 io_size -= tail_size;
1473
1474 if (io_size) {
1475 /*
1476 * issue a synchronous write to cluster_io
1477 */
1478 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1479 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1480 }
1481 if (error == 0) {
1482 /*
1483 * The cluster_io write completed successfully,
1484 * update the uio structure
1485 */
1486 uio->uio_resid -= io_size;
1487 iov->iov_len -= io_size;
1488 iov->iov_base += io_size;
1489 uio->uio_offset += io_size;
1490 src_paddr += io_size;
1491
1492 if (tail_size)
1493 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1494 }
1495 /*
1496 * just release our hold on the physically contiguous
1497 * region without changing any state
1498 */
1499 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1500
1501 return (error);
1502 }
1503
1504
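/*
 * cluster_write_x is the buffered write path... for each chunk it creates
 * a upl over the affected file range, pre-reads any partially valid edge
 * pages, zero-fills the head and/or tail regions requested by the caller,
 * copies the user data in with uiomove and then either issues the I/O
 * immediately (IO_SYNC) or folds the dirty pages into the vnode's
 * delayed-write clusters for a later push
 */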
1505 static int
1506 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1507 struct vnode *vp;
1508 struct uio *uio;
1509 off_t oldEOF;
1510 off_t newEOF;
1511 off_t headOff;
1512 off_t tailOff;
1513 int devblocksize;
1514 int flags;
1515 {
1516 upl_page_info_t *pl;
1517 upl_t upl;
1518 vm_offset_t upl_offset;
1519 int upl_size;
1520 off_t upl_f_offset;
1521 int pages_in_upl;
1522 int start_offset;
1523 int xfer_resid;
1524 int io_size;
1525 int io_flags;
1526 vm_offset_t io_address;
1527 int io_offset;
1528 int bytes_to_zero;
1529 int bytes_to_move;
1530 kern_return_t kret;
1531 int retval = 0;
1532 int uio_resid;
1533 long long total_size;
1534 long long zero_cnt;
1535 off_t zero_off;
1536 long long zero_cnt1;
1537 off_t zero_off1;
1538 daddr_t start_blkno;
1539 daddr_t last_blkno;
1540
1541 if (uio) {
1542 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1543 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1544
1545 uio_resid = uio->uio_resid;
1546 } else {
1547 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1548 0, 0, (int)oldEOF, (int)newEOF, 0);
1549
1550 uio_resid = 0;
1551 }
1552 zero_cnt = 0;
1553 zero_cnt1 = 0;
1554
1555 if (flags & IO_HEADZEROFILL) {
1556 /*
1557 * some filesystems (HFS is one) don't support unallocated holes within a file...
1558 * so we zero fill the intervening space between the old EOF and the offset
1559 * where the next chunk of real data begins.... ftruncate will also use this
1560 * routine to zero fill to the new EOF when growing a file... in this case, the
1561 * uio structure will not be provided
1562 */
1563 if (uio) {
1564 if (headOff < uio->uio_offset) {
1565 zero_cnt = uio->uio_offset - headOff;
1566 zero_off = headOff;
1567 }
1568 } else if (headOff < newEOF) {
1569 zero_cnt = newEOF - headOff;
1570 zero_off = headOff;
1571 }
1572 }
1573 if (flags & IO_TAILZEROFILL) {
1574 if (uio) {
1575 zero_off1 = uio->uio_offset + uio->uio_resid;
1576
1577 if (zero_off1 < tailOff)
1578 zero_cnt1 = tailOff - zero_off1;
1579 }
1580 }
1581 if (zero_cnt == 0 && uio == (struct uio *) 0)
1582 {
1583 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1584 retval, 0, 0, 0, 0);
1585 return (0);
1586 }
1587
1588 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1589 /*
1590 * for this iteration of the loop, figure out where our starting point is
1591 */
1592 if (zero_cnt) {
1593 start_offset = (int)(zero_off & PAGE_MASK_64);
1594 upl_f_offset = zero_off - start_offset;
1595 } else if (uio_resid) {
1596 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1597 upl_f_offset = uio->uio_offset - start_offset;
1598 } else {
1599 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1600 upl_f_offset = zero_off1 - start_offset;
1601 }
1602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1603 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1604
1605 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1606 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1607
1608 /*
1609 * compute the size of the upl needed to encompass
1610 * the requested write... limit each call to cluster_io
1611 * to the maximum UPL size... cluster_io will clip if
1612 * this exceeds the maximum io_size for the device,
1613 * make sure to account for
1614 * a starting offset that's not page aligned
1615 */
1616 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1617
1618 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1619 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1620
1621 pages_in_upl = upl_size / PAGE_SIZE;
1622 io_size = upl_size - start_offset;
1623
1624 if ((long long)io_size > total_size)
1625 io_size = total_size;
1626
1627 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1628 last_blkno = start_blkno + pages_in_upl;
1629
1630 kret = ubc_create_upl(vp,
1631 upl_f_offset,
1632 upl_size,
1633 &upl,
1634 &pl,
1635 UPL_FLAGS_NONE);
1636 if (kret != KERN_SUCCESS)
1637 panic("cluster_write: failed to get pagelist");
1638
1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1640 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1641
1642 if (start_offset && !upl_valid_page(pl, 0)) {
1643 int read_size;
1644
1645 /*
1646 * we're starting in the middle of the first page of the upl
1647 * and the page isn't currently valid, so we're going to have
1648 * to read it in first... this is a synchronous operation
1649 */
1650 read_size = PAGE_SIZE;
1651
1652 if ((upl_f_offset + read_size) > newEOF)
1653 read_size = newEOF - upl_f_offset;
1654
1655 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1656 CL_READ, (struct buf *)0, (struct clios *)0);
1657 if (retval) {
1658 /*
1659 * we had an error during the read which causes us to abort
1660 * the current cluster_write request... before we do, we need
1661 * to release the rest of the pages in the upl without modifying
1662 * their state and mark the failed page in error
1663 */
1664 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1665 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1666
1667 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1668 (int)upl, 0, 0, retval, 0);
1669 break;
1670 }
1671 }
1672 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1673 /*
1674 * the last offset we're writing to in this upl does not end on a page
1675 * boundary... if it's not beyond the old EOF, then we'll also need to
1676 * pre-read this page in if it isn't already valid
1677 */
1678 upl_offset = upl_size - PAGE_SIZE;
1679
1680 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1681 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1682 int read_size;
1683
1684 read_size = PAGE_SIZE;
1685
1686 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1687 read_size = newEOF - (upl_f_offset + upl_offset);
1688
1689 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1690 CL_READ, (struct buf *)0, (struct clios *)0);
1691 if (retval) {
1692 /*
1693 * we had an error during the read which causes us to abort
1694 * the current cluster_write request... before we do, we
1695 * need to release the rest of the pages in the upl without
1696 * modifying their state and mark the failed page in error
1697 */
1698 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1699 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1700
1701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1702 (int)upl, 0, 0, retval, 0);
1703 break;
1704 }
1705 }
1706 }
1707 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1708 panic("cluster_write: ubc_upl_map failed\n");
1709 xfer_resid = io_size;
1710 io_offset = start_offset;
1711
1712 while (zero_cnt && xfer_resid) {
1713
1714 if (zero_cnt < (long long)xfer_resid)
1715 bytes_to_zero = zero_cnt;
1716 else
1717 bytes_to_zero = xfer_resid;
1718
1719 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1720 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1721
1722 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1723 (int)upl_f_offset + io_offset, bytes_to_zero,
1724 (int)io_offset, xfer_resid, 0);
1725 } else {
1726 int zero_pg_index;
1727
1728 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1729 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1730
1731 if ( !upl_valid_page(pl, zero_pg_index)) {
1732 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1733
1734 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1735 (int)upl_f_offset + io_offset, bytes_to_zero,
1736 (int)io_offset, xfer_resid, 0);
1737
1738 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1739 !upl_dirty_page(pl, zero_pg_index)) {
1740 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1741
1742 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1743 (int)upl_f_offset + io_offset, bytes_to_zero,
1744 (int)io_offset, xfer_resid, 0);
1745 }
1746 }
1747 xfer_resid -= bytes_to_zero;
1748 zero_cnt -= bytes_to_zero;
1749 zero_off += bytes_to_zero;
1750 io_offset += bytes_to_zero;
1751 }
1752 if (xfer_resid && uio_resid) {
1753 bytes_to_move = min(uio_resid, xfer_resid);
1754
1755 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1756 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1757
1758 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1759
1760
1761 if (retval) {
1762 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1763 panic("cluster_write: kernel_upl_unmap failed\n");
1764
1765 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1766
1767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1768 (int)upl, 0, 0, retval, 0);
1769 } else {
1770 uio_resid -= bytes_to_move;
1771 xfer_resid -= bytes_to_move;
1772 io_offset += bytes_to_move;
1773 }
1774 }
1775 while (xfer_resid && zero_cnt1 && retval == 0) {
1776
1777 if (zero_cnt1 < (long long)xfer_resid)
1778 bytes_to_zero = zero_cnt1;
1779 else
1780 bytes_to_zero = xfer_resid;
1781
1782 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1783 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1784
1785 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1786 (int)upl_f_offset + io_offset,
1787 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1788 } else {
1789 int zero_pg_index;
1790
1791 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1792 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1793
1794 if ( !upl_valid_page(pl, zero_pg_index)) {
1795 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1796
1797 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1798 (int)upl_f_offset + io_offset,
1799 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1800
1801 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1802 !upl_dirty_page(pl, zero_pg_index)) {
1803 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1804
1805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1806 (int)upl_f_offset + io_offset,
1807 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1808 }
1809 }
1810 xfer_resid -= bytes_to_zero;
1811 zero_cnt1 -= bytes_to_zero;
1812 zero_off1 += bytes_to_zero;
1813 io_offset += bytes_to_zero;
1814 }
1815
1816 if (retval == 0) {
1817 int cl_index;
1818 int can_delay;
1819
1820 io_size += start_offset;
1821
1822 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1823 /*
1824 * if we're extending the file with this write
1825 * we'll zero fill the rest of the page so that
1826 * if the file gets extended again in such a way as to leave a
1827 * hole starting at this EOF, we'll have zeros in the correct spot
1828 */
1829 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1830
1831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1832 (int)upl_f_offset + io_size,
1833 upl_size - io_size, 0, 0, 0);
1834 }
1835 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1836 panic("cluster_write: kernel_upl_unmap failed\n");
1837
1838 if (flags & IO_SYNC)
1839 /*
1840 * if the IO_SYNC flag is set then we need to
1841 * bypass any clusters and immediately issue
1842 * the I/O
1843 */
1844 goto issue_io;
1845
1846 if (vp->v_clen == 0)
1847 /*
1848 * no clusters currently present
1849 */
1850 goto start_new_cluster;
1851
1852 /*
1853 * keep track of the overall dirty page
1854 * range we've developed
1855 * in case we have to fall back to the
1856 * VHASDIRTY method of flushing
1857 */
1858 if (vp->v_flag & VHASDIRTY)
1859 goto delay_io;
1860
1861 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1862 /*
1863 * we have an existing cluster... see if this write will extend it nicely
1864 */
1865 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1866 /*
1867 * the current write starts at or after the current cluster
1868 */
1869 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1870 /*
1871 * we have a write that fits entirely
1872 * within the existing cluster limits
1873 */
1874 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1875 /*
1876 * update our idea of where the cluster ends
1877 */
1878 vp->v_clusters[cl_index].last_pg = last_blkno;
1879 break;
1880 }
1881 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1882 /*
1883 * we have a write that starts in the middle of the current cluster
1884 * but extends beyond the cluster's limit
1885 * we'll clip the current cluster if we actually
1886 * overlap with the new write
1887 * and start a new cluster with the current write
1888 */
1889 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1890 vp->v_clusters[cl_index].last_pg = start_blkno;
1891 }
1892 /*
1893 * we also get here for the case where the current write starts
1894 * beyond the limit of the existing cluster
1895 *
1896 * in either case, we'll check the remaining clusters before
1897 * starting a new one
1898 */
1899 } else {
1900 /*
1901 * the current write starts in front of the current cluster
1902 */
1903 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1904 /*
1905 * we can just merge the old cluster
1906 * with the new request and leave it
1907 * in the cache
1908 */
1909 vp->v_clusters[cl_index].start_pg = start_blkno;
1910
1911 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1912 /*
1913 * the current write completely
1914 * envelops the existing cluster
1915 */
1916 vp->v_clusters[cl_index].last_pg = last_blkno;
1917 }
1918 break;
1919 }
1920
1921 /*
1922 * if we were to combine this write with the current cluster
1923 * we would exceed the cluster size limit.... so,
1924 * let's see if there's any overlap of the new I/O with
1925 * the existing cluster...
1926 *
1927 */
1928 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1929 /*
1930 * the current write extends into the existing cluster
1931 * clip the current cluster by moving the start position
1932 * to where the current write ends
1933 */
1934 vp->v_clusters[cl_index].start_pg = last_blkno;
1935 /*
1936 * if we get here, there was no way to merge
1937 * the new I/O with this cluster and
1938 * keep it under our maximum cluster length
1939 * we'll check the remaining clusters before starting a new one
1940 */
1941 }
1942 }
1943 if (cl_index < vp->v_clen)
1944 /*
1945 * we found an existing cluster that we
1946 * could merge this I/O into
1947 */
1948 goto delay_io;
1949
1950 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1951 /*
1952 * we didn't find an existing cluster to
1953 * merge into, but there's room to start
1954 * a new one
1955 */
1956 goto start_new_cluster;
1957
1958 /*
1959 * no existing cluster to merge with and no
1960 * room to start a new one... we'll try
1961 * pushing the existing ones... if none of
1962 * them are able to be pushed, we'll have
1963 * to fall back on the VHASDIRTY mechanism
1964 * cluster_try_push will set v_clen to the
1965 * number of remaining clusters if it is
1966 * unable to push all of them
1967 */
1968 if (vp->v_flag & VNOCACHE_DATA)
1969 can_delay = 0;
1970 else
1971 can_delay = 1;
1972
1973 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
1974 vp->v_flag |= VHASDIRTY;
1975 goto delay_io;
1976 }
1977 start_new_cluster:
1978 if (vp->v_clen == 0) {
1979 vp->v_ciosiz = devblocksize;
1980 vp->v_cstart = start_blkno;
1981 vp->v_lastw = last_blkno;
1982 }
1983 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
1984 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
1985 vp->v_clen++;
1986 delay_io:
1987 /*
1988 * make sure we keep v_cstart and v_lastw up to
1989 * date in case we have to fall back on the
1990 * VHASDIRTY mechanism (or we've already entered it)
1991 */
1992 if (start_blkno < vp->v_cstart)
1993 vp->v_cstart = start_blkno;
1994 if (last_blkno > vp->v_lastw)
1995 vp->v_lastw = last_blkno;
1996
1997 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1998 continue;
1999 issue_io:
2000 /*
2001 * in order to maintain some semblance of coherency with mapped writes
2002 * we need to write the cluster back out as a multiple of the PAGESIZE
2003 * unless the cluster encompasses the last page of the file... in this
2004 * case we'll round out to the nearest device block boundary
2005 */
2006 io_size = upl_size;
2007
2008 if ((upl_f_offset + io_size) > newEOF) {
2009 io_size = newEOF - upl_f_offset;
2010 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2011 }
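/*
 * worked example of the round-up above (assuming devblocksize is a
 * power of 2, which the mask arithmetic requires): with a 512 byte
 * device block and 1000 bytes left before newEOF,
 *     (1000 + 511) & ~511 == 1024
 * so the final write is padded out to the next device block boundary
 */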
2012
2013 if (flags & IO_SYNC)
2014 io_flags = CL_COMMIT | CL_AGE;
2015 else
2016 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2017
2018 if (vp->v_flag & VNOCACHE_DATA)
2019 io_flags |= CL_DUMP;
2020
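/*
 * simple write throttle: if this vnode already has ASYNC_THROTTLE or
 * more async writes outstanding, mark it VTHROTTLED and sleep on
 * &vp->v_numoutput... the I/O completion path is expected to clear
 * VTHROTTLED and issue the wakeup once enough of the outstanding
 * writes have drained (that side is not shown in this excerpt)
 */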
2021 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2022 vp->v_flag |= VTHROTTLED;
2023 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2024 }
2025 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2026 io_flags, (struct buf *)0, (struct clios *)0);
2027 }
2028 }
2029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2030 retval, 0, 0, 0, 0);
2031
2032 return (retval);
2033 }
2034
2035 int
2036 cluster_read(vp, uio, filesize, devblocksize, flags)
2037 struct vnode *vp;
2038 struct uio *uio;
2039 off_t filesize;
2040 int devblocksize;
2041 int flags;
2042 {
2043 int prev_resid;
2044 int clip_size;
2045 off_t max_io_size;
2046 struct iovec *iov;
2047 vm_offset_t upl_offset;
2048 int upl_size;
2049 int pages_in_pl;
2050 upl_page_info_t *pl;
2051 int upl_flags;
2052 upl_t upl;
2053 int retval = 0;
2054
2055 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2056 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2057
2058 /*
2059 * the nocopy read path below only applies to uncached (VNOCACHE_DATA)
2060 * reads from user space... everything else goes through cluster_read_x
2061 */
2062
2063 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2064 {
2065 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2067 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2068 return(retval);
2069 }
2070
2071 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2072 {
2073 /* we know we have a resid, so this is safe */
2074 iov = uio->uio_iov;
2075 while (iov->iov_len == 0) {
2076 uio->uio_iov++;
2077 uio->uio_iovcnt--;
2078 iov = uio->uio_iov;
2079 }
2080
2081 /*
2082 * We check every vector target and if it is physically
2083 * contiguous space, we skip the sanity checks.
2084 */
2085
2086 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2087 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2088 pages_in_pl = 0;
2089 upl_flags = UPL_QUERY_OBJECT_TYPE;
2090 if((vm_map_get_upl(current_map(),
2091 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2092 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2093 {
2094 /*
2095 * the user app must have passed in an invalid address
2096 */
2097 return (EFAULT);
2098 }
2099
2100 if (upl_flags & UPL_PHYS_CONTIG)
2101 {
2102 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2103 }
2104 else if (uio->uio_resid < 4 * PAGE_SIZE)
2105 {
2106 /*
2107 * We set a threshold of 4 pages to decide if the nocopy
2108 * read loop is worth the trouble...
2109 */
2110 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2112 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2113 return(retval);
2114 }
2115 else if (uio->uio_offset & PAGE_MASK_64)
2116 {
2117 /* Bring the file offset read up to a pagesize boundary */
2118 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2119 if (uio->uio_resid < clip_size)
2120 clip_size = uio->uio_resid;
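/*
 * example: with a 4K page size and a file offset of 0x11234,
 * clip_size is 0x1000 - 0x234 == 0xdcc bytes... this first pass
 * through cluster_read_x leaves uio->uio_offset page aligned
 * for the nocopy path on the next iteration
 */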
2121 /*
2122 * Fake the resid going into the cluster_read_x call
2123 * and restore it on the way out.
2124 */
2125 prev_resid = uio->uio_resid;
2126 uio->uio_resid = clip_size;
2127 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2128 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2129 }
2130 else if ((int)iov->iov_base & PAGE_MASK_64)
2131 {
2132 clip_size = iov->iov_len;
2133 prev_resid = uio->uio_resid;
2134 uio->uio_resid = clip_size;
2135 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2136 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2137 }
2138 else
2139 {
2140 /*
2141 * If we come in here, we know the offset into
2142 * the file is on a pagesize boundary
2143 */
2144
2145 max_io_size = filesize - uio->uio_offset;
2146 clip_size = uio->uio_resid;
2147 if (iov->iov_len < clip_size)
2148 clip_size = iov->iov_len;
2149 if (max_io_size < clip_size)
2150 clip_size = (int)max_io_size;
2151
2152 if (clip_size < PAGE_SIZE)
2153 {
2154 /*
2155 * Take care of the tail end of the read in this vector.
2156 */
2157 prev_resid = uio->uio_resid;
2158 uio->uio_resid = clip_size;
2159 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2160 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2161 }
2162 else
2163 {
2164 /* round clip_size down to a multiple of pagesize */
2165 clip_size = clip_size & ~(PAGE_MASK);
2166 prev_resid = uio->uio_resid;
2167 uio->uio_resid = clip_size;
2168 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2169 if ((retval==0) && uio->uio_resid)
2170 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2171 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2172 }
2173 } /* end else */
2174 } /* end while */
2175
2176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2177 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2178
2179 return(retval);
2180 }
2181
2182
2183 static int
2184 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2185 struct vnode *vp;
2186 struct uio *uio;
2187 off_t filesize;
2188 int devblocksize;
2189 int flags;
2190 {
2191 upl_page_info_t *pl;
2192 upl_t upl;
2193 vm_offset_t upl_offset;
2194 int upl_size;
2195 off_t upl_f_offset;
2196 int start_offset;
2197 int start_pg;
2198 int last_pg;
2199 int uio_last;
2200 int pages_in_upl;
2201 off_t max_size;
2202 int io_size;
2203 vm_offset_t io_address;
2204 kern_return_t kret;
2205 int segflg;
2206 int error = 0;
2207 int retval = 0;
2208 int b_lblkno;
2209 int e_lblkno;
2210
2211 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2212
2213 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2214 /*
2215 * compute the size of the upl needed to encompass
2216 * the requested read... limit each call to cluster_io
2217 * to the maximum UPL size... cluster_io will clip if
2218 * this exceeds the maximum io_size for the device...
2219 * make sure to account for a starting offset
2220 * that's not page aligned
2221 */
2222 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2223 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2224 max_size = filesize - uio->uio_offset;
2225
2226 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2227 io_size = uio->uio_resid;
2228 else
2229 io_size = max_size;
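/*
 * example: with a 4K page size and uio->uio_offset == 0x11234,
 * start_offset is 0x234 and upl_f_offset is 0x11000 (the start of
 * the page containing the offset)... io_size is the smaller of what
 * the caller asked for and what remains before end of file
 */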
2230
2231 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2232 segflg = uio->uio_segflg;
2233
2234 uio->uio_segflg = UIO_PHYS_USERSPACE;
2235
2236 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2237 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2238
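/*
 * fast path for pages already resident in the ubc: ubc_page_op looks
 * up the page at upl_f_offset and, if present, returns its physical
 * page number with the busy bit set... uiomove64 then copies directly
 * from that physical page into the user's buffer and the busy bit is
 * cleared... the first miss drops us out of this loop and into the
 * upl-based path below
 */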
2239 while (io_size && retval == 0) {
2240 int xsize;
2241 ppnum_t paddr;
2242
2243 if (ubc_page_op(vp,
2244 upl_f_offset,
2245 UPL_POP_SET | UPL_POP_BUSY,
2246 &paddr, 0) != KERN_SUCCESS)
2247 break;
2248
2249 xsize = PAGE_SIZE - start_offset;
2250
2251 if (xsize > io_size)
2252 xsize = io_size;
2253
2254 retval = uiomove64((addr64_t)(((addr64_t)paddr << 12) + start_offset), xsize, uio);
2255
2256 ubc_page_op(vp, upl_f_offset,
2257 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2258
2259 io_size -= xsize;
2260 start_offset = (int)
2261 (uio->uio_offset & PAGE_MASK_64);
2262 upl_f_offset = uio->uio_offset - start_offset;
2263 }
2264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2265 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2266
2267 uio->uio_segflg = segflg;
2268
2269 if (retval)
2270 break;
2271
2272 if (io_size == 0) {
2273 /*
2274 * we're already finished with this read request
2275 * let's see if we should do a read-ahead
2276 */
2277 e_lblkno = (int)
2278 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2279
2280 if (!(vp->v_flag & VRAOFF))
2281 /*
2282 * let's try to read ahead if we're in
2283 * a sequential access pattern
2284 */
2285 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2286 vp->v_lastr = e_lblkno;
2287
2288 break;
2289 }
2290 max_size = filesize - uio->uio_offset;
2291 }
2292 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2293 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2294 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2295 pages_in_upl = upl_size / PAGE_SIZE;
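/*
 * example: with start_offset == 0x234 and io_size == 0x3000,
 * upl_size rounds up to 0x4000 (4 pages)... the upl is then capped
 * at MAX_UPL_TRANSFER pages, so a single pass never maps more than
 * that many pages at once
 */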
2296
2297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2298 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2299
2300 kret = ubc_create_upl(vp,
2301 upl_f_offset,
2302 upl_size,
2303 &upl,
2304 &pl,
2305 UPL_FLAGS_NONE);
2306 if (kret != KERN_SUCCESS)
2307 panic("cluster_read: failed to get pagelist");
2308
2309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2310 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2311
2312 /*
2313 * scan from the beginning of the upl looking for the first
2314 * non-valid page.... this will become the first page in
2315 * the request we're going to make to 'cluster_io'... if all
2316 * of the pages are valid, we won't call through to 'cluster_io'
2317 */
2318 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2319 if (!upl_valid_page(pl, start_pg))
2320 break;
2321 }
2322
2323 /*
2324 * scan from the starting invalid page looking for a valid
2325 * page before the end of the upl is reached, if we
2326 * find one, then it will be the last page of the request to
2327 * 'cluster_io'
2328 */
2329 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2330 if (upl_valid_page(pl, last_pg))
2331 break;
2332 }
2333
2334 if (start_pg < last_pg) {
2335 /*
2336 * we found a range of 'invalid' pages that must be filled
2337 * if the last page in this range is the last page of the file
2338 * we may have to clip the size of it to keep from reading past
2339 * the end of the last physical block associated with the file
2340 */
2341 upl_offset = start_pg * PAGE_SIZE;
2342 io_size = (last_pg - start_pg) * PAGE_SIZE;
2343
2344 if ((upl_f_offset + upl_offset + io_size) > filesize)
2345 io_size = filesize - (upl_f_offset + upl_offset);
2346
2347 /*
2348 * issue a synchronous read to cluster_io
2349 */
2350
2351 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2352 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2353 }
2354 if (error == 0) {
2355 /*
2356 * if the read completed successfully, or there was no I/O request
2357 * issued, then map the upl into kernel address space and
2358 * move the data into user land.... we'll first add on any 'valid'
2359 * pages that were present in the upl when we acquired it.
2360 */
2361 u_int val_size;
2362 u_int size_of_prefetch;
2363
2364 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2365 if (!upl_valid_page(pl, uio_last))
2366 break;
2367 }
2368 /*
2369 * compute size to transfer this round, if uio->uio_resid is
2370 * still non-zero after this uiomove, we'll loop around and
2371 * set up for another I/O.
2372 */
2373 val_size = (uio_last * PAGE_SIZE) - start_offset;
2374
2375 if (max_size < val_size)
2376 val_size = max_size;
2377
2378 if (uio->uio_resid < val_size)
2379 val_size = uio->uio_resid;
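/*
 * val_size is now the number of bytes we can copy out on this pass:
 * everything from the caller's offset through the last page in the
 * upl that is now valid (either just read in or already resident),
 * clipped to both the file size and the remaining resid
 */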
2380
2381 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2382
2383 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2384 /*
2385 * if there's still I/O left to do for this request, then issue a
2386 * pre-fetch I/O... the I/O wait time will overlap
2387 * with the copying of the data
2388 */
2389 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2390 } else {
2391 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2392 /*
2393 * let's try to read ahead if we're in
2394 * a sequential access pattern
2395 */
2396 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2397 vp->v_lastr = e_lblkno;
2398 }
2399 if (uio->uio_segflg == UIO_USERSPACE) {
2400 int offset;
2401
2402 segflg = uio->uio_segflg;
2403
2404 uio->uio_segflg = UIO_PHYS_USERSPACE;
2405
2406
2407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2408 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2409
2410 offset = start_offset;
2411
2412 while (val_size && retval == 0) {
2413 int csize;
2414 int i;
2415 addr64_t paddr;
2416
2417 i = offset / PAGE_SIZE;
2418 csize = min(PAGE_SIZE - start_offset, val_size);
2419
2420 paddr = ((addr64_t)upl_phys_page(pl, i) << 12) + start_offset;
2421
2422 retval = uiomove64(paddr, csize, uio);
2423
2424 val_size -= csize;
2425 offset += csize;
2426 start_offset = offset & PAGE_MASK;
2427 }
2428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2429 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2430
2431 uio->uio_segflg = segflg;
2432 }
2433 else
2434 {
2435 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2436 panic("cluster_read: ubc_upl_map() failed\n");
2437
2438 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2439
2440 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2441 panic("cluster_read: ubc_upl_unmap() failed\n");
2442 }
2443 }
2444 if (start_pg < last_pg) {
2445 /*
2446 * compute the range of pages that we actually issued an I/O for
2447 * and either commit them as valid if the I/O succeeded
2448 * or abort them if the I/O failed
2449 */
2450 io_size = (last_pg - start_pg) * PAGE_SIZE;
2451
2452 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2453 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2454
2455 if (error || (vp->v_flag & VNOCACHE_DATA))
2456 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2457 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2458 else
2459 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2460 UPL_COMMIT_CLEAR_DIRTY
2461 | UPL_COMMIT_FREE_ON_EMPTY
2462 | UPL_COMMIT_INACTIVATE);
2463
2464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2465 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2466 }
2467 if ((last_pg - start_pg) < pages_in_upl) {
2468 int cur_pg;
2469 int commit_flags;
2470
2471 /*
2472 * the set of pages that we issued an I/O for did not encompass
2473 * the entire upl... so just release these without modifying
2474 * their state
2475 */
2476 if (error)
2477 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2478 else {
2479 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2480 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2481
2482 if (start_pg) {
2483 /*
2484 * we found some already valid pages at the beginning of
2485 * the upl... commit these back to the inactive list with
2486 * reference cleared
2487 */
2488 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2489 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2490 | UPL_COMMIT_INACTIVATE;
2491
2492 if (upl_dirty_page(pl, cur_pg))
2493 commit_flags |= UPL_COMMIT_SET_DIRTY;
2494
2495 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2496 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2497 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2498 else
2499 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2500 PAGE_SIZE, commit_flags);
2501 }
2502 }
2503 if (last_pg < uio_last) {
2504 /*
2505 * we found some already valid pages immediately after the
2506 * pages we issued I/O for, commit these back to the
2507 * inactive list with reference cleared
2508 */
2509 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2510 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2511 | UPL_COMMIT_INACTIVATE;
2512
2513 if (upl_dirty_page(pl, cur_pg))
2514 commit_flags |= UPL_COMMIT_SET_DIRTY;
2515
2516 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2517 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2518 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2519 else
2520 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2521 PAGE_SIZE, commit_flags);
2522 }
2523 }
2524 if (uio_last < pages_in_upl) {
2525 /*
2526 * there were some invalid pages beyond the valid pages
2527 * that we didn't issue an I/O for, just release them
2528 * unchanged
2529 */
2530 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2531 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2532 }
2533
2534 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2535 (int)upl, -1, -1, 0, 0);
2536 }
2537 }
2538 if (retval == 0)
2539 retval = error;
2540 }
2541
2542 return (retval);
2543 }
2544
2545
2546 static int
2547 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2548 struct vnode *vp;
2549 struct uio *uio;
2550 off_t filesize;
2551 int devblocksize;
2552 int flags;
2553 {
2554 upl_t upl;
2555 upl_page_info_t *pl;
2556 off_t upl_f_offset;
2557 vm_offset_t upl_offset;
2558 off_t start_upl_f_offset;
2559 off_t max_io_size;
2560 int io_size;
2561 int upl_size;
2562 int upl_needed_size;
2563 int pages_in_pl;
2564 ppnum_t paddr;
2565 int upl_flags;
2566 kern_return_t kret;
2567 int segflg;
2568 struct iovec *iov;
2569 int i;
2570 int force_data_sync;
2571 int retval = 0;
2572 int first = 1;
2573 struct clios iostate;
2574
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2576 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2577
2578 /*
2579 * When we enter this routine, we know
2580 * -- the offset into the file is on a pagesize boundary
2581 * -- the resid is a page multiple
2582 * -- the resid will not exceed iov_len
2583 */
2584
2585 iostate.io_completed = 0;
2586 iostate.io_issued = 0;
2587 iostate.io_error = 0;
2588 iostate.io_wanted = 0;
2589
2590 iov = uio->uio_iov;
2591
2592 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2593
2594 max_io_size = filesize - uio->uio_offset;
2595
2596 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2597 io_size = max_io_size;
2598 else
2599 io_size = uio->uio_resid;
2600
2601 /*
2602 * We don't come into this routine unless
2603 * UIO_USERSPACE is set.
2604 */
2605 segflg = uio->uio_segflg;
2606
2607 uio->uio_segflg = UIO_PHYS_USERSPACE;
2608
2609 /*
2610 * First look for pages already in the cache
2611 * and move them to user space.
2612 */
2613 while (io_size && (retval == 0)) {
2614 upl_f_offset = uio->uio_offset;
2615
2616 /*
2617 * If this call fails, it means the page is not
2618 * in the page cache.
2619 */
2620 if (ubc_page_op(vp, upl_f_offset,
2621 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2622 break;
2623
2624 retval = uiomove64((addr64_t)paddr << 12, PAGE_SIZE, uio);
2625
2626 ubc_page_op(vp, upl_f_offset,
2627 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2628
2629 io_size -= PAGE_SIZE;
2630 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2631 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2632 }
2633 uio->uio_segflg = segflg;
2634
2635 if (retval) {
2636 /*
2637 * we may have already spun some portion of this request
2638 * off as async requests... we need to wait for the I/O
2639 * to complete before returning
2640 */
2641 goto wait_for_reads;
2642 }
2643 /*
2644 * If we are already finished with this read, then return
2645 */
2646 if (io_size == 0) {
2647 /*
2648 * we may have already spun some portion of this request
2649 * off as async requests... we need to wait for the I/O
2650 * to complete before returning
2651 */
2652 goto wait_for_reads;
2653 }
2654 max_io_size = io_size;
2655
2656 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2657 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2658 if (first) {
2659 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2660 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2661 first = 0;
2662 }
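/*
 * note: the first pass through this loop is clamped to a fraction of
 * the maximum UPL size (compared against 1/4 of it, clamped to 1/8),
 * presumably so the first bytes get back to the caller with lower
 * latency before the larger overlapped reads start
 */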
2663 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2664 upl_f_offset = start_upl_f_offset;
2665 io_size = 0;
2666
2667 while (io_size < max_io_size) {
2668 if (ubc_page_op(vp, upl_f_offset,
2669 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2670 ubc_page_op(vp, upl_f_offset,
2671 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2672 break;
2673 }
2674 /*
2675 * Build up the io request parameters.
2676 */
2677 io_size += PAGE_SIZE_64;
2678 upl_f_offset += PAGE_SIZE_64;
2679 }
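/*
 * the loop above sized the direct read to cover only the run of pages
 * that are NOT already in the cache... the first resident page
 * terminates the run, and will be copied out of the cache by the fast
 * path at the top of the next iteration of the outer loop
 */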
2680 if (io_size == 0)
2681 /*
2682 * we may have already spun some portion of this request
2683 * off as async requests... we need to wait for the I/O
2684 * to complete before returning
2685 */
2686 goto wait_for_reads;
2687
2688 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2689 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
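/*
 * example: with iov_base == 0x2345678 and io_size == 0x2000,
 * upl_offset is 0x678 and upl_needed_size rounds up to 0x3000,
 * i.e. enough whole pages to cover the user buffer even though it
 * doesn't start on a page boundary
 */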
2690
2691 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2692 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2693
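/*
 * try up to three times to get a upl that maps the user's buffer with
 * every page valid... each retry passes a larger force_data_sync value
 * to vm_map_get_upl, presumably pushing the pager harder to produce a
 * fully valid mapping... if some pages still aren't valid after the
 * third attempt, give up on this pass and just wait for the reads
 * already in flight
 */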
2694 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2695 pages_in_pl = 0;
2696 upl_size = upl_needed_size;
2697 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2698
2699 kret = vm_map_get_upl(current_map(),
2700 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2701 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2702
2703 if (kret != KERN_SUCCESS) {
2704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2705 (int)upl_offset, upl_size, io_size, kret, 0);
2706
2707 /*
2708 * cluster_nocopy_read: failed to get pagelist
2709 *
2710 * we may have already spun some portion of this request
2711 * off as async requests... we need to wait for the I/O
2712 * to complete before returning
2713 */
2714 goto wait_for_reads;
2715 }
2716 pages_in_pl = upl_size / PAGE_SIZE;
2717 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2718
2719 for (i = 0; i < pages_in_pl; i++) {
2720 if (!upl_valid_page(pl, i))
2721 break;
2722 }
2723 if (i == pages_in_pl)
2724 break;
2725
2726 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2727 UPL_ABORT_FREE_ON_EMPTY);
2728 }
2729 if (force_data_sync >= 3) {
2730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2731 (int)upl_offset, upl_size, io_size, kret, 0);
2732
2733 goto wait_for_reads;
2734 }
2735 /*
2736 * Consider the possibility that upl_size wasn't satisfied.
2737 */
2738 if (upl_size != upl_needed_size)
2739 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2740
2741 if (io_size == 0) {
2742 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2743 UPL_ABORT_FREE_ON_EMPTY);
2744 goto wait_for_reads;
2745 }
2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2747 (int)upl_offset, upl_size, io_size, kret, 0);
2748
2749 /*
2750 * request asynchronously so that we can overlap
2751 * the preparation of the next I/O
2752 * if there are already too many outstanding reads
2753 * wait until some have completed before issuing the next read
2754 */
2755 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2756 iostate.io_wanted = 1;
2757 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2758 }
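/*
 * note: judging by the comparison against a byte quantity
 * (2 * MAX_UPL_TRANSFER * PAGE_SIZE), io_issued and io_completed
 * track bytes of read I/O, so this bounds the amount of data in
 * flight rather than the number of outstanding requests
 */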
2759 if (iostate.io_error) {
2760 /*
2761 * one of the earlier reads we issued ran into a hard error
2762 * don't issue any more reads... clean up the UPL
2763 * that was just created but not used, then
2764 * go wait for any other reads to complete before
2765 * returning the error to the caller
2766 */
2767 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2768 UPL_ABORT_FREE_ON_EMPTY);
2769
2770 goto wait_for_reads;
2771 }
2772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2773 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2774
2775 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2776 io_size, devblocksize,
2777 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2778 (struct buf *)0, &iostate);
2779
2780 /*
2781 * update the uio structure
2782 */
2783 iov->iov_base += io_size;
2784 iov->iov_len -= io_size;
2785 uio->uio_resid -= io_size;
2786 uio->uio_offset += io_size;
2787
2788 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2789 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2790
2791 } /* end while */
2792
2793 wait_for_reads:
2794 /*
2795 * make sure all async reads that are part of this stream
2796 * have completed before we return
2797 */
2798 while (iostate.io_issued != iostate.io_completed) {
2799 iostate.io_wanted = 1;
2800 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2801 }
2802 if (iostate.io_error)
2803 retval = iostate.io_error;
2804
2805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2806 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2807
2808 return (retval);
2809 }
2810
2811
2812 static int
2813 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2814 struct vnode *vp;
2815 struct uio *uio;
2816 off_t filesize;
2817 int devblocksize;
2818 int flags;
2819 {
2820 upl_page_info_t *pl;
2821 upl_t upl;
2822 vm_offset_t upl_offset;
2823 addr64_t dst_paddr;
2824 off_t max_size;
2825 int io_size;
2826 int tail_size;
2827 int upl_size;
2828 int upl_needed_size;
2829 int pages_in_pl;
2830 int upl_flags;
2831 kern_return_t kret;
2832 struct iovec *iov;
2833 struct clios iostate;
2834 int error;
2835
2836 /*
2837 * When we enter this routine, we know
2838 * -- the resid will not exceed iov_len
2839 * -- the target address is physically contiguous
2840 */
2841
2842 iov = uio->uio_iov;
2843
2844 max_size = filesize - uio->uio_offset;
2845
2846 if (max_size > (off_t)((unsigned int)iov->iov_len))
2847 io_size = iov->iov_len;
2848 else
2849 io_size = max_size;
2850
2851 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2852 upl_needed_size = upl_offset + io_size;
2853
2854 error = 0;
2855 pages_in_pl = 0;
2856 upl_size = upl_needed_size;
2857 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2858
2859 kret = vm_map_get_upl(current_map(),
2860 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2861 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2862
2863 if (kret != KERN_SUCCESS) {
2864 /*
2865 * cluster_phys_read: failed to get pagelist
2866 */
2867 return(EINVAL);
2868 }
2869 if (upl_size < upl_needed_size) {
2870 /*
2871 * The upl_size wasn't satisfied.
2872 */
2873 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2874
2875 return(EINVAL);
2876 }
2877 pl = ubc_upl_pageinfo(upl);
2878
2879 dst_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK);
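/*
 * upl_phys_page returns a physical page number... shifting it left by
 * 12 converts it to a byte address (4K pages), and adding the
 * sub-page offset of iov_base yields the physical address of the
 * start of the user's buffer
 */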
2880
2881 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2882 int head_size;
2883
2884 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2885
2886 if (head_size > io_size)
2887 head_size = io_size;
2888
2889 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2890
2891 if (error) {
2892 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2893
2894 return(EINVAL);
2895 }
2896 upl_offset += head_size;
2897 dst_paddr += head_size;
2898 io_size -= head_size;
2899 }
2900 tail_size = io_size & (devblocksize - 1);
2901 io_size -= tail_size;
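/*
 * at this point any partial device block at the front of the request
 * has been handled by cluster_align_phys_io above, and the partial
 * block at the end (tail_size) has been set aside to be handled the
 * same way after the main loop... what remains in io_size is a whole
 * number of device blocks
 */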
2902
2903 iostate.io_completed = 0;
2904 iostate.io_issued = 0;
2905 iostate.io_error = 0;
2906 iostate.io_wanted = 0;
2907
2908 while (io_size && error == 0) {
2909 int xsize;
2910
2911 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2912 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2913 else
2914 xsize = io_size;
2915 /*
2916 * request asynchronously so that we can overlap
2917 * the preparation of the next I/O... we'll do
2918 * the commit after all the I/O has completed
2919 * since it's all issued against the same UPL
2920 * if there are already too many outstanding reads
2921 * wait until some have completed before issuing the next
2922 */
2923 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2924 iostate.io_wanted = 1;
2925 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2926 }
2927
2928 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2929 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
2930 (struct buf *)0, &iostate);
2931 /*
2932 * if the cluster_io read was issued successfully,
2933 * update the uio structure
2934 */
2935 if (error == 0) {
2936 uio->uio_resid -= xsize;
2937 iov->iov_len -= xsize;
2938 iov->iov_base += xsize;
2939 uio->uio_offset += xsize;
2940 dst_paddr += xsize;
2941 upl_offset += xsize;
2942 io_size -= xsize;
2943 }
2944 }
2945 /*
2946 * make sure all async reads that are part of this stream
2947 * have completed before we proceed
2948 */
2949 while (iostate.io_issued != iostate.io_completed) {
2950 iostate.io_wanted = 1;
2951 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2952 }
2953 if (iostate.io_error) {
2954 error = iostate.io_error;
2955 }
2956 if (error == 0 && tail_size)
2957 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
2958
2959 /*
2960 * just release our hold on the physically contiguous
2961 * region without changing any state
2962 */
2963 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2964
2965 return (error);
2966 }
2967
2968
2969 /*
2970 * generate advisory I/O's in the largest chunks possible
2971 * the completed pages will be released into the VM cache
2972 */
2973 int
2974 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2975 struct vnode *vp;
2976 off_t filesize;
2977 off_t f_offset;
2978 int resid;
2979 int devblocksize;
2980 {
2981 upl_page_info_t *pl;
2982 upl_t upl;
2983 vm_offset_t upl_offset;
2984 int upl_size;
2985 off_t upl_f_offset;
2986 int start_offset;
2987 int start_pg;
2988 int last_pg;
2989 int pages_in_upl;
2990 off_t max_size;
2991 int io_size;
2992 kern_return_t kret;
2993 int retval = 0;
2994 int issued_io;
2995
2996 if (!UBCINFOEXISTS(vp))
2997 return(EINVAL);
2998
2999 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3000 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3001
3002 while (resid && f_offset < filesize && retval == 0) {
3003 /*
3004 * compute the size of the upl needed to encompass
3005 * the requested read... limit each call to cluster_io
3006 * to the maximum UPL size... cluster_io will clip if
3007 * this exceeds the maximum io_size for the device...
3008 * make sure to account for a starting offset
3009 * that's not page aligned
3010 */
3011 start_offset = (int)(f_offset & PAGE_MASK_64);
3012 upl_f_offset = f_offset - (off_t)start_offset;
3013 max_size = filesize - f_offset;
3014
3015 if (resid < max_size)
3016 io_size = resid;
3017 else
3018 io_size = max_size;
3019
3020 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3021 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3022 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3023 pages_in_upl = upl_size / PAGE_SIZE;
3024
3025 kret = ubc_create_upl(vp,
3026 upl_f_offset,
3027 upl_size,
3028 &upl,
3029 &pl,
3030 UPL_RET_ONLY_ABSENT);
3031 if (kret != KERN_SUCCESS)
3032 return(retval);
3033 issued_io = 0;
3034
3035 /*
3036 * before we start marching forward, we must make sure we end on
3037 * a present page, otherwise we will be working with a freed
3038 * upl
3039 */
3040 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3041 if (upl_page_present(pl, last_pg))
3042 break;
3043 }
3044 pages_in_upl = last_pg + 1;
3045
3046
3047 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3048 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3049
3050
3051 for (last_pg = 0; last_pg < pages_in_upl; ) {
3052 /*
3053 * scan from the beginning of the upl looking for the first
3054 * page that is present.... this will become the first page in
3055 * the request we're going to make to 'cluster_io'... if all
3056 * of the pages are absent, we won't call through to 'cluster_io'
3057 */
3058 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3059 if (upl_page_present(pl, start_pg))
3060 break;
3061 }
3062
3063 /*
3064 * scan from the starting present page looking for an absent
3065 * page before the end of the upl is reached, if we
3066 * find one, then it will terminate the range of pages being
3067 * presented to 'cluster_io'
3068 */
3069 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3070 if (!upl_page_present(pl, last_pg))
3071 break;
3072 }
3073
3074 if (last_pg > start_pg) {
3075 /*
3076 * we found a range of pages that must be filled
3077 * if the last page in this range is the last page of the file
3078 * we may have to clip the size of it to keep from reading past
3079 * the end of the last physical block associated with the file
3080 */
3081 upl_offset = start_pg * PAGE_SIZE;
3082 io_size = (last_pg - start_pg) * PAGE_SIZE;
3083
3084 if ((upl_f_offset + upl_offset + io_size) > filesize)
3085 io_size = filesize - (upl_f_offset + upl_offset);
3086
3087 /*
3088 * issue an asynchronous read to cluster_io
3089 */
3090 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3091 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3092
3093 issued_io = 1;
3094 }
3095 }
3096 if (issued_io == 0)
3097 ubc_upl_abort(upl, 0);
3098
3099 io_size = upl_size - start_offset;
3100
3101 if (io_size > resid)
3102 io_size = resid;
3103 f_offset += io_size;
3104 resid -= io_size;
3105 }
3106
3107 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3108 (int)f_offset, resid, retval, 0, 0);
3109
3110 return(retval);
3111 }
3112
3113
3114 int
3115 cluster_push(vp)
3116 struct vnode *vp;
3117 {
3118 int retval;
3119
3120 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3121 vp->v_flag &= ~VHASDIRTY;
3122 return(0);
3123 }
3124
3125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3126 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3127
3128 if (vp->v_flag & VHASDIRTY) {
3129 daddr_t start_pg;
3130 daddr_t last_pg;
3131 daddr_t end_pg;
3132
3133 start_pg = vp->v_cstart;
3134 end_pg = vp->v_lastw;
3135
3136 vp->v_flag &= ~VHASDIRTY;
3137 vp->v_clen = 0;
3138
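/*
 * in the VHASDIRTY case the individual cluster boundaries have been
 * lost, so sweep the entire dirty range recorded in v_cstart..v_lastw,
 * pushing it in MAX_UPL_TRANSFER page chunks
 */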
3139 while (start_pg < end_pg) {
3140 last_pg = start_pg + MAX_UPL_TRANSFER;
3141
3142 if (last_pg > end_pg)
3143 last_pg = end_pg;
3144
3145 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3146
3147 start_pg = last_pg;
3148 }
3149 return (1);
3150 }
3151 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3152
3153 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3154 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3155
3156 return (retval);
3157 }
3158
3159
3160 static int
3161 cluster_try_push(vp, EOF, can_delay, push_all)
3162 struct vnode *vp;
3163 off_t EOF;
3164 int can_delay;
3165 int push_all;
3166 {
3167 int cl_index;
3168 int cl_index1;
3169 int min_index;
3170 int cl_len;
3171 int cl_total;
3172 int cl_pushed;
3173 struct v_cluster l_clusters[MAX_CLUSTERS];
3174
3175 /*
3176 * make a local 'sorted' copy of the clusters
3177 * and clear vp->v_clen so that new clusters can
3178 * be developed
3179 */
3180 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3181 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3182 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3183 continue;
3184 if (min_index == -1)
3185 min_index = cl_index1;
3186 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3187 min_index = cl_index1;
3188 }
3189 if (min_index == -1)
3190 break;
3191 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3192 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3193
3194 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3195 }
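/*
 * the loop above is a small selection sort: on each pass it picks the
 * remaining cluster with the lowest start_pg, copies it into
 * l_clusters, and marks the original as consumed by setting its
 * start_pg equal to its last_pg (the same sentinel used elsewhere to
 * mean an empty cluster)
 */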
3196 cl_len = cl_index;
3197 vp->v_clen = 0;
3198
3199 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3200 /*
3201 * try to push each cluster in turn... cluster_push_x may not
3202 * push the cluster if can_delay is TRUE and the cluster doesn't
3203 * meet the criteria for an immediate push
3204 */
3205 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3206 l_clusters[cl_index].start_pg = 0;
3207 l_clusters[cl_index].last_pg = 0;
3208
3209 cl_pushed++;
3210
3211 if (push_all == 0)
3212 break;
3213 }
3214 }
3215 if (cl_len > cl_pushed) {
3216 /*
3217 * we didn't push all of the clusters, so
3218 * let's try to merge them back into the vnode
3219 */
3220 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3221 /*
3222 * we picked up some new clusters while we were trying to
3223 * push the old ones (I don't think this can happen because
3224 * I'm holding the lock, but just in case)... the sum of the
3225 * leftovers plus the new cluster count exceeds our ability
3226 * to represent them, so fall back to the VHASDIRTY mechanism
3227 */
3228 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3229 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3230 continue;
3231
3232 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3233 vp->v_cstart = l_clusters[cl_index].start_pg;
3234 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3235 vp->v_lastw = l_clusters[cl_index].last_pg;
3236 }
3237 vp->v_flag |= VHASDIRTY;
3238 } else {
3239 /*
3240 * we've got room to merge the leftovers back in
3241 * just append them starting at the next 'hole'
3242 * represented by vp->v_clen
3243 */
3244 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3245 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3246 continue;
3247
3248 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3249 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3250
3251 if (cl_index1 == 0) {
3252 vp->v_cstart = l_clusters[cl_index].start_pg;
3253 vp->v_lastw = l_clusters[cl_index].last_pg;
3254 } else {
3255 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3256 vp->v_cstart = l_clusters[cl_index].start_pg;
3257 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3258 vp->v_lastw = l_clusters[cl_index].last_pg;
3259 }
3260 cl_index1++;
3261 }
3262 /*
3263 * update the cluster count
3264 */
3265 vp->v_clen = cl_index1;
3266 }
3267 }
3268 return(MAX_CLUSTERS - vp->v_clen);
3269 }
3270
3271
3272
3273 static int
3274 cluster_push_x(vp, EOF, first, last, can_delay)
3275 struct vnode *vp;
3276 off_t EOF;
3277 daddr_t first;
3278 daddr_t last;
3279 int can_delay;
3280 {
3281 upl_page_info_t *pl;
3282 upl_t upl;
3283 vm_offset_t upl_offset;
3284 int upl_size;
3285 off_t upl_f_offset;
3286 int pages_in_upl;
3287 int start_pg;
3288 int last_pg;
3289 int io_size;
3290 int io_flags;
3291 int size;
3292 kern_return_t kret;
3293
3294
3295 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3296 vp->v_clen, first, last, EOF, 0);
3297
3298 if ((pages_in_upl = last - first) == 0) {
3299 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3300
3301 return (1);
3302 }
3303 upl_size = pages_in_upl * PAGE_SIZE;
3304 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3305
3306 if (upl_f_offset + upl_size >= EOF) {
3307
3308 if (upl_f_offset >= EOF) {
3309 /*
3310 * must have truncated the file and missed
3311 * clearing a dangling cluster (i.e. it's completely
3312 * beyond the new EOF)
3313 */
3314 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3315
3316 return(1);
3317 }
3318 size = EOF - upl_f_offset;
3319
3320 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3321 pages_in_upl = upl_size / PAGE_SIZE;
3322 } else {
3323 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3324 return(0);
3325 size = upl_size;
3326 }
3327 kret = ubc_create_upl(vp,
3328 upl_f_offset,
3329 upl_size,
3330 &upl,
3331 &pl,
3332 UPL_RET_ONLY_DIRTY);
3333 if (kret != KERN_SUCCESS)
3334 panic("cluster_push: failed to get pagelist");
3335
3336 if (can_delay) {
3337 int num_of_dirty;
3338
3339 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3340 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3341 num_of_dirty++;
3342 }
3343 if (num_of_dirty < pages_in_upl / 2) {
3344 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3345
3346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3347
3348 return(0);
3349 }
3350 }
3351 last_pg = 0;
3352
3353 while (size) {
3354
3355 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3356 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3357 break;
3358 }
3359 if (start_pg > last_pg) {
3360 io_size = (start_pg - last_pg) * PAGE_SIZE;
3361
3362 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3363 UPL_ABORT_FREE_ON_EMPTY);
3364
3365 if (io_size < size)
3366 size -= io_size;
3367 else
3368 break;
3369 }
3370 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3371 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3372 break;
3373 }
3374 upl_offset = start_pg * PAGE_SIZE;
3375
3376 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3377
3378 if (vp->v_flag & VNOCACHE_DATA)
3379 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3380 else
3381 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3382
3383 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3384 vp->v_flag |= VTHROTTLED;
3385 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3386 }
3387 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3388
3389 size -= io_size;
3390 }
3391 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3392
3393 return(1);
3394 }
3395
3396
3397
3398 static int
3399 cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3400 {
3401 struct iovec *iov;
3402 upl_page_info_t *pl;
3403 upl_t upl;
3404 addr64_t ubc_paddr;
3405 kern_return_t kret;
3406 int error = 0;
3407
3408 iov = uio->uio_iov;
3409
3410 kret = ubc_create_upl(vp,
3411 uio->uio_offset & ~PAGE_MASK_64,
3412 PAGE_SIZE,
3413 &upl,
3414 &pl,
3415 UPL_FLAGS_NONE);
3416
3417 if (kret != KERN_SUCCESS)
3418 return(EINVAL);
3419
3420 if (!upl_valid_page(pl, 0)) {
3421 /*
3422 * issue a synchronous read to cluster_io
3423 */
3424 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3425 CL_READ, (struct buf *)0, (struct clios *)0);
3426 if (error) {
3427 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3428
3429 return(error);
3430 }
3431 }
3432 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
3433
3434 /*
3435 * NOTE: There is no prototype for the following in BSD. It, and the definitions
3436 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
3437 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
3438 * way to do so without exporting them to kexts as well.
3439 */
3440 if (flags & CL_READ)
3441 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
3442 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
3443 else
3444 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
3445 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
3446
3447 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3448 /*
3449 * issue a synchronous write to cluster_io
3450 */
3451 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3452 0, (struct buf *)0, (struct clios *)0);
3453 }
3454 if (error == 0) {
3455 uio->uio_offset += xsize;
3456 iov->iov_base += xsize;
3457 iov->iov_len -= xsize;
3458 uio->uio_resid -= xsize;
3459 }
3460 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3461
3462 return (error);
3463 }