1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_PAGEOUT 0x10
77 #define CL_AGE 0x20
78 #define CL_DUMP 0x40
79 #define CL_NOZERO 0x80
80 #define CL_PAGEIN 0x100
81 #define CL_DEV_MEMORY 0x200
82 #define CL_PRESERVE 0x400
83
84
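/*
 * a 'struct clios' tracks a stream of asynchronous cluster I/Os issued
 * against a single request: cluster_io bumps io_issued as each chunk is
 * queued, cluster_iodone bumps io_completed (and latches the first error
 * seen in io_error) as transactions finish, and a waiter that has set
 * io_wanted is woken whenever this state changes
 */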
85 struct clios {
86 u_int io_completed; /* amount of io that has currently completed */
87 u_int io_issued; /* amount of io that was successfully issued */
88 int io_error; /* error code of first error encountered */
89 int io_wanted; /* someone is sleeping waiting for a change in state */
90 };
91
92
93 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
94 int size, struct buf *bp);
95 static int cluster_read_x(struct vnode *vp, struct uio *uio,
96 off_t filesize, int devblocksize, int flags);
97 static int cluster_write_x(struct vnode *vp, struct uio *uio,
98 off_t oldEOF, off_t newEOF, off_t headOff,
99 off_t tailOff, int devblocksize, int flags);
100 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
101 off_t filesize, int devblocksize, int flags);
102 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
103 off_t newEOF, int devblocksize, int flags);
104 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
105 off_t filesize, int devblocksize, int flags);
106 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
107 off_t newEOF, int devblocksize, int flags);
108 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
109 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
110 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
111 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
112
113
114 /*
115 * throttle the number of async writes that
116 * can be outstanding on a single vnode
117 * before we issue a synchronous write
118 */
119 #define ASYNC_THROTTLE 9
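/*
 * the throttle is enforced with a simple sleep/wakeup pair on
 * v_numoutput; a writer that finds too many writes outstanding blocks
 * roughly like this (see cluster_pageout below):
 *
 *	while (vp->v_numoutput >= ASYNC_THROTTLE) {
 *		vp->v_flag |= VTHROTTLED;
 *		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
 *	}
 *
 * and cluster_iodone issues the matching wakeup once the count of
 * outstanding writes has drained to a third of the limit
 */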
120
121 static int
122 cluster_iodone(bp)
123 struct buf *bp;
124 {
125 int b_flags;
126 int error;
127 int total_size;
128 int total_resid;
129 int upl_offset;
130 int zero_offset;
131 upl_t upl;
132 struct buf *cbp;
133 struct buf *cbp_head;
134 struct buf *cbp_next;
135 struct buf *real_bp;
136 struct vnode *vp;
137 struct clios *iostate;
138 int commit_size;
139 int pg_offset;
140
141
142 cbp_head = (struct buf *)(bp->b_trans_head);
143
144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
145 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
146
147 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
148 /*
149 * all I/O requests that are part of this transaction
150 * have to complete before we can process it
151 */
152 if ( !(cbp->b_flags & B_DONE)) {
153
154 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
155 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
156
157 return 0;
158 }
159 }
160 error = 0;
161 total_size = 0;
162 total_resid = 0;
163
164 cbp = cbp_head;
165 upl_offset = cbp->b_uploffset;
166 upl = cbp->b_pagelist;
167 b_flags = cbp->b_flags;
168 real_bp = cbp->b_real_bp;
169 vp = cbp->b_vp;
170 zero_offset= cbp->b_validend;
171 iostate = (struct clios *)cbp->b_iostate;
172
173 while (cbp) {
174 if (cbp->b_vectorcount > 1)
175 _FREE(cbp->b_vectorlist, M_SEGMENT);
176
177 if ((cbp->b_flags & B_ERROR) && error == 0)
178 error = cbp->b_error;
179
180 total_resid += cbp->b_resid;
181 total_size += cbp->b_bcount;
182
183 cbp_next = cbp->b_trans_next;
184
185 free_io_buf(cbp);
186
187 cbp = cbp_next;
188 }
189 if (zero_offset)
190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
191
192 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
193 vp->v_flag &= ~VTHROTTLED;
194 wakeup((caddr_t)&vp->v_numoutput);
195 }
196 if (iostate) {
197 /*
198 * someone has issued multiple I/Os asynchronously
199 * and is waiting for them to complete (streaming)
200 */
201 if (error && iostate->io_error == 0)
202 iostate->io_error = error;
203
204 iostate->io_completed += total_size;
205
206 if (iostate->io_wanted) {
207 /*
208 * someone is waiting for the state of
209 * this io stream to change
210 */
211 iostate->io_wanted = 0;
212 wakeup((caddr_t)&iostate->io_wanted);
213 }
214 }
215 if ((b_flags & B_NEED_IODONE) && real_bp) {
216 if (error) {
217 real_bp->b_flags |= B_ERROR;
218 real_bp->b_error = error;
219 }
220 real_bp->b_resid = total_resid;
221
222 biodone(real_bp);
223 }
224 if (error == 0 && total_resid)
225 error = EIO;
226
227 if (b_flags & B_COMMIT_UPL) {
228 pg_offset = upl_offset & PAGE_MASK;
229 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
230
231 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
232 int upl_abort_code;
233
234 if (b_flags & B_PHYS)
235 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
236 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
237 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
238 else if (b_flags & B_PGIN)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
240 else
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
242
243 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
244 upl_abort_code);
245
246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
247 (int)upl, upl_offset - pg_offset, commit_size,
248 0x80000000|upl_abort_code, 0);
249
250 } else {
251 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
252
253 if (b_flags & B_PHYS)
254 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
255 else if ( !(b_flags & B_PAGEOUT))
256 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
257 if (b_flags & B_AGE)
258 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
259
260 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
261 upl_commit_flags);
262
263 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
264 (int)upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags, 0);
266 }
267 } else
268 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
269 (int)upl, upl_offset, 0, error, 0);
270
271 return (error);
272 }
273
274
275 static void
276 cluster_zero(upl, upl_offset, size, bp)
277 upl_t upl;
278 vm_offset_t upl_offset;
279 int size;
280 struct buf *bp;
281 {
282 vm_offset_t io_addr = 0;
283 int must_unmap = 0;
284 kern_return_t kret;
285
286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
287 upl_offset, size, (int)bp, 0, 0);
288
289 if (bp == NULL || bp->b_data == NULL) {
290 kret = ubc_upl_map(upl, &io_addr);
291
292 if (kret != KERN_SUCCESS)
293 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
294 if (io_addr == 0)
295 panic("cluster_zero: ubc_upl_map() mapped 0");
296
297 must_unmap = 1;
298 } else
299 io_addr = (vm_offset_t)bp->b_data;
300 bzero((caddr_t)(io_addr + upl_offset), size);
301
302 if (must_unmap) {
303 kret = ubc_upl_unmap(upl);
304
305 if (kret != KERN_SUCCESS)
306 panic("cluster_zero: kernel_upl_unmap failed");
307 }
308 }
309
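/*
 * cluster_io carves the byte range [f_offset, f_offset + size) of 'upl'
 * into one or more buf's (using VOP_CMAP to find contiguous disk extents),
 * chains them into a transaction via b_trans_head/b_trans_next and issues
 * them with VOP_STRATEGY; cluster_iodone finishes the transaction and,
 * when CL_COMMIT was passed, commits or aborts the upl.  a typical
 * page-in request, roughly as issued by cluster_pagein below, looks like:
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *			   devblocksize, CL_READ | CL_PAGEIN | CL_COMMIT,
 *			   (struct buf *)0, (struct clios *)0);
 */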
310 static int
311 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
312 struct vnode *vp;
313 upl_t upl;
314 vm_offset_t upl_offset;
315 off_t f_offset;
316 int non_rounded_size;
317 int devblocksize;
318 int flags;
319 struct buf *real_bp;
320 struct clios *iostate;
321 {
322 struct buf *cbp;
323 struct iovec *iovp;
324 u_int size;
325 u_int io_size;
326 int io_flags;
327 int error = 0;
328 int retval = 0;
329 struct buf *cbp_head = 0;
330 struct buf *cbp_tail = 0;
331 upl_page_info_t *pl;
332 int buf_count = 0;
333 int pg_count;
334 int pg_offset;
335 u_int max_iosize;
336 u_int max_vectors;
337 int priv;
338 int zero_offset = 0;
339 u_int first_lblkno;
340
341 if (flags & CL_READ) {
342 io_flags = (B_VECTORLIST | B_READ);
343
344 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
345 } else {
346 io_flags = (B_VECTORLIST | B_WRITEINPROG);
347
348 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
349 }
350 pl = ubc_upl_pageinfo(upl);
351
352 if (flags & CL_AGE)
353 io_flags |= B_AGE;
354 if (flags & CL_DUMP)
355 io_flags |= B_NOCACHE;
356 if (flags & CL_PAGEIN)
357 io_flags |= B_PGIN;
358 if (flags & CL_PAGEOUT)
359 io_flags |= B_PAGEOUT;
360 if (flags & CL_COMMIT)
361 io_flags |= B_COMMIT_UPL;
362 if (flags & CL_PRESERVE)
363 io_flags |= B_PHYS;
364
365 if (devblocksize)
366 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
367 else
368 size = non_rounded_size;
369
370
371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
372 (int)f_offset, size, upl_offset, flags, 0);
373
374 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
375 /*
376 * we are going to end up with a page that we can't complete
377 * (the file size wasn't a multiple of PAGE_SIZE and we're
378 * trying to read to the end of the file), so we'll go ahead
379 * and zero out the portion of the page we can't
380 * read in from the file
381 */
382 zero_offset = upl_offset + non_rounded_size;
383 }
384 while (size) {
385 int vsize;
386 int i;
387 int pl_index;
388 int pg_resid;
389 int num_contig;
390 daddr_t lblkno;
391 daddr_t blkno;
392
393 if (size > max_iosize)
394 io_size = max_iosize;
395 else
396 io_size = size;
397
398 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
399 if (error == EOPNOTSUPP)
400 panic("VOP_CMAP Unimplemented");
401 break;
402 }
403
404 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
405 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
406
407 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
408 if (flags & CL_PAGEOUT) {
409 error = EINVAL;
410 break;
411 };
412
413 /* Try paging out the page individually before
414 giving up entirely and dumping it (it could
415 be mapped in a "hole" and require allocation
416 before the I/O).
417 */
418 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
419 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
420 error = EINVAL;
421 break;
422 };
423
424 upl_offset += PAGE_SIZE_64;
425 f_offset += PAGE_SIZE_64;
426 size -= PAGE_SIZE_64;
427 continue;
428 }
429 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
430 /*
431 * we have now figured out how much I/O we can do - this is in 'io_size'
432 * pl_index represents the first page in the 'upl' that the I/O will occur for
433 * pg_offset is the starting point in the first page for the I/O
434 * pg_count is the number of full and partial pages that 'io_size' encompasses
435 */
436 pl_index = upl_offset / PAGE_SIZE;
437 pg_offset = upl_offset & PAGE_MASK;
438 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
439
440 if (flags & CL_DEV_MEMORY) {
441 /*
442 * currently, can't deal with reading 'holes' in file
443 */
444 if ((long)blkno == -1) {
445 error = EINVAL;
446 break;
447 }
448 /*
449 * treat physical requests as one 'giant' page
450 */
451 pg_count = 1;
452 }
453 if ((flags & CL_READ) && (long)blkno == -1) {
454 int bytes_to_zero;
455
456 /*
457 * if we're reading and blkno == -1, then we've got a
458 * 'hole' in the file that we need to deal with by zeroing
459 * out the affected area in the upl
460 */
461 if (zero_offset && io_size == size) {
462 /*
463 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
464 * then 'zero_offset' will be non-zero
465 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
466 * (indicated by the io_size finishing off the I/O request for this UPL)
467 * then we're not going to issue an I/O for the
468 * last page in this upl... we need to zero both the hole and the tail
469 * of the page beyond the EOF, since the delayed zero-fill won't kick in
470 */
471 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
472
473 zero_offset = 0;
474 } else
475 bytes_to_zero = io_size;
476
477 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
478
479 if (cbp_head)
480 /*
481 * if there is a current I/O chain pending
482 * then the first page of the group we just zero'd
483 * will be handled by the I/O completion if the zero
484 * fill started in the middle of the page
485 */
486 pg_count = (io_size - pg_offset) / PAGE_SIZE;
487 else {
488 /*
489 * no pending I/O to pick up that first page
490 * so, we have to make sure it gets committed
491 * here.
492 * set the pg_offset to 0 so that the upl_commit_range
493 * starts with this page
494 */
495 pg_count = (io_size + pg_offset) / PAGE_SIZE;
496 pg_offset = 0;
497 }
498 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
499 /*
500 * if we're done with the request for this UPL
501 * then we have to make sure to commit the last page
502 * even if we only partially zero-filled it
503 */
504 pg_count++;
505
506 if (pg_count) {
507 if (pg_offset)
508 pg_resid = PAGE_SIZE - pg_offset;
509 else
510 pg_resid = 0;
511
512 if (flags & CL_COMMIT)
513 ubc_upl_commit_range(upl,
514 (upl_offset + pg_resid) & ~PAGE_MASK,
515 pg_count * PAGE_SIZE,
516 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
517 }
518 upl_offset += io_size;
519 f_offset += io_size;
520 size -= io_size;
521
522 if (cbp_head && pg_count)
523 goto start_io;
524 continue;
525
526 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
527 real_bp->b_blkno = blkno;
528 }
529
530 if (pg_count > 1) {
531 if (pg_count > max_vectors) {
532 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
533
534 if (io_size < 0) {
535 io_size = PAGE_SIZE - pg_offset;
536 pg_count = 1;
537 } else
538 pg_count = max_vectors;
539 }
540 /*
541 * we need to allocate space for the vector list
542 */
543 if (pg_count > 1) {
544 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
545 M_SEGMENT, M_NOWAIT);
546
547 if (iovp == (struct iovec *) 0) {
548 /*
549 * if the allocation fails, then throttle down to a single page
550 */
551 io_size = PAGE_SIZE - pg_offset;
552 pg_count = 1;
553 }
554 }
555 }
556
557 /* Throttle the speculative IO */
558 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
559 priv = 0;
560 else
561 priv = 1;
562
563 cbp = alloc_io_buf(vp, priv);
564
565 if (pg_count == 1)
566 /*
567 * we use the io vector that's reserved in the buffer header
568 * this ensures we can always issue an I/O even in a low memory
569 * condition that prevents the _MALLOC from succeeding... this
570 * is necessary to prevent deadlocks with the pager
571 */
572 iovp = (struct iovec *)(&cbp->b_vects[0]);
573
574 cbp->b_vectorlist = (void *)iovp;
575 cbp->b_vectorcount = pg_count;
576
577 if (flags & CL_DEV_MEMORY) {
578
579 iovp->iov_len = io_size;
580 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
581
582 if (iovp->iov_base == (caddr_t) 0) {
583 free_io_buf(cbp);
584 error = EINVAL;
585 } else
586 iovp->iov_base += upl_offset;
587 } else {
588
589 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
590 int psize;
591
592 psize = PAGE_SIZE - pg_offset;
593
594 if (psize > vsize)
595 psize = vsize;
596
597 iovp->iov_len = psize;
598 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
599
600 if (iovp->iov_base == (caddr_t) 0) {
601 if (pg_count > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 free_io_buf(cbp);
604
605 error = EINVAL;
606 break;
607 }
608 iovp->iov_base += pg_offset;
609 pg_offset = 0;
610
611 if (flags & CL_PAGEOUT) {
612 int s;
613 struct buf *bp;
614
615 s = splbio();
616 if (bp = incore(vp, lblkno + i)) {
617 if (!ISSET(bp->b_flags, B_BUSY)) {
618 bremfree(bp);
619 SET(bp->b_flags, (B_BUSY | B_INVAL));
620 splx(s);
621 brelse(bp);
622 } else
623 panic("BUSY bp found in cluster_io");
624 }
625 splx(s);
626 }
627 vsize -= psize;
628 }
629 }
630 if (error)
631 break;
632
633 if (flags & CL_ASYNC) {
634 cbp->b_flags |= (B_CALL | B_ASYNC);
635 cbp->b_iodone = (void *)cluster_iodone;
636 }
637 cbp->b_flags |= io_flags;
638
639 cbp->b_lblkno = lblkno;
640 cbp->b_blkno = blkno;
641 cbp->b_bcount = io_size;
642 cbp->b_pagelist = upl;
643 cbp->b_uploffset = upl_offset;
644 cbp->b_trans_next = (struct buf *)0;
645
646 if (cbp->b_iostate = (void *)iostate)
647 /*
648 * caller wants to track the state of this
649 * io... bump the amount issued against this stream
650 */
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685 start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696 * finish on a page boundary, then we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 for (cbp = cbp_head; cbp;) {
704 struct buf * cbp_next;
705
706 if (io_flags & B_WRITEINPROG)
707 cbp->b_vp->v_numoutput++;
708
709 cbp_next = cbp->b_trans_next;
710
711 (void) VOP_STRATEGY(cbp);
712 cbp = cbp_next;
713 }
714 if ( !(flags & CL_ASYNC)) {
715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
716 biowait(cbp);
717
718 if (error = cluster_iodone(cbp_head)) {
719 if ((flags & CL_PAGEOUT) && (error == ENXIO))
720 retval = 0; /* drop the error */
721 else
722 retval = error;
723 error = 0;
724 }
725 }
726 cbp_head = (struct buf *)0;
727 cbp_tail = (struct buf *)0;
728
729 buf_count = 0;
730 }
731 }
732 if (error) {
733 int abort_size;
734
735 io_size = 0;
736
737 for (cbp = cbp_head; cbp;) {
738 struct buf * cbp_next;
739
740 if (cbp->b_vectorcount > 1)
741 _FREE(cbp->b_vectorlist, M_SEGMENT);
742 upl_offset -= cbp->b_bcount;
743 size += cbp->b_bcount;
744 io_size += cbp->b_bcount;
745
746 cbp_next = cbp->b_trans_next;
747 free_io_buf(cbp);
748 cbp = cbp_next;
749 }
750 if (iostate) {
751 /*
752 * update the error condition for this stream
753 * since we never really issued the io
754 * just go ahead and adjust it back
755 */
756 if (iostate->io_error == 0)
757 iostate->io_error = error;
758 iostate->io_issued -= io_size;
759
760 if (iostate->io_wanted) {
761 /*
762 * someone is waiting for the state of
763 * this io stream to change
764 */
765 iostate->io_wanted = 0;
766 wakeup((caddr_t)&iostate->io_wanted);
767 }
768 }
769 pg_offset = upl_offset & PAGE_MASK;
770 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
771
772 if (flags & CL_COMMIT) {
773 int upl_abort_code;
774
775 if (flags & CL_PRESERVE)
776 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
777 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
778 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
779 else if (flags & CL_PAGEIN)
780 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
781 else
782 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
783
784 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
785 upl_abort_code);
786
787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
788 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
789 }
790 if (real_bp) {
791 real_bp->b_flags |= B_ERROR;
792 real_bp->b_error = error;
793
794 biodone(real_bp);
795 }
796 if (retval == 0)
797 retval = error;
798 }
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
800 (int)f_offset, size, upl_offset, retval, 0);
801
802 return (retval);
803 }
804
805
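/*
 * cluster_rd_prefetch: kick off an advisory read of up to
 * MAX_UPL_TRANSFER pages starting at f_offset, clipped to the EOF and
 * skipping over any leading pages that are already resident in the
 * ubc; returns the number of pages covered so that cluster_rd_ahead
 * can advance v_maxra
 */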
806 static int
807 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
808 struct vnode *vp;
809 off_t f_offset;
810 u_int size;
811 off_t filesize;
812 int devblocksize;
813 {
814 int pages_to_fetch;
815 int skipped_pages;
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
818 (int)f_offset, size, (int)filesize, 0, 0);
819
820 if (f_offset >= filesize) {
821 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
822 (int)f_offset, 0, 0, 0, 0);
823 return(0);
824 }
825 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
826 size = MAX_UPL_TRANSFER * PAGE_SIZE;
827 else
828 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
829
830 if ((off_t)size > (filesize - f_offset))
831 size = filesize - f_offset;
832
833 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
834
835 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
836 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
837 break;
838 f_offset += PAGE_SIZE;
839 size -= PAGE_SIZE;
840 }
841 if (skipped_pages < pages_to_fetch)
842 advisory_read(vp, filesize, f_offset, size, devblocksize);
843
844 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
845 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
846
847 return (pages_to_fetch);
848 }
849
850
851
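/*
 * cluster_rd_ahead: sequential read-ahead driver.  if the current
 * request doesn't follow on from v_lastr (or from the last block
 * already prefetched, v_maxra), the read-ahead window is reset;
 * otherwise v_ralen is doubled, up to MAX_UPL_TRANSFER pages, and
 * cluster_rd_prefetch is called for the blocks just beyond
 * max(e_lblkno, v_maxra)
 */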
852 static void
853 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
854 struct vnode *vp;
855 daddr_t b_lblkno;
856 daddr_t e_lblkno;
857 off_t filesize;
858 int devblocksize;
859 {
860 daddr_t r_lblkno;
861 off_t f_offset;
862 int size_of_prefetch;
863 int max_pages;
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
866 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
867
868 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
870 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
871 return;
872 }
873
874 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
875 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
876 vp->v_ralen = 0;
877 vp->v_maxra = 0;
878
879 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
880 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
881
882 return;
883 }
884 max_pages = MAX_UPL_TRANSFER;
885
886 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
887
888 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
889 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
890
891 if (e_lblkno < vp->v_maxra) {
892 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
893
894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
895 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
896 return;
897 }
898 }
899 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
900 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
901
902 if (f_offset < filesize) {
903 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
904
905 if (size_of_prefetch)
906 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
907 }
908 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
909 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
910 }
911
912 int
913 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
914 struct vnode *vp;
915 upl_t upl;
916 vm_offset_t upl_offset;
917 off_t f_offset;
918 int size;
919 off_t filesize;
920 int devblocksize;
921 int flags;
922 {
923 int io_size;
924 int pg_size;
925 off_t max_size;
926 int local_flags = CL_PAGEOUT;
927
928 if ((flags & UPL_IOSYNC) == 0)
929 local_flags |= CL_ASYNC;
930 if ((flags & UPL_NOCOMMIT) == 0)
931 local_flags |= CL_COMMIT;
932
933
934 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
935 (int)f_offset, size, (int)filesize, local_flags, 0);
936
937 /*
938 * If they didn't specify any I/O, then we are done...
939 * we can't issue an abort because we don't know how
940 * big the upl really is
941 */
942 if (size <= 0)
943 return (EINVAL);
944
945 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
946 if (local_flags & CL_COMMIT)
947 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
948 return (EROFS);
949 }
950 /*
951 * can't page-out to a negative offset
952 * or if we're starting beyond the EOF
953 * or if the file offset isn't page aligned
954 * or the size requested isn't a multiple of PAGE_SIZE
955 */
956 if (f_offset < 0 || f_offset >= filesize ||
957 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
958 if (local_flags & CL_COMMIT)
959 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
960 return (EINVAL);
961 }
962 max_size = filesize - f_offset;
963
964 if (size < max_size)
965 io_size = size;
966 else
967 io_size = max_size;
968
969 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
970
971 if (size > pg_size) {
972 if (local_flags & CL_COMMIT)
973 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
974 UPL_ABORT_FREE_ON_EMPTY);
975 }
976 while (vp->v_numoutput >= ASYNC_THROTTLE) {
977 vp->v_flag |= VTHROTTLED;
978 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
979 }
980
981 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
982 local_flags, (struct buf *)0, (struct clios *)0));
983 }
984
985 int
986 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
987 struct vnode *vp;
988 upl_t upl;
989 vm_offset_t upl_offset;
990 off_t f_offset;
991 int size;
992 off_t filesize;
993 int devblocksize;
994 int flags;
995 {
996 u_int io_size;
997 int rounded_size;
998 off_t max_size;
999 int retval;
1000 int local_flags = 0;
1001
1002 if (upl == NULL || size < 0)
1003 panic("cluster_pagein: NULL upl passed in");
1004
1005 if ((flags & UPL_IOSYNC) == 0)
1006 local_flags |= CL_ASYNC;
1007 if ((flags & UPL_NOCOMMIT) == 0)
1008 local_flags |= CL_COMMIT;
1009
1010
1011 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1012 (int)f_offset, size, (int)filesize, local_flags, 0);
1013
1014 /*
1015 * can't page-in from a negative offset
1016 * or if we're starting beyond the EOF
1017 * or if the file offset isn't page aligned
1018 * or the size requested isn't a multiple of PAGE_SIZE
1019 */
1020 if (f_offset < 0 || f_offset >= filesize ||
1021 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1022 if (local_flags & CL_COMMIT)
1023 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1024 return (EINVAL);
1025 }
1026 max_size = filesize - f_offset;
1027
1028 if (size < max_size)
1029 io_size = size;
1030 else
1031 io_size = max_size;
1032
1033 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1034
1035 if (size > rounded_size && (local_flags & CL_COMMIT))
1036 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1037 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1038
1039 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1040 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1041
1042 if (retval == 0) {
1043 int b_lblkno;
1044 int e_lblkno;
1045
1046 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1047 e_lblkno = (int)
1048 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1049
1050 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1051 /*
1052 * we haven't read in the last page of the file yet
1053 * so let's try to read ahead if we're in
1054 * a sequential access pattern
1055 */
1056 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1057 }
1058 vp->v_lastr = e_lblkno;
1059 }
1060 return (retval);
1061 }
1062
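/*
 * cluster_bp: issue a conventional buf (which must already carry a upl
 * in b_pagelist) through cluster_io as a single async transaction
 */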
1063 int
1064 cluster_bp(bp)
1065 struct buf *bp;
1066 {
1067 off_t f_offset;
1068 int flags;
1069
1070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1071 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1072
1073 if (bp->b_pagelist == (upl_t) 0)
1074 panic("cluster_bp: can't handle NULL upl yet\n");
1075 if (bp->b_flags & B_READ)
1076 flags = CL_ASYNC | CL_READ;
1077 else
1078 flags = CL_ASYNC;
1079
1080 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1081
1082 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1083 }
1084
1085 int
1086 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1087 struct vnode *vp;
1088 struct uio *uio;
1089 off_t oldEOF;
1090 off_t newEOF;
1091 off_t headOff;
1092 off_t tailOff;
1093 int devblocksize;
1094 int flags;
1095 {
1096 int prev_resid;
1097 int clip_size;
1098 off_t max_io_size;
1099 struct iovec *iov;
1100 vm_offset_t upl_offset;
1101 int upl_size;
1102 int pages_in_pl;
1103 upl_page_info_t *pl;
1104 int upl_flags;
1105 upl_t upl;
1106 int retval = 0;
1107
1108
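	/*
	 * dispatch: a vnode that is caching data (or a non-user uio) takes
	 * the buffered path in cluster_write_x; otherwise each iovec is
	 * examined and routed to cluster_phys_write (physically contiguous
	 * source), cluster_write_x (small writes or head/tail zero-fill),
	 * or cluster_nocopy_write (large page-aligned writes), clipping
	 * uio_resid so that each helper sees a well-formed request
	 */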
1109 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1110 {
1111 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1112 return(retval);
1113 }
1114
1115 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1116 {
1117 /* we know we have a resid, so this is safe */
1118 iov = uio->uio_iov;
1119 while (iov->iov_len == 0) {
1120 uio->uio_iov++;
1121 uio->uio_iovcnt--;
1122 iov = uio->uio_iov;
1123 }
1124
1125 /*
1126 * We check every vector target and if it is physically
1127 * contiguous space, we skip the sanity checks.
1128 */
1129
1130 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1131 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1132 pages_in_pl = 0;
1133 upl_flags = UPL_QUERY_OBJECT_TYPE;
1134 if ((vm_map_get_upl(current_map(),
1135 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1136 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1137 {
1138 /*
1139 * the user app must have passed in an invalid address
1140 */
1141 return (EFAULT);
1142 }
1143
1144 if (upl_flags & UPL_PHYS_CONTIG)
1145 {
1146 if (flags & IO_HEADZEROFILL)
1147 {
1148 flags &= ~IO_HEADZEROFILL;
1149
1150 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1151 return(retval);
1152 }
1153
1154 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1155
1156 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1157 {
1158 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1159 return(retval);
1160 }
1161 }
1162 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1163 {
1164 /*
1165 * We set a threshold of 4 pages to decide if the nocopy
1166 * write loop is worth the trouble...
1167 * we also come here if we're trying to zero the head and/or tail
1168 * of a partially written page, and the user source is not a physically contiguous region
1169 */
1170 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1171 return(retval);
1172 }
1173 else if (uio->uio_offset & PAGE_MASK_64)
1174 {
1175 /* Bring the write's file offset up to a pagesize boundary */
1176 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1177 if (uio->uio_resid < clip_size)
1178 clip_size = uio->uio_resid;
1179 /*
1180 * Fake the resid going into the cluster_write_x call
1181 * and restore it on the way out.
1182 */
1183 prev_resid = uio->uio_resid;
1184 uio->uio_resid = clip_size;
1185 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187 }
1188 else if ((int)iov->iov_base & PAGE_MASK_64)
1189 {
1190 clip_size = iov->iov_len;
1191 prev_resid = uio->uio_resid;
1192 uio->uio_resid = clip_size;
1193 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1194 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1195 }
1196 else
1197 {
1198 /*
1199 * If we come in here, we know the offset into
1200 * the file is on a pagesize boundary
1201 */
1202
1203 max_io_size = newEOF - uio->uio_offset;
1204 clip_size = uio->uio_resid;
1205 if (iov->iov_len < clip_size)
1206 clip_size = iov->iov_len;
1207 if (max_io_size < clip_size)
1208 clip_size = max_io_size;
1209
1210 if (clip_size < PAGE_SIZE)
1211 {
1212 /*
1213 * Take care of tail end of write in this vector
1214 */
1215 prev_resid = uio->uio_resid;
1216 uio->uio_resid = clip_size;
1217 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1218 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1219 }
1220 else
1221 {
1222 /* round clip_size down to a multiple of pagesize */
1223 clip_size = clip_size & ~(PAGE_MASK);
1224 prev_resid = uio->uio_resid;
1225 uio->uio_resid = clip_size;
1226 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1227 if ((retval == 0) && uio->uio_resid)
1228 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1229 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1230 }
1231 } /* end else */
1232 } /* end while */
1233 return(retval);
1234 }
1235
1236
1237 static int
1238 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1239 struct vnode *vp;
1240 struct uio *uio;
1241 off_t newEOF;
1242 int devblocksize;
1243 int flags;
1244 {
1245 upl_t upl;
1246 upl_page_info_t *pl;
1247 off_t upl_f_offset;
1248 vm_offset_t upl_offset;
1249 off_t max_io_size;
1250 int io_size;
1251 int io_flag;
1252 int upl_size;
1253 int upl_needed_size;
1254 int pages_in_pl;
1255 int upl_flags;
1256 kern_return_t kret;
1257 struct iovec *iov;
1258 int i;
1259 int first = 1;
1260 int force_data_sync;
1261 int error = 0;
1262 struct clios iostate;
1263
1264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1265 (int)uio->uio_offset, (int)uio->uio_resid,
1266 (int)newEOF, devblocksize, 0);
1267
1268 /*
1269 * When we enter this routine, we know
1270 * -- the offset into the file is on a pagesize boundary
1271 * -- the resid is a page multiple
1272 * -- the resid will not exceed iov_len
1273 */
1274 cluster_try_push(vp, newEOF, 0, 1);
1275
1276 iostate.io_completed = 0;
1277 iostate.io_issued = 0;
1278 iostate.io_error = 0;
1279 iostate.io_wanted = 0;
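	/*
	 * all of the async writes issued in the loop below share this
	 * iostate, so roughly two full UPL transfers worth of writes can
	 * be in flight while the next request is being prepared; the
	 * wait_for_writes label drains the stream before we return
	 */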
1280
1281 iov = uio->uio_iov;
1282
1283 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1284 io_size = uio->uio_resid;
1285
1286 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1287 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1288
1289 if (first) {
1290 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1291 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1292 first = 0;
1293 }
1294 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1295 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1296
1297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1298 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1299
1300 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1301 pages_in_pl = 0;
1302 upl_size = upl_needed_size;
1303 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1304 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1305
1306 kret = vm_map_get_upl(current_map(),
1307 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1308 &upl_size,
1309 &upl,
1310 NULL,
1311 &pages_in_pl,
1312 &upl_flags,
1313 force_data_sync);
1314
1315 if (kret != KERN_SUCCESS) {
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1317 0, 0, 0, kret, 0);
1318
1319 /*
1320 * cluster_nocopy_write: failed to get pagelist
1321 *
1322 * we may have already spun some portion of this request
1323 * off as async requests... we need to wait for the I/O
1324 * to complete before returning
1325 */
1326 goto wait_for_writes;
1327 }
1328 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1329 pages_in_pl = upl_size / PAGE_SIZE;
1330
1331 for (i = 0; i < pages_in_pl; i++) {
1332 if (!upl_valid_page(pl, i))
1333 break;
1334 }
1335 if (i == pages_in_pl)
1336 break;
1337
1338 /*
1339 * didn't get all the pages back that we
1340 * needed... release this upl and try again
1341 */
1342 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1343 UPL_ABORT_FREE_ON_EMPTY);
1344 }
1345 if (force_data_sync >= 3) {
1346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1347 i, pages_in_pl, upl_size, kret, 0);
1348
1349 /*
1350 * for some reason, we couldn't acquire a hold on all
1351 * the pages needed in the user's address space
1352 *
1353 * we may have already spun some portion of this request
1354 * off as async requests... we need to wait for the I/O
1355 * to complete before returning
1356 */
1357 goto wait_for_writes;
1358 }
1359
1360 /*
1361 * Consider the possibility that upl_size wasn't satisfied.
1362 */
1363 if (upl_size != upl_needed_size)
1364 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1367 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1368
1369 if (io_size == 0) {
1370 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1371 UPL_ABORT_FREE_ON_EMPTY);
1372
1373 /*
1374 * we may have already spun some portion of this request
1375 * off as async requests... we need to wait for the I/O
1376 * to complete before returning
1377 */
1378 goto wait_for_writes;
1379 }
1380 /*
1381 * Now look for pages already in the cache
1382 * and throw them away.
1383 */
1384
1385 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1386 max_io_size = io_size;
1387
1388 while (max_io_size) {
1389 /*
1390 * Flag UPL_POP_DUMP says if the page is found
1391 * in the page cache it must be thrown away.
1392 */
1393 ubc_page_op(vp,
1394 upl_f_offset,
1395 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1396 0, 0);
1397 max_io_size -= PAGE_SIZE_64;
1398 upl_f_offset += PAGE_SIZE_64;
1399 }
1400 /*
1401 * we want to push out these writes asynchronously so that we can overlap
1402 * the preparation of the next I/O
1403 * if there are already too many outstanding writes
1404 * wait until some complete before issuing the next
1405 */
1406 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1407 iostate.io_wanted = 1;
1408 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1409 }
1410 if (iostate.io_error) {
1411 /*
1412 * one of the earlier writes we issued ran into a hard error
1413 * don't issue any more writes, clean up the UPL
1414 * that was just created but not used, then
1415 * go wait for all writes that are part of this stream
1416 * to complete before returning the error to the caller
1417 */
1418 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1419 UPL_ABORT_FREE_ON_EMPTY);
1420
1421 goto wait_for_writes;
1422 }
1423 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1424
1425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1426 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1427
1428 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1429 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1430
1431 iov->iov_len -= io_size;
1432 iov->iov_base += io_size;
1433 uio->uio_resid -= io_size;
1434 uio->uio_offset += io_size;
1435
1436 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1437 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1438
1439 } /* end while */
1440
1441 wait_for_writes:
1442 /*
1443 * make sure all async writes issued as part of this stream
1444 * have completed before we return
1445 */
1446 while (iostate.io_issued != iostate.io_completed) {
1447 iostate.io_wanted = 1;
1448 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1449 }
1450 if (iostate.io_error)
1451 error = iostate.io_error;
1452
1453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1454 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1455
1456 return (error);
1457 }
1458
1459
1460 static int
1461 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1462 struct vnode *vp;
1463 struct uio *uio;
1464 off_t newEOF;
1465 int devblocksize;
1466 int flags;
1467 {
1468 upl_page_info_t *pl;
1469 vm_offset_t src_paddr;
1470 upl_t upl;
1471 vm_offset_t upl_offset;
1472 int tail_size;
1473 int io_size;
1474 int upl_size;
1475 int upl_needed_size;
1476 int pages_in_pl;
1477 int upl_flags;
1478 kern_return_t kret;
1479 struct iovec *iov;
1480 int error = 0;
1481
1482 /*
1483 * When we enter this routine, we know
1484 * -- the resid will not exceed iov_len
1485 * -- the vector target address is physically contiguous
1486 */
1487 cluster_try_push(vp, newEOF, 0, 1);
1488
1489 iov = uio->uio_iov;
1490 io_size = iov->iov_len;
1491 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1492 upl_needed_size = upl_offset + io_size;
1493
1494 pages_in_pl = 0;
1495 upl_size = upl_needed_size;
1496 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1497 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1498
1499 kret = vm_map_get_upl(current_map(),
1500 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1501 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1502
1503 if (kret != KERN_SUCCESS) {
1504 /*
1505 * cluster_phys_write: failed to get pagelist
1506 * note: return kret here
1507 */
1508 return(EINVAL);
1509 }
1510 /*
1511 * Consider the possibility that upl_size wasn't satisfied.
1512 * This is a failure in the physical memory case.
1513 */
1514 if (upl_size < upl_needed_size) {
1515 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1516 return(EINVAL);
1517 }
1518 pl = ubc_upl_pageinfo(upl);
1519
1520 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1521
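	/*
	 * the transfer may start and/or end part way into a device block;
	 * those unaligned head and tail pieces are handled one at a time
	 * by cluster_align_phys_io, and only the device-block aligned
	 * middle is issued below as a single CL_DEV_MEMORY write
	 */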
1522 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1523 int head_size;
1524
1525 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1526
1527 if (head_size > io_size)
1528 head_size = io_size;
1529
1530 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1531
1532 if (error) {
1533 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1534
1535 return(EINVAL);
1536 }
1537 upl_offset += head_size;
1538 src_paddr += head_size;
1539 io_size -= head_size;
1540 }
1541 tail_size = io_size & (devblocksize - 1);
1542 io_size -= tail_size;
1543
1544 if (io_size) {
1545 /*
1546 * issue a synchronous write to cluster_io
1547 */
1548 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1549 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1550 }
1551 if (error == 0) {
1552 /*
1553 * The cluster_io write completed successfully,
1554 * update the uio structure
1555 */
1556 uio->uio_resid -= io_size;
1557 iov->iov_len -= io_size;
1558 iov->iov_base += io_size;
1559 uio->uio_offset += io_size;
1560 src_paddr += io_size;
1561
1562 if (tail_size)
1563 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1564 }
1565 /*
1566 * just release our hold on the physically contiguous
1567 * region without changing any state
1568 */
1569 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1570
1571 return (error);
1572 }
1573
1574
1575 static int
1576 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1577 struct vnode *vp;
1578 struct uio *uio;
1579 off_t oldEOF;
1580 off_t newEOF;
1581 off_t headOff;
1582 off_t tailOff;
1583 int devblocksize;
1584 int flags;
1585 {
1586 upl_page_info_t *pl;
1587 upl_t upl;
1588 vm_offset_t upl_offset;
1589 int upl_size;
1590 off_t upl_f_offset;
1591 int pages_in_upl;
1592 int start_offset;
1593 int xfer_resid;
1594 int io_size;
1595 int io_flags;
1596 vm_offset_t io_address;
1597 int io_offset;
1598 int bytes_to_zero;
1599 int bytes_to_move;
1600 kern_return_t kret;
1601 int retval = 0;
1602 int uio_resid;
1603 long long total_size;
1604 long long zero_cnt;
1605 off_t zero_off;
1606 long long zero_cnt1;
1607 off_t zero_off1;
1608 daddr_t start_blkno;
1609 daddr_t last_blkno;
1610
1611 if (uio) {
1612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1613 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1614
1615 uio_resid = uio->uio_resid;
1616 } else {
1617 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1618 0, 0, (int)oldEOF, (int)newEOF, 0);
1619
1620 uio_resid = 0;
1621 }
1622 zero_cnt = 0;
1623 zero_cnt1 = 0;
1624
1625 if (flags & IO_HEADZEROFILL) {
1626 /*
1627 * some filesystems (HFS is one) don't support unallocated holes within a file...
1628 * so we zero fill the intervening space between the old EOF and the offset
1629 * where the next chunk of real data begins.... ftruncate will also use this
1630 * routine to zero fill to the new EOF when growing a file... in this case, the
1631 * uio structure will not be provided
1632 */
1633 if (uio) {
1634 if (headOff < uio->uio_offset) {
1635 zero_cnt = uio->uio_offset - headOff;
1636 zero_off = headOff;
1637 }
1638 } else if (headOff < newEOF) {
1639 zero_cnt = newEOF - headOff;
1640 zero_off = headOff;
1641 }
1642 }
1643 if (flags & IO_TAILZEROFILL) {
1644 if (uio) {
1645 zero_off1 = uio->uio_offset + uio->uio_resid;
1646
1647 if (zero_off1 < tailOff)
1648 zero_cnt1 = tailOff - zero_off1;
1649 }
1650 }
1651 if (zero_cnt == 0 && uio == (struct uio *) 0)
1652 {
1653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1654 retval, 0, 0, 0, 0);
1655 return (0);
1656 }
1657
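	/*
	 * at this point zero_cnt/zero_off describe any head zero-fill range
	 * and zero_cnt1/zero_off1 any tail zero-fill range; the loop below
	 * maps a upl covering the head fill, the user data and the tail
	 * fill in pieces of at most MAX_UPL_TRANSFER pages
	 */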
1658 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1659 /*
1660 * for this iteration of the loop, figure out where our starting point is
1661 */
1662 if (zero_cnt) {
1663 start_offset = (int)(zero_off & PAGE_MASK_64);
1664 upl_f_offset = zero_off - start_offset;
1665 } else if (uio_resid) {
1666 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1667 upl_f_offset = uio->uio_offset - start_offset;
1668 } else {
1669 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1670 upl_f_offset = zero_off1 - start_offset;
1671 }
1672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1673 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1674
1675 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1676 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1677
1678 /*
1679 * compute the size of the upl needed to encompass
1680 * the requested write... limit each call to cluster_io
1681 * to the maximum UPL size... cluster_io will clip if
1682 * this exceeds the maximum io_size for the device...
1683 * make sure to account for
1684 * a starting offset that's not page aligned
1685 */
1686 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1687
1688 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1689 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1690
1691 pages_in_upl = upl_size / PAGE_SIZE;
1692 io_size = upl_size - start_offset;
1693
1694 if ((long long)io_size > total_size)
1695 io_size = total_size;
1696
1697 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1698 last_blkno = start_blkno + pages_in_upl;
1699
1700 kret = ubc_create_upl(vp,
1701 upl_f_offset,
1702 upl_size,
1703 &upl,
1704 &pl,
1705 UPL_FLAGS_NONE);
1706 if (kret != KERN_SUCCESS)
1707 panic("cluster_write: failed to get pagelist");
1708
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1710 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1711
1712 if (start_offset && !upl_valid_page(pl, 0)) {
1713 int read_size;
1714
1715 /*
1716 * we're starting in the middle of the first page of the upl
1717 * and the page isn't currently valid, so we're going to have
1718 * to read it in first... this is a synchronous operation
1719 */
1720 read_size = PAGE_SIZE;
1721
1722 if ((upl_f_offset + read_size) > newEOF)
1723 read_size = newEOF - upl_f_offset;
1724
1725 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1726 CL_READ, (struct buf *)0, (struct clios *)0);
1727 if (retval) {
1728 /*
1729 * we had an error during the read which causes us to abort
1730 * the current cluster_write request... before we do, we need
1731 * to release the rest of the pages in the upl without modifying
1732 * their state and mark the failed page in error
1733 */
1734 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1735 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1738 (int)upl, 0, 0, retval, 0);
1739 break;
1740 }
1741 }
1742 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1743 /*
1744 * the last offset we're writing to in this upl does not end on a page
1745 * boundary... if it's not beyond the old EOF, then we'll also need to
1746 * pre-read this page in if it isn't already valid
1747 */
1748 upl_offset = upl_size - PAGE_SIZE;
1749
1750 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1751 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1752 int read_size;
1753
1754 read_size = PAGE_SIZE;
1755
1756 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1757 read_size = newEOF - (upl_f_offset + upl_offset);
1758
1759 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1760 CL_READ, (struct buf *)0, (struct clios *)0);
1761 if (retval) {
1762 /*
1763 * we had an error during the read which causes us to abort
1764 * the current cluster_write request... before we do, we
1765 * need to release the rest of the pages in the upl without
1766 * modifying their state and mark the failed page in error
1767 */
1768 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1769 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1770
1771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1772 (int)upl, 0, 0, retval, 0);
1773 break;
1774 }
1775 }
1776 }
1777 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1778 panic("cluster_write: ubc_upl_map failed\n");
1779 xfer_resid = io_size;
1780 io_offset = start_offset;
1781
1782 while (zero_cnt && xfer_resid) {
1783
1784 if (zero_cnt < (long long)xfer_resid)
1785 bytes_to_zero = zero_cnt;
1786 else
1787 bytes_to_zero = xfer_resid;
1788
1789 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1790 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1791
1792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1793 (int)upl_f_offset + io_offset, bytes_to_zero,
1794 (int)io_offset, xfer_resid, 0);
1795 } else {
1796 int zero_pg_index;
1797
1798 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1799 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1800
1801 if ( !upl_valid_page(pl, zero_pg_index)) {
1802 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1803
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1805 (int)upl_f_offset + io_offset, bytes_to_zero,
1806 (int)io_offset, xfer_resid, 0);
1807
1808 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1809 !upl_dirty_page(pl, zero_pg_index)) {
1810 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1811
1812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1813 (int)upl_f_offset + io_offset, bytes_to_zero,
1814 (int)io_offset, xfer_resid, 0);
1815 }
1816 }
1817 xfer_resid -= bytes_to_zero;
1818 zero_cnt -= bytes_to_zero;
1819 zero_off += bytes_to_zero;
1820 io_offset += bytes_to_zero;
1821 }
1822 if (xfer_resid && uio_resid) {
1823 bytes_to_move = min(uio_resid, xfer_resid);
1824
1825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1826 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1827
1828 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1829
1830
1831 if (retval) {
1832 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1833 panic("cluster_write: kernel_upl_unmap failed\n");
1834
1835 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1836
1837 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1838 (int)upl, 0, 0, retval, 0);
1839 } else {
1840 uio_resid -= bytes_to_move;
1841 xfer_resid -= bytes_to_move;
1842 io_offset += bytes_to_move;
1843 }
1844 }
1845 while (xfer_resid && zero_cnt1 && retval == 0) {
1846
1847 if (zero_cnt1 < (long long)xfer_resid)
1848 bytes_to_zero = zero_cnt1;
1849 else
1850 bytes_to_zero = xfer_resid;
1851
1852 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1853 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1854
1855 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1856 (int)upl_f_offset + io_offset,
1857 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1858 } else {
1859 int zero_pg_index;
1860
1861 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1862 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1863
1864 if ( !upl_valid_page(pl, zero_pg_index)) {
1865 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1866
1867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1868 (int)upl_f_offset + io_offset,
1869 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1870
1871 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1872 !upl_dirty_page(pl, zero_pg_index)) {
1873 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1874
1875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1876 (int)upl_f_offset + io_offset,
1877 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1878 }
1879 }
1880 xfer_resid -= bytes_to_zero;
1881 zero_cnt1 -= bytes_to_zero;
1882 zero_off1 += bytes_to_zero;
1883 io_offset += bytes_to_zero;
1884 }
1885
1886 if (retval == 0) {
1887 int cl_index;
1888 int can_delay;
1889
1890 io_size += start_offset;
1891
1892 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1893 /*
1894 * if we're extending the file with this write
1895 * we'll zero fill the rest of the page so that
1896 * if the file gets extended again in such a way as to leave a
1897 * hole starting at this EOF, we'll have zeros in the correct spot
1898 */
1899 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1900
1901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1902 (int)upl_f_offset + io_size,
1903 upl_size - io_size, 0, 0, 0);
1904 }
1905 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1906 panic("cluster_write: kernel_upl_unmap failed\n");
1907
1908 if (flags & IO_SYNC)
1909 /*
1910 * if the IO_SYNC flag is set, then we need to
1911 * bypass any clusters and immediately issue
1912 * the I/O
1913 */
1914 goto issue_io;
1915
1916 if (vp->v_clen == 0)
1917 /*
1918 * no clusters currently present
1919 */
1920 goto start_new_cluster;
1921
1922 /*
1923 * keep track of the overall dirty page
1924 * range we've developed
1925 * in case we have to fall back to the
1926 * VHASDIRTY method of flushing
1927 */
1928 if (vp->v_flag & VHASDIRTY)
1929 goto delay_io;
1930
1931 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1932 /*
1933 * we have an existing cluster... see if this write will extend it nicely
1934 */
1935 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1936 /*
1937 * the current write starts at or after the current cluster
1938 */
1939 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1940 /*
1941 * we have a write that fits entirely
1942 * within the existing cluster limits
1943 */
1944 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1945 /*
1946 * update our idea of where the cluster ends
1947 */
1948 vp->v_clusters[cl_index].last_pg = last_blkno;
1949 break;
1950 }
1951 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1952 /*
1953 * we have a write that starts in the middle of the current cluster
1954 * but extends beyond the cluster's limit
1955 * we'll clip the current cluster if we actually
1956 * overlap with the new write
1957 * and start a new cluster with the current write
1958 */
1959 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1960 vp->v_clusters[cl_index].last_pg = start_blkno;
1961 }
1962 /*
1963 * we also get here for the case where the current write starts
1964 * beyond the limit of the existing cluster
1965 *
1966 * in either case, we'll check the remaining clusters before
1967 * starting a new one
1968 */
1969 } else {
1970 /*
1971 * the current write starts in front of the current cluster
1972 */
1973 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1974 /*
1975 * we can just merge the old cluster
1976 * with the new request and leave it
1977 * in the cache
1978 */
1979 vp->v_clusters[cl_index].start_pg = start_blkno;
1980
1981 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1982 /*
1983 * the current write completely
1984 * envelops the existing cluster
1985 */
1986 vp->v_clusters[cl_index].last_pg = last_blkno;
1987 }
1988 break;
1989 }
1990
1991 /*
1992 * if we were to combine this write with the current cluster
1993 * we would exceed the cluster size limit.... so,
1994 * let's see if there's any overlap of the new I/O with
1995 * the existing cluster...
1996 *
1997 */
1998 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1999 /*
2000 * the current write extends into the existing cluster
2001 * clip the current cluster by moving the start position
2002 * to where the current write ends
2003 */
2004 vp->v_clusters[cl_index].start_pg = last_blkno;
2005 /*
2006 * if we get here, there was no way to merge
2007 * the new I/O with this cluster and
2008 * keep it under our maximum cluster length...
2009 * we'll check the remaining clusters before starting a new one
2010 */
2011 }
2012 }
2013 if (cl_index < vp->v_clen)
2014 /*
2015 * we found an existing cluster that we
2016 * could merge this I/O into
2017 */
2018 goto delay_io;
2019
2020 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2021 /*
2022 * we didn't find an existing cluster to
2023 * merge into, but there's room to start
2024 * a new one
2025 */
2026 goto start_new_cluster;
2027
2028 /*
2029 * no existing cluster to merge with and no
2030 * room to start a new one... we'll try
2031 * pushing the existing ones... if none of
2032 * them are able to be pushed, we'll have
2033 * to fall back on the VHASDIRTY mechanism...
2034 * cluster_try_push will set v_clen to the
2035 * number of remaining clusters if it is
2036 * unable to push all of them
2037 */
2038 if (vp->v_flag & VNOCACHE_DATA)
2039 can_delay = 0;
2040 else
2041 can_delay = 1;
2042
2043 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2044 vp->v_flag |= VHASDIRTY;
2045 goto delay_io;
2046 }
2047 start_new_cluster:
2048 if (vp->v_clen == 0) {
2049 vp->v_ciosiz = devblocksize;
2050 vp->v_cstart = start_blkno;
2051 vp->v_lastw = last_blkno;
2052 }
2053 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2054 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2055 vp->v_clen++;
2056 delay_io:
2057 /*
2058 * make sure we keep v_cstart and v_lastw up to
2059 * date in case we have to fall back on the
2060 * VHASDIRTY mechanism (or we've already entered it)
2061 */
2062 if (start_blkno < vp->v_cstart)
2063 vp->v_cstart = start_blkno;
2064 if (last_blkno > vp->v_lastw)
2065 vp->v_lastw = last_blkno;
2066
2067 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2068 continue;
2069 issue_io:
2070 /*
2071 * in order to maintain some semblance of coherency with mapped writes
2072 * we need to write the cluster back out as a multiple of the PAGESIZE
2073 * unless the cluster encompasses the last page of the file... in this
2074 * case we'll round out to the nearest device block boundary
2075 */
2076 io_size = upl_size;
2077
2078 if ((upl_f_offset + io_size) > newEOF) {
2079 io_size = newEOF - upl_f_offset;
2080 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2081 }
2082
2083 if (flags & IO_SYNC)
2084 io_flags = CL_COMMIT | CL_AGE;
2085 else
2086 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2087
2088 if (vp->v_flag & VNOCACHE_DATA)
2089 io_flags |= CL_DUMP;
2090
2091 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2092 vp->v_flag |= VTHROTTLED;
2093 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2094 }
2095 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2096 io_flags, (struct buf *)0, (struct clios *)0);
2097 }
2098 }
2099 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2100 retval, 0, 0, 0, 0);
2101
2102 return (retval);
2103 }
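/*
 * Illustrative sketch (not part of the build): the cluster merge test used
 * in the write path above, restated over a minimal stand-in structure.  The
 * names example_cluster, example_try_merge and EXAMPLE_MAX_CLUSTER_PAGES are
 * hypothetical; the real code operates on vp->v_clusters[] with a limit of
 * MAX_UPL_TRANSFER pages per cluster.
 */
#if 0
struct example_cluster {
	daddr_t start_pg;	/* first page of the dirty cluster */
	daddr_t last_pg;	/* page just past the last dirty page */
};

#define EXAMPLE_MAX_CLUSTER_PAGES 256	/* stands in for MAX_UPL_TRANSFER */

/*
 * returns 1 if the write [start_blkno, last_blkno) was absorbed by this
 * cluster (possibly after growing it), 0 if the caller must look at the
 * next cluster or start a new one... on 0, the cluster may be clipped so
 * it no longer overlaps the write
 */
static int
example_try_merge(struct example_cluster *cl, daddr_t start_blkno, daddr_t last_blkno)
{
	if (start_blkno >= cl->start_pg) {
		/* the write starts at or after the cluster */
		if (last_blkno <= cl->start_pg + EXAMPLE_MAX_CLUSTER_PAGES) {
			/* fits entirely within the cluster's maximum span */
			if (last_blkno > cl->last_pg)
				cl->last_pg = last_blkno;
			return (1);
		}
		/* spills past the limit... clip the cluster if they overlap */
		if (start_blkno < cl->start_pg + EXAMPLE_MAX_CLUSTER_PAGES &&
		    cl->last_pg > start_blkno)
			cl->last_pg = start_blkno;
		return (0);
	}
	/* the write starts in front of the cluster */
	if ((cl->last_pg - start_blkno) <= EXAMPLE_MAX_CLUSTER_PAGES) {
		/* grow the cluster backwards (and forwards if enveloped) */
		cl->start_pg = start_blkno;
		if (last_blkno > cl->last_pg)
			cl->last_pg = last_blkno;
		return (1);
	}
	/* too big to combine... clip the front of the cluster on overlap */
	if (last_blkno > cl->start_pg)
		cl->start_pg = last_blkno;
	return (0);
}
#endif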
2104
2105 int
2106 cluster_read(vp, uio, filesize, devblocksize, flags)
2107 struct vnode *vp;
2108 struct uio *uio;
2109 off_t filesize;
2110 int devblocksize;
2111 int flags;
2112 {
2113 int prev_resid;
2114 int clip_size;
2115 off_t max_io_size;
2116 struct iovec *iov;
2117 vm_offset_t upl_offset;
2118 int upl_size;
2119 int pages_in_pl;
2120 upl_page_info_t *pl;
2121 int upl_flags;
2122 upl_t upl;
2123 int retval = 0;
2124
2125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2126 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2127
2128 /*
2129 * if this vnode isn't marked for no-cache data or the request
2130 * isn't coming from user space, just use the normal cached read path
2131 */
2132
2133 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2134 {
2135 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2137 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2138 return(retval);
2139 }
2140
2141 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2142 {
2143 /* we know we have a resid, so this is safe */
2144 iov = uio->uio_iov;
2145 while (iov->iov_len == 0) {
2146 uio->uio_iov++;
2147 uio->uio_iovcnt--;
2148 iov = uio->uio_iov;
2149 }
2150
2151 /*
2152 * We check every vector target and if it is physically
2153 * contiguous space, we skip the sanity checks.
2154 */
2155
2156 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2157 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2158 pages_in_pl = 0;
2159 upl_flags = UPL_QUERY_OBJECT_TYPE;
2160 if((vm_map_get_upl(current_map(),
2161 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2162 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2163 {
2164 /*
2165 * the user app must have passed in an invalid address
2166 */
2167 return (EFAULT);
2168 }
2169
2170 if (upl_flags & UPL_PHYS_CONTIG)
2171 {
2172 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2173 }
2174 else if (uio->uio_resid < 4 * PAGE_SIZE)
2175 {
2176 /*
2177 * We set a threshold of 4 pages to decide if the nocopy
2178 * read loop is worth the trouble...
2179 */
2180 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2181 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2182 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2183 return(retval);
2184 }
2185 else if (uio->uio_offset & PAGE_MASK_64)
2186 {
2187 /* Bring the file offset read up to a pagesize boundary */
2188 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2189 if (uio->uio_resid < clip_size)
2190 clip_size = uio->uio_resid;
2191 /*
2192 * Fake the resid going into the cluster_read_x call
2193 * and restore it on the way out.
2194 */
2195 prev_resid = uio->uio_resid;
2196 uio->uio_resid = clip_size;
2197 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2198 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2199 }
2200 else if ((int)iov->iov_base & PAGE_MASK_64)
2201 {
2202 clip_size = iov->iov_len;
2203 prev_resid = uio->uio_resid;
2204 uio->uio_resid = clip_size;
2205 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2206 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2207 }
2208 else
2209 {
2210 /*
2211 * If we come in here, we know the offset into
2212 * the file is on a pagesize boundary
2213 */
2214
2215 max_io_size = filesize - uio->uio_offset;
2216 clip_size = uio->uio_resid;
2217 if (iov->iov_len < clip_size)
2218 clip_size = iov->iov_len;
2219 if (max_io_size < clip_size)
2220 clip_size = (int)max_io_size;
2221
2222 if (clip_size < PAGE_SIZE)
2223 {
2224 /*
2225 * Take care of the tail end of the read in this vector.
2226 */
2227 prev_resid = uio->uio_resid;
2228 uio->uio_resid = clip_size;
2229 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2230 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2231 }
2232 else
2233 {
2234 /* round clip_size down to a multiple of pagesize */
2235 clip_size = clip_size & ~(PAGE_MASK);
2236 prev_resid = uio->uio_resid;
2237 uio->uio_resid = clip_size;
2238 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2239 if ((retval==0) && uio->uio_resid)
2240 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2241 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2242 }
2243 } /* end else */
2244 } /* end while */
2245
2246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2247 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2248
2249 return(retval);
2250 }
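/*
 * Illustrative sketch (not compiled): the resid-clipping pattern used
 * repeatedly above.  The uio is temporarily limited to clip_size bytes so
 * the callee only sees that much work, then the caller's view of the
 * remaining resid is reconstructed from how much the callee consumed.
 * example_clipped_read and its function pointer are hypothetical stand-ins
 * for the calls into cluster_read_x / cluster_nocopy_read.
 */
#if 0
static int
example_clipped_read(struct uio *uio, int clip_size,
		     int (*read_fn)(struct uio *))
{
	int prev_resid;
	int retval;

	prev_resid = uio->uio_resid;
	uio->uio_resid = clip_size;		/* callee sees only the clipped amount */

	retval = read_fn(uio);

	/*
	 * (clip_size - uio->uio_resid) is what the callee actually moved,
	 * so subtract only that from the original resid
	 */
	uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

	return (retval);
}
#endif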
2251
2252
2253 static int
2254 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2255 struct vnode *vp;
2256 struct uio *uio;
2257 off_t filesize;
2258 int devblocksize;
2259 int flags;
2260 {
2261 upl_page_info_t *pl;
2262 upl_t upl;
2263 vm_offset_t upl_offset;
2264 int upl_size;
2265 off_t upl_f_offset;
2266 int start_offset;
2267 int start_pg;
2268 int last_pg;
2269 int uio_last;
2270 int pages_in_upl;
2271 off_t max_size;
2272 int io_size;
2273 vm_offset_t io_address;
2274 kern_return_t kret;
2275 int segflg;
2276 int error = 0;
2277 int retval = 0;
2278 int b_lblkno;
2279 int e_lblkno;
2280
2281 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2282
2283 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2284 /*
2285 * compute the size of the upl needed to encompass
2286 * the requested read... limit each call to cluster_io
2287 * to the maximum UPL size... cluster_io will clip if
2288 * this exceeds the maximum io_size for the device...
2289 * make sure to account for
2290 * a starting offset that's not page aligned
2291 */
2292 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2293 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2294 max_size = filesize - uio->uio_offset;
2295
2296 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2297 io_size = uio->uio_resid;
2298 else
2299 io_size = max_size;
2300
2301 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2302 segflg = uio->uio_segflg;
2303
2304 uio->uio_segflg = UIO_PHYS_USERSPACE;
2305
2306 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2307 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2308
2309 while (io_size && retval == 0) {
2310 int xsize;
2311 vm_offset_t paddr;
2312
2313 if (ubc_page_op(vp,
2314 upl_f_offset,
2315 UPL_POP_SET | UPL_POP_BUSY,
2316 &paddr, 0) != KERN_SUCCESS)
2317 break;
2318
2319 xsize = PAGE_SIZE - start_offset;
2320
2321 if (xsize > io_size)
2322 xsize = io_size;
2323
2324 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2325
2326 ubc_page_op(vp, upl_f_offset,
2327 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2328
2329 io_size -= xsize;
2330 start_offset = (int)
2331 (uio->uio_offset & PAGE_MASK_64);
2332 upl_f_offset = uio->uio_offset - start_offset;
2333 }
2334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2335 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2336
2337 uio->uio_segflg = segflg;
2338
2339 if (retval)
2340 break;
2341
2342 if (io_size == 0) {
2343 /*
2344 * we're already finished with this read request
2345 * let's see if we should do a read-ahead
2346 */
2347 e_lblkno = (int)
2348 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2349
2350 if (!(vp->v_flag & VRAOFF))
2351 /*
2352 * let's try to read ahead if we're in
2353 * a sequential access pattern
2354 */
2355 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2356 vp->v_lastr = e_lblkno;
2357
2358 break;
2359 }
2360 max_size = filesize - uio->uio_offset;
2361 }
2362 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2363 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2364 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2365 pages_in_upl = upl_size / PAGE_SIZE;
2366
2367 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2368 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2369
2370 kret = ubc_create_upl(vp,
2371 upl_f_offset,
2372 upl_size,
2373 &upl,
2374 &pl,
2375 UPL_FLAGS_NONE);
2376 if (kret != KERN_SUCCESS)
2377 panic("cluster_read: failed to get pagelist");
2378
2379 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2380 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2381
2382 /*
2383 * scan from the beginning of the upl looking for the first
2384 * non-valid page.... this will become the first page in
2385 * the request we're going to make to 'cluster_io'... if all
2386 * of the pages are valid, we won't call through to 'cluster_io'
2387 */
2388 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2389 if (!upl_valid_page(pl, start_pg))
2390 break;
2391 }
2392
2393 /*
2394 * scan from the starting invalid page looking for a valid
2395 * page before the end of the upl is reached, if we
2396 * find one, then it will be the last page of the request to
2397 * 'cluster_io'
2398 */
2399 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2400 if (upl_valid_page(pl, last_pg))
2401 break;
2402 }
2403
2404 if (start_pg < last_pg) {
2405 /*
2406 * we found a range of 'invalid' pages that must be filled
2407 * if the last page in this range is the last page of the file
2408 * we may have to clip the size of it to keep from reading past
2409 * the end of the last physical block associated with the file
2410 */
2411 upl_offset = start_pg * PAGE_SIZE;
2412 io_size = (last_pg - start_pg) * PAGE_SIZE;
2413
2414 if ((upl_f_offset + upl_offset + io_size) > filesize)
2415 io_size = filesize - (upl_f_offset + upl_offset);
2416
2417 /*
2418 * issue a synchronous read to cluster_io
2419 */
2420
2421 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2422 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2423 }
2424 if (error == 0) {
2425 /*
2426 * if the read completed successfully, or there was no I/O request
2427 * issued, then map the upl into kernel address space and
2428 * move the data into user land.... we'll first add on any 'valid'
2429 * pages that were present in the upl when we acquired it.
2430 */
2431 u_int val_size;
2432 u_int size_of_prefetch;
2433
2434 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2435 if (!upl_valid_page(pl, uio_last))
2436 break;
2437 }
2438 /*
2439 * compute size to transfer this round, if uio->uio_resid is
2440 * still non-zero after this uiomove, we'll loop around and
2441 * set up for another I/O.
2442 */
2443 val_size = (uio_last * PAGE_SIZE) - start_offset;
2444
2445 if (max_size < val_size)
2446 val_size = max_size;
2447
2448 if (uio->uio_resid < val_size)
2449 val_size = uio->uio_resid;
2450
2451 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2452
2453 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2454 /*
2455 * if there's still I/O left to do for this request, then issue a
2456 * pre-fetch I/O... the I/O wait time will overlap
2457 * with the copying of the data
2458 */
2459 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2460 } else {
2461 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2462 /*
2463 * let's try to read ahead if we're in
2464 * a sequential access pattern
2465 */
2466 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2467 vp->v_lastr = e_lblkno;
2468 }
2469 if (uio->uio_segflg == UIO_USERSPACE) {
2470 int offset;
2471
2472 segflg = uio->uio_segflg;
2473
2474 uio->uio_segflg = UIO_PHYS_USERSPACE;
2475
2476
2477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2478 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2479
2480 offset = start_offset;
2481
2482 while (val_size && retval == 0) {
2483 int csize;
2484 int i;
2485 caddr_t paddr;
2486
2487 i = offset / PAGE_SIZE;
2488 csize = min(PAGE_SIZE - start_offset, val_size);
2489
2490 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2491
2492 retval = uiomove(paddr, csize, uio);
2493
2494 val_size -= csize;
2495 offset += csize;
2496 start_offset = offset & PAGE_MASK;
2497 }
2498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2499 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2500
2501 uio->uio_segflg = segflg;
2502 }
2503 else
2504 {
2505 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2506 panic("cluster_read: ubc_upl_map() failed\n");
2507
2508 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2509
2510 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2511 panic("cluster_read: ubc_upl_unmap() failed\n");
2512 }
2513 }
2514 if (start_pg < last_pg) {
2515 /*
2516 * compute the range of pages that we actually issued an I/O for
2517 * and either commit them as valid if the I/O succeeded
2518 * or abort them if the I/O failed
2519 */
2520 io_size = (last_pg - start_pg) * PAGE_SIZE;
2521
2522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2523 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2524
2525 if (error || (vp->v_flag & VNOCACHE_DATA))
2526 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2527 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2528 else
2529 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2530 UPL_COMMIT_CLEAR_DIRTY
2531 | UPL_COMMIT_FREE_ON_EMPTY
2532 | UPL_COMMIT_INACTIVATE);
2533
2534 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2535 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2536 }
2537 if ((last_pg - start_pg) < pages_in_upl) {
2538 int cur_pg;
2539 int commit_flags;
2540
2541 /*
2542 * the set of pages that we issued an I/O for did not encompass
2543 * the entire upl... so just release these without modifying
2544 * their state
2545 */
2546 if (error)
2547 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2548 else {
2549 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2550 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2551
2552 if (start_pg) {
2553 /*
2554 * we found some already valid pages at the beginning of
2555 * the upl... commit these back to the inactive list with
2556 * reference cleared
2557 */
2558 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2559 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2560 | UPL_COMMIT_INACTIVATE;
2561
2562 if (upl_dirty_page(pl, cur_pg))
2563 commit_flags |= UPL_COMMIT_SET_DIRTY;
2564
2565 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2566 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2567 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2568 else
2569 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2570 PAGE_SIZE, commit_flags);
2571 }
2572 }
2573 if (last_pg < uio_last) {
2574 /*
2575 * we found some already valid pages immediately after the
2576 * pages we issued I/O for, commit these back to the
2577 * inactive list with reference cleared
2578 */
2579 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2580 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2581 | UPL_COMMIT_INACTIVATE;
2582
2583 if (upl_dirty_page(pl, cur_pg))
2584 commit_flags |= UPL_COMMIT_SET_DIRTY;
2585
2586 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2587 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2588 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2589 else
2590 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2591 PAGE_SIZE, commit_flags);
2592 }
2593 }
2594 if (uio_last < pages_in_upl) {
2595 /*
2596 * there were some invalid pages beyond the valid pages
2597 * that we didn't issue an I/O for, just release them
2598 * unchanged
2599 */
2600 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2601 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2602 }
2603
2604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2605 (int)upl, -1, -1, 0, 0);
2606 }
2607 }
2608 if (retval == 0)
2609 retval = error;
2610 }
2611
2612 return (retval);
2613 }
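/*
 * Illustrative sketch (not compiled): how cluster_read_x picks the range of
 * pages it hands to cluster_io... the first non-valid page starts the I/O
 * and the next valid page (or the end of the upl) terminates it.  'valid'
 * is a hypothetical boolean array standing in for upl_valid_page(pl, pg).
 */
#if 0
static void
example_find_io_range(const char *valid, int pages_in_upl,
		      int *start_pg, int *last_pg)
{
	int pg;

	/* first non-valid page becomes the start of the request */
	for (pg = 0; pg < pages_in_upl; pg++)
		if (!valid[pg])
			break;
	*start_pg = pg;

	/* the next valid page (or the end of the upl) terminates it */
	for (; pg < pages_in_upl; pg++)
		if (valid[pg])
			break;
	*last_pg = pg;

	/* if *start_pg == *last_pg every page was already valid... no I/O needed */
}
#endif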
2614
2615
2616 static int
2617 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2618 struct vnode *vp;
2619 struct uio *uio;
2620 off_t filesize;
2621 int devblocksize;
2622 int flags;
2623 {
2624 upl_t upl;
2625 upl_page_info_t *pl;
2626 off_t upl_f_offset;
2627 vm_offset_t upl_offset;
2628 off_t start_upl_f_offset;
2629 off_t max_io_size;
2630 int io_size;
2631 int upl_size;
2632 int upl_needed_size;
2633 int pages_in_pl;
2634 vm_offset_t paddr;
2635 int upl_flags;
2636 kern_return_t kret;
2637 int segflg;
2638 struct iovec *iov;
2639 int i;
2640 int force_data_sync;
2641 int retval = 0;
2642 int first = 1;
2643 struct clios iostate;
2644
2645 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2646 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2647
2648 /*
2649 * When we enter this routine, we know
2650 * -- the offset into the file is on a pagesize boundary
2651 * -- the resid is a page multiple
2652 * -- the resid will not exceed iov_len
2653 */
2654
2655 iostate.io_completed = 0;
2656 iostate.io_issued = 0;
2657 iostate.io_error = 0;
2658 iostate.io_wanted = 0;
2659
2660 iov = uio->uio_iov;
2661
2662 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2663
2664 max_io_size = filesize - uio->uio_offset;
2665
2666 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2667 io_size = max_io_size;
2668 else
2669 io_size = uio->uio_resid;
2670
2671 /*
2672 * We don't come into this routine unless
2673 * UIO_USERSPACE is set.
2674 */
2675 segflg = uio->uio_segflg;
2676
2677 uio->uio_segflg = UIO_PHYS_USERSPACE;
2678
2679 /*
2680 * First look for pages already in the cache
2681 * and move them to user space.
2682 */
2683 while (io_size && (retval == 0)) {
2684 upl_f_offset = uio->uio_offset;
2685
2686 /*
2687 * If this call fails, it means the page is not
2688 * in the page cache.
2689 */
2690 if (ubc_page_op(vp, upl_f_offset,
2691 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2692 break;
2693
2694 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2695
2696 ubc_page_op(vp, upl_f_offset,
2697 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2698
2699 io_size -= PAGE_SIZE;
2700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2701 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2702 }
2703 uio->uio_segflg = segflg;
2704
2705 if (retval) {
2706 /*
2707 * we may have already spun some portion of this request
2708 * off as async requests... we need to wait for the I/O
2709 * to complete before returning
2710 */
2711 goto wait_for_reads;
2712 }
2713 /*
2714 * If we are already finished with this read, then return
2715 */
2716 if (io_size == 0) {
2717 /*
2718 * we may have already spun some portion of this request
2719 * off as async requests... we need to wait for the I/O
2720 * to complete before returning
2721 */
2722 goto wait_for_reads;
2723 }
2724 max_io_size = io_size;
2725
2726 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2727 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2728 if (first) {
2729 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2730 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2731 first = 0;
2732 }
2733 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2734 upl_f_offset = start_upl_f_offset;
2735 io_size = 0;
2736
2737 while (io_size < max_io_size) {
2738 if (ubc_page_op(vp, upl_f_offset,
2739 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2740 ubc_page_op(vp, upl_f_offset,
2741 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2742 break;
2743 }
2744 /*
2745 * Build up the io request parameters.
2746 */
2747 io_size += PAGE_SIZE_64;
2748 upl_f_offset += PAGE_SIZE_64;
2749 }
2750 if (io_size == 0)
2751 /*
2752 * we may have already spun some portion of this request
2753 * off as async requests... we need to wait for the I/O
2754 * to complete before returning
2755 */
2756 goto wait_for_reads;
2757
2758 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2759 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2760
2761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2762 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2763
2764 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2765 pages_in_pl = 0;
2766 upl_size = upl_needed_size;
2767 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2768
2769 kret = vm_map_get_upl(current_map(),
2770 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2771 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2772
2773 if (kret != KERN_SUCCESS) {
2774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2775 (int)upl_offset, upl_size, io_size, kret, 0);
2776
2777 /*
2778 * cluster_nocopy_read: failed to get pagelist
2779 *
2780 * we may have already spun some portion of this request
2781 * off as async requests... we need to wait for the I/O
2782 * to complete before returning
2783 */
2784 goto wait_for_reads;
2785 }
2786 pages_in_pl = upl_size / PAGE_SIZE;
2787 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2788
2789 for (i = 0; i < pages_in_pl; i++) {
2790 if (!upl_valid_page(pl, i))
2791 break;
2792 }
2793 if (i == pages_in_pl)
2794 break;
2795
2796 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2797 UPL_ABORT_FREE_ON_EMPTY);
2798 }
2799 if (force_data_sync >= 3) {
2800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2801 (int)upl_offset, upl_size, io_size, kret, 0);
2802
2803 goto wait_for_reads;
2804 }
2805 /*
2806 * Consider the possibility that upl_size wasn't satisfied.
2807 */
2808 if (upl_size != upl_needed_size)
2809 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2810
2811 if (io_size == 0) {
2812 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2813 UPL_ABORT_FREE_ON_EMPTY);
2814 goto wait_for_reads;
2815 }
2816 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2817 (int)upl_offset, upl_size, io_size, kret, 0);
2818
2819 /*
2820 * request asynchronously so that we can overlap
2821 * the preparation of the next I/O...
2822 * if there are already too many outstanding reads
2823 * wait until some have completed before issuing the next read
2824 */
2825 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2826 iostate.io_wanted = 1;
2827 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2828 }
2829 if (iostate.io_error) {
2830 /*
2831 * one of the earlier reads we issued ran into a hard error
2832 * don't issue any more reads, cleanup the UPL
2833 * that was just created but not used, then
2834 * go wait for any other reads to complete before
2835 * returning the error to the caller
2836 */
2837 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2838 UPL_ABORT_FREE_ON_EMPTY);
2839
2840 goto wait_for_reads;
2841 }
2842 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2843 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2844
2845 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2846 io_size, devblocksize,
2847 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2848 (struct buf *)0, &iostate);
2849
2850 /*
2851 * update the uio structure
2852 */
2853 iov->iov_base += io_size;
2854 iov->iov_len -= io_size;
2855 uio->uio_resid -= io_size;
2856 uio->uio_offset += io_size;
2857
2858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2859 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2860
2861 } /* end while */
2862
2863 wait_for_reads:
2864 /*
2865 * make sure all async reads that are part of this stream
2866 * have completed before we return
2867 */
2868 while (iostate.io_issued != iostate.io_completed) {
2869 iostate.io_wanted = 1;
2870 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2871 }
2872 if (iostate.io_error)
2873 retval = iostate.io_error;
2874
2875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2876 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2877
2878 return (retval);
2879 }
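/*
 * Illustrative sketch (not compiled): the in-flight throttle and final drain
 * used with 'struct clios' above.  example_window stands in for
 * (2 * MAX_UPL_TRANSFER * PAGE_SIZE); the real code sleeps on
 * iostate.io_wanted and is woken by the I/O completion path when
 * io_completed advances.
 */
#if 0
static void
example_throttle_then_drain(struct clios *iostate, int example_window, int drain)
{
	/* while draining we wait for everything issued to complete */
	int limit = drain ? 0 : example_window;

	while ((iostate->io_issued - iostate->io_completed) > limit) {
		iostate->io_wanted = 1;
		tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "example_nocopy", 0);
	}
}
#endif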
2880
2881
2882 static int
2883 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2884 struct vnode *vp;
2885 struct uio *uio;
2886 off_t filesize;
2887 int devblocksize;
2888 int flags;
2889 {
2890 upl_page_info_t *pl;
2891 upl_t upl;
2892 vm_offset_t upl_offset;
2893 vm_offset_t dst_paddr;
2894 off_t max_size;
2895 int io_size;
2896 int tail_size;
2897 int upl_size;
2898 int upl_needed_size;
2899 int pages_in_pl;
2900 int upl_flags;
2901 kern_return_t kret;
2902 struct iovec *iov;
2903 struct clios iostate;
2904 int error;
2905
2906 /*
2907 * When we enter this routine, we know
2908 * -- the resid will not exceed iov_len
2909 * -- the target address is physically contiguous
2910 */
2911
2912 iov = uio->uio_iov;
2913
2914 max_size = filesize - uio->uio_offset;
2915
2916 if (max_size > (off_t)((unsigned int)iov->iov_len))
2917 io_size = iov->iov_len;
2918 else
2919 io_size = max_size;
2920
2921 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2922 upl_needed_size = upl_offset + io_size;
2923
2924 error = 0;
2925 pages_in_pl = 0;
2926 upl_size = upl_needed_size;
2927 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2928
2929 kret = vm_map_get_upl(current_map(),
2930 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2931 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2932
2933 if (kret != KERN_SUCCESS) {
2934 /*
2935 * cluster_phys_read: failed to get pagelist
2936 */
2937 return(EINVAL);
2938 }
2939 if (upl_size < upl_needed_size) {
2940 /*
2941 * The upl_size wasn't satisfied.
2942 */
2943 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2944
2945 return(EINVAL);
2946 }
2947 pl = ubc_upl_pageinfo(upl);
2948
2949 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2950
2951 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2952 int head_size;
2953
2954 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2955
2956 if (head_size > io_size)
2957 head_size = io_size;
2958
2959 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2960
2961 if (error) {
2962 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2963
2964 return(EINVAL);
2965 }
2966 upl_offset += head_size;
2967 dst_paddr += head_size;
2968 io_size -= head_size;
2969 }
2970 tail_size = io_size & (devblocksize - 1);
2971 io_size -= tail_size;
2972
2973 iostate.io_completed = 0;
2974 iostate.io_issued = 0;
2975 iostate.io_error = 0;
2976 iostate.io_wanted = 0;
2977
2978 while (io_size && error == 0) {
2979 int xsize;
2980
2981 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2982 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2983 else
2984 xsize = io_size;
2985 /*
2986 * request asynchronously so that we can overlap
2987 * the preparation of the next I/O... we'll do
2988 * the commit after all the I/O has completed
2989 * since it's all issued against the same UPL...
2990 * if there are already too many outstanding reads
2991 * wait until some have completed before issuing the next
2992 */
2993 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2994 iostate.io_wanted = 1;
2995 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2996 }
2997
2998 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2999 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3000 (struct buf *)0, &iostate);
3001 /*
3002 * The cluster_io read was issued successfully,
3003 * update the uio structure
3004 */
3005 if (error == 0) {
3006 uio->uio_resid -= xsize;
3007 iov->iov_len -= xsize;
3008 iov->iov_base += xsize;
3009 uio->uio_offset += xsize;
3010 dst_paddr += xsize;
3011 upl_offset += xsize;
3012 io_size -= xsize;
3013 }
3014 }
3015 /*
3016 * make sure all async reads that are part of this stream
3017 * have completed before we proceed
3018 */
3019 while (iostate.io_issued != iostate.io_completed) {
3020 iostate.io_wanted = 1;
3021 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3022 }
3023 if (iostate.io_error) {
3024 error = iostate.io_error;
3025 }
3026 if (error == 0 && tail_size)
3027 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3028
3029 /*
3030 * just release our hold on the physically contiguous
3031 * region without changing any state
3032 */
3033 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3034
3035 return (error);
3036 }
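/*
 * Illustrative sketch (not compiled): a simplified version of how the
 * physically contiguous transfer above is split into an unaligned head,
 * a device-block aligned middle, and an unaligned tail relative to
 * devblocksize (assumed to be a power of two).  The head and tail pieces
 * go through cluster_align_phys_io; the names here are hypothetical.
 */
#if 0
static void
example_split_transfer(off_t offset, int io_size, int devblocksize,
		       int *head_size, int *middle_size, int *tail_size)
{
	*head_size = 0;

	if (offset & (devblocksize - 1)) {
		/* bring the transfer up to a device block boundary */
		*head_size = devblocksize - (int)(offset & (devblocksize - 1));
		if (*head_size > io_size)
			*head_size = io_size;
		io_size -= *head_size;
	}
	/* whatever doesn't fill a whole device block at the end is the tail */
	*tail_size = io_size & (devblocksize - 1);
	*middle_size = io_size - *tail_size;
}
#endif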
3037
3038
3039 /*
3040 * generate advisory I/O's in the largest chunks possible
3041 * the completed pages will be released into the VM cache
3042 */
3043 int
3044 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3045 struct vnode *vp;
3046 off_t filesize;
3047 off_t f_offset;
3048 int resid;
3049 int devblocksize;
3050 {
3051 upl_page_info_t *pl;
3052 upl_t upl;
3053 vm_offset_t upl_offset;
3054 int upl_size;
3055 off_t upl_f_offset;
3056 int start_offset;
3057 int start_pg;
3058 int last_pg;
3059 int pages_in_upl;
3060 off_t max_size;
3061 int io_size;
3062 kern_return_t kret;
3063 int retval = 0;
3064 int issued_io;
3065
3066 if (!UBCINFOEXISTS(vp))
3067 return(EINVAL);
3068
3069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3070 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3071
3072 while (resid && f_offset < filesize && retval == 0) {
3073 /*
3074 * compute the size of the upl needed to encompass
3075 * the requested read... limit each call to cluster_io
3076 * to the maximum UPL size... cluster_io will clip if
3077 * this exceeds the maximum io_size for the device...
3078 * make sure to account for
3079 * a starting offset that's not page aligned
3080 */
3081 start_offset = (int)(f_offset & PAGE_MASK_64);
3082 upl_f_offset = f_offset - (off_t)start_offset;
3083 max_size = filesize - f_offset;
3084
3085 if (resid < max_size)
3086 io_size = resid;
3087 else
3088 io_size = max_size;
3089
3090 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3091 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3092 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3093 pages_in_upl = upl_size / PAGE_SIZE;
3094
3095 kret = ubc_create_upl(vp,
3096 upl_f_offset,
3097 upl_size,
3098 &upl,
3099 &pl,
3100 UPL_RET_ONLY_ABSENT);
3101 if (kret != KERN_SUCCESS)
3102 return(retval);
3103 issued_io = 0;
3104
3105 /*
3106 * before we start marching forward, we must make sure we end on
3107 * a present page, otherwise we will be working with a freed
3108 * upl
3109 */
3110 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3111 if (upl_page_present(pl, last_pg))
3112 break;
3113 }
3114 pages_in_upl = last_pg + 1;
3115
3116
3117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3118 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3119
3120
3121 for (last_pg = 0; last_pg < pages_in_upl; ) {
3122 /*
3123 * scan from the beginning of the upl looking for the first
3124 * page that is present.... this will become the first page in
3125 * the request we're going to make to 'cluster_io'... if all
3126 * of the pages are absent, we won't call through to 'cluster_io'
3127 */
3128 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3129 if (upl_page_present(pl, start_pg))
3130 break;
3131 }
3132
3133 /*
3134 * scan from the starting present page looking for an absent
3135 * page before the end of the upl is reached, if we
3136 * find one, then it will terminate the range of pages being
3137 * presented to 'cluster_io'
3138 */
3139 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3140 if (!upl_page_present(pl, last_pg))
3141 break;
3142 }
3143
3144 if (last_pg > start_pg) {
3145 /*
3146 * we found a range of pages that must be filled
3147 * if the last page in this range is the last page of the file
3148 * we may have to clip the size of it to keep from reading past
3149 * the end of the last physical block associated with the file
3150 */
3151 upl_offset = start_pg * PAGE_SIZE;
3152 io_size = (last_pg - start_pg) * PAGE_SIZE;
3153
3154 if ((upl_f_offset + upl_offset + io_size) > filesize)
3155 io_size = filesize - (upl_f_offset + upl_offset);
3156
3157 /*
3158 * issue an asynchronous read to cluster_io
3159 */
3160 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3161 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3162
3163 issued_io = 1;
3164 }
3165 }
3166 if (issued_io == 0)
3167 ubc_upl_abort(upl, 0);
3168
3169 io_size = upl_size - start_offset;
3170
3171 if (io_size > resid)
3172 io_size = resid;
3173 f_offset += io_size;
3174 resid -= io_size;
3175 }
3176
3177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3178 (int)f_offset, resid, retval, 0, 0);
3179
3180 return(retval);
3181 }
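/*
 * Illustrative sketch (not compiled): why advisory_read trims the upl
 * before walking it.  Every I/O issued against the upl commits and frees
 * its pages, so the walk must end on the last page actually present in
 * the upl... otherwise the loop could touch a upl that has already been
 * freed.  'present' is a hypothetical stand-in for upl_page_present(pl, pg).
 */
#if 0
static int
example_trim_to_last_present(const char *present, int pages_in_upl)
{
	int last_pg;

	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--)
		if (present[last_pg])
			break;

	/* new page count for the walk; 0 means nothing in the upl is present */
	return (last_pg + 1);
}
#endif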
3182
3183
3184 int
3185 cluster_push(vp)
3186 struct vnode *vp;
3187 {
3188 int retval;
3189
3190 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3191 vp->v_flag &= ~VHASDIRTY;
3192 return(0);
3193 }
3194
3195 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3196 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3197
3198 if (vp->v_flag & VHASDIRTY) {
3199 daddr_t start_pg;
3200 daddr_t last_pg;
3201 daddr_t end_pg;
3202
3203 start_pg = vp->v_cstart;
3204 end_pg = vp->v_lastw;
3205
3206 vp->v_flag &= ~VHASDIRTY;
3207 vp->v_clen = 0;
3208
3209 while (start_pg < end_pg) {
3210 last_pg = start_pg + MAX_UPL_TRANSFER;
3211
3212 if (last_pg > end_pg)
3213 last_pg = end_pg;
3214
3215 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3216
3217 start_pg = last_pg;
3218 }
3219 return (1);
3220 }
3221 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3222
3223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3224 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3225
3226 return (retval);
3227 }
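/*
 * Illustrative sketch (not compiled): the VHASDIRTY fallback above, which
 * walks the overall dirty page range in MAX_UPL_TRANSFER sized chunks.
 * example_push_range and chunk_pages are hypothetical stand-ins for
 * cluster_push_x and MAX_UPL_TRANSFER.
 */
#if 0
static void
example_push_dirty_range(daddr_t start_pg, daddr_t end_pg, daddr_t chunk_pages,
			 void (*example_push_range)(daddr_t, daddr_t))
{
	daddr_t last_pg;

	while (start_pg < end_pg) {
		last_pg = start_pg + chunk_pages;

		if (last_pg > end_pg)
			last_pg = end_pg;

		example_push_range(start_pg, last_pg);

		start_pg = last_pg;
	}
}
#endif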
3228
3229
3230 static int
3231 cluster_try_push(vp, EOF, can_delay, push_all)
3232 struct vnode *vp;
3233 off_t EOF;
3234 int can_delay;
3235 int push_all;
3236 {
3237 int cl_index;
3238 int cl_index1;
3239 int min_index;
3240 int cl_len;
3241 int cl_total;
3242 int cl_pushed;
3243 struct v_cluster l_clusters[MAX_CLUSTERS];
3244
3245 /*
3246 * make a local 'sorted' copy of the clusters
3247 * and clear vp->v_clen so that new clusters can
3248 * be developed
3249 */
3250 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3251 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3252 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3253 continue;
3254 if (min_index == -1)
3255 min_index = cl_index1;
3256 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3257 min_index = cl_index1;
3258 }
3259 if (min_index == -1)
3260 break;
3261 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3262 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3263
3264 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3265 }
3266 cl_len = cl_index;
3267 vp->v_clen = 0;
3268
3269 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3270 /*
3271 * try to push each cluster in turn... cluster_push_x may not
3272 * push the cluster if can_delay is TRUE and the cluster doesn't
3273 * meet the criteria for an immediate push
3274 */
3275 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3276 l_clusters[cl_index].start_pg = 0;
3277 l_clusters[cl_index].last_pg = 0;
3278
3279 cl_pushed++;
3280
3281 if (push_all == 0)
3282 break;
3283 }
3284 }
3285 if (cl_len > cl_pushed) {
3286 /*
3287 * we didn't push all of the clusters, so
3288 * lets try to merge them back in to the vnode
3289 */
3290 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3291 /*
3292 * we picked up some new clusters while we were trying to
3293 * push the old ones (I don't think this can happen because
3294 * I'm holding the lock, but just in case)... the sum of the
3295 * leftovers plus the new cluster count exceeds our ability
3296 * to represent them, so fall back to the VHASDIRTY mechanism
3297 */
3298 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3299 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3300 continue;
3301
3302 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3303 vp->v_cstart = l_clusters[cl_index].start_pg;
3304 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3305 vp->v_lastw = l_clusters[cl_index].last_pg;
3306 }
3307 vp->v_flag |= VHASDIRTY;
3308 } else {
3309 /*
3310 * we've got room to merge the leftovers back in
3311 * just append them starting at the next 'hole'
3312 * represented by vp->v_clen
3313 */
3314 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3315 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3316 continue;
3317
3318 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3319 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3320
3321 if (cl_index1 == 0) {
3322 vp->v_cstart = l_clusters[cl_index].start_pg;
3323 vp->v_lastw = l_clusters[cl_index].last_pg;
3324 } else {
3325 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3326 vp->v_cstart = l_clusters[cl_index].start_pg;
3327 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3328 vp->v_lastw = l_clusters[cl_index].last_pg;
3329 }
3330 cl_index1++;
3331 }
3332 /*
3333 * update the cluster count
3334 */
3335 vp->v_clen = cl_index1;
3336 }
3337 }
3338 return(MAX_CLUSTERS - vp->v_clen);
3339 }
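/*
 * Illustrative sketch (not compiled): the selection pass above that copies
 * the vnode's clusters into a locally sorted array.  An entry whose start_pg
 * equals its last_pg is treated as empty, which is also how consumed source
 * entries are marked.  example_cluster and example_sort_clusters are
 * hypothetical stand-ins for struct v_cluster and the inline loop.
 */
#if 0
struct example_cluster {
	daddr_t start_pg;
	daddr_t last_pg;
};

static int
example_sort_clusters(struct example_cluster *src, int src_len,
		      struct example_cluster *dst)
{
	int i;
	int j;
	int min_index;

	for (i = 0; i < src_len; i++) {
		/* pick the remaining non-empty cluster with the lowest start_pg */
		for (min_index = -1, j = 0; j < src_len; j++) {
			if (src[j].start_pg == src[j].last_pg)
				continue;	/* empty or already consumed */
			if (min_index == -1 || src[j].start_pg < src[min_index].start_pg)
				min_index = j;
		}
		if (min_index == -1)
			break;
		dst[i] = src[min_index];

		/* mark the source entry consumed */
		src[min_index].start_pg = src[min_index].last_pg;
	}
	return (i);	/* number of non-empty clusters copied */
}
#endif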
3340
3341
3342
3343 static int
3344 cluster_push_x(vp, EOF, first, last, can_delay)
3345 struct vnode *vp;
3346 off_t EOF;
3347 daddr_t first;
3348 daddr_t last;
3349 int can_delay;
3350 {
3351 upl_page_info_t *pl;
3352 upl_t upl;
3353 vm_offset_t upl_offset;
3354 int upl_size;
3355 off_t upl_f_offset;
3356 int pages_in_upl;
3357 int start_pg;
3358 int last_pg;
3359 int io_size;
3360 int io_flags;
3361 int size;
3362 kern_return_t kret;
3363
3364
3365 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3366 vp->v_clen, first, last, EOF, 0);
3367
3368 if ((pages_in_upl = last - first) == 0) {
3369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3370
3371 return (1);
3372 }
3373 upl_size = pages_in_upl * PAGE_SIZE;
3374 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3375
3376 if (upl_f_offset + upl_size >= EOF) {
3377
3378 if (upl_f_offset >= EOF) {
3379 /*
3380 * must have truncated the file and missed
3381 * clearing a dangling cluster (i.e. it's completely
3382 * beyond the new EOF)
3383 */
3384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3385
3386 return(1);
3387 }
3388 size = EOF - upl_f_offset;
3389
3390 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3391 pages_in_upl = upl_size / PAGE_SIZE;
3392 } else {
3393 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3394 return(0);
3395 size = upl_size;
3396 }
3397 kret = ubc_create_upl(vp,
3398 upl_f_offset,
3399 upl_size,
3400 &upl,
3401 &pl,
3402 UPL_RET_ONLY_DIRTY);
3403 if (kret != KERN_SUCCESS)
3404 panic("cluster_push: failed to get pagelist");
3405
3406 if (can_delay) {
3407 int num_of_dirty;
3408
3409 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3410 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3411 num_of_dirty++;
3412 }
3413 if (num_of_dirty < pages_in_upl / 2) {
3414 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3415
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3417
3418 return(0);
3419 }
3420 }
3421 last_pg = 0;
3422
3423 while (size) {
3424
3425 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3426 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3427 break;
3428 }
3429 if (start_pg > last_pg) {
3430 io_size = (start_pg - last_pg) * PAGE_SIZE;
3431
3432 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3433 UPL_ABORT_FREE_ON_EMPTY);
3434
3435 if (io_size < size)
3436 size -= io_size;
3437 else
3438 break;
3439 }
3440 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3441 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3442 break;
3443 }
3444 upl_offset = start_pg * PAGE_SIZE;
3445
3446 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3447
3448 if (vp->v_flag & VNOCACHE_DATA)
3449 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3450 else
3451 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3452
3453 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3454 vp->v_flag |= VTHROTTLED;
3455 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3456 }
3457 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3458
3459 size -= io_size;
3460 }
3461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3462
3463 return(1);
3464 }
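/*
 * Illustrative sketch (not compiled): the scan above that carves the upl
 * into runs, releasing clean pages and writing out runs of dirty ones.
 * 'dirty' is a hypothetical array standing in for
 * upl_valid_page(pl, pg) && upl_dirty_page(pl, pg), and the callbacks stand
 * in for ubc_upl_abort_range and cluster_io.
 */
#if 0
static void
example_walk_dirty_runs(const char *dirty, int pages_in_upl,
			void (*release_clean)(int pg, int npgs),
			void (*write_dirty)(int pg, int npgs))
{
	int start_pg;
	int last_pg = 0;

	while (last_pg < pages_in_upl) {
		/* skip (and release) the run of clean pages, if any */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++)
			if (dirty[start_pg])
				break;
		if (start_pg > last_pg)
			release_clean(last_pg, start_pg - last_pg);

		/* find the end of the dirty run and write it out */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++)
			if (!dirty[last_pg])
				break;
		if (last_pg > start_pg)
			write_dirty(start_pg, last_pg - start_pg);
	}
}
#endif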
3465
3466
3467
3468 static int
3469 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3470 {
3471 struct iovec *iov;
3472 upl_page_info_t *pl;
3473 upl_t upl;
3474 vm_offset_t ubc_paddr;
3475 kern_return_t kret;
3476 int error = 0;
3477
3478 iov = uio->uio_iov;
3479
3480 kret = ubc_create_upl(vp,
3481 uio->uio_offset & ~PAGE_MASK_64,
3482 PAGE_SIZE,
3483 &upl,
3484 &pl,
3485 UPL_FLAGS_NONE);
3486
3487 if (kret != KERN_SUCCESS)
3488 return(EINVAL);
3489
3490 if (!upl_valid_page(pl, 0)) {
3491 /*
3492 * issue a synchronous read to cluster_io
3493 */
3494 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3495 CL_READ, (struct buf *)0, (struct clios *)0);
3496 if (error) {
3497 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3498
3499 return(error);
3500 }
3501 }
3502 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3503
3504 if (flags & CL_READ)
3505 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3506 else
3507 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3508
3509 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3510 /*
3511 * issue a synchronous write to cluster_io
3512 */
3513 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3514 0, (struct buf *)0, (struct clios *)0);
3515 }
3516 if (error == 0) {
3517 uio->uio_offset += xsize;
3518 iov->iov_base += xsize;
3519 iov->iov_len -= xsize;
3520 uio->uio_resid -= xsize;
3521 }
3522 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3523
3524 return (error);
3525 }