1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*
27 * Copyright (c) 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
59 */
60
61 #include <sys/param.h>
62 #include <sys/proc.h>
63 #include <sys/buf.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/trace.h>
67 #include <sys/malloc.h>
68 #include <sys/resourcevar.h>
69 #include <libkern/libkern.h>
70
71 #include <sys/ubc.h>
72 #include <vm/vm_pageout.h>
73
74 #include <sys/kdebug.h>
75
76 #define CL_READ 0x01
77 #define CL_ASYNC 0x02
78 #define CL_COMMIT 0x04
79 #define CL_PAGEOUT 0x10
80 #define CL_AGE 0x20
81 #define CL_DUMP 0x40
82 #define CL_NOZERO 0x80
83 #define CL_PAGEIN 0x100
84 #define CL_DEV_MEMORY 0x200
85 #define CL_PRESERVE 0x400
86
87
88 struct clios {
89 u_int io_completed; /* amount of io that has currently completed */
90 u_int io_issued; /* amount of io that was successfully issued */
91 int io_error; /* error code of first error encountered */
92 int io_wanted; /* someone is sleeping waiting for a change in state */
93 };
94
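/*
 * added note: the clios state above is the handshake used by the streaming
 * (nocopy) paths in this file... cluster_io() bumps io_issued as each chunk
 * is handed to the driver, cluster_iodone() bumps io_completed and wakes any
 * sleeper that set io_wanted, and the issuing thread throttles itself with a
 * loop of roughly this shape (this mirrors the code in cluster_nocopy_write
 * below; it is not additional original source):
 *
 *	while (iostate.io_issued != iostate.io_completed) {
 *		iostate.io_wanted = 1;
 *		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
 *	}
 *	if (iostate.io_error)
 *		error = iostate.io_error;
 */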
95
96 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
97 int size, struct buf *bp);
98 static int cluster_read_x(struct vnode *vp, struct uio *uio,
99 off_t filesize, int devblocksize, int flags);
100 static int cluster_write_x(struct vnode *vp, struct uio *uio,
101 off_t oldEOF, off_t newEOF, off_t headOff,
102 off_t tailOff, int devblocksize, int flags);
103 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
104 off_t filesize, int devblocksize, int flags);
105 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
106 off_t newEOF, int devblocksize, int flags);
107 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
108 off_t filesize, int devblocksize, int flags);
109 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
110 off_t newEOF, int devblocksize, int flags);
111 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
112 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
113 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
114 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
115
116
117 /*
118 * throttle the number of async writes that
119 * can be outstanding on a single vnode
120 * before we issue a synchronous write
121 */
122 #define ASYNC_THROTTLE 9
123
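/*
 * added note: cluster_iodone() is the completion routine for one clustered
 * transaction... a transaction is a chain of component buffers linked through
 * b_trans_next, with b_trans_head pointing at the first one.  until every
 * buffer in the chain is marked B_DONE this routine just returns.  once the
 * last component finishes, it accumulates the error and residual counts,
 * zero-fills the tail of the EOF page if b_validend was set, wakes any writer
 * throttled on v_numoutput, updates the clios stream (if any), calls biodone()
 * on the original buffer when B_NEED_IODONE is set, and finally commits or
 * aborts the UPL range covered by the transfer.
 */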
124 static int
125 cluster_iodone(bp)
126 struct buf *bp;
127 {
128 int b_flags;
129 int error;
130 int total_size;
131 int total_resid;
132 int upl_offset;
133 int zero_offset;
134 upl_t upl;
135 struct buf *cbp;
136 struct buf *cbp_head;
137 struct buf *cbp_next;
138 struct buf *real_bp;
139 struct vnode *vp;
140 struct clios *iostate;
141 int commit_size;
142 int pg_offset;
143
144
145 cbp_head = (struct buf *)(bp->b_trans_head);
146
147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
148 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
149
150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
151 /*
152 * all I/O requests that are part of this transaction
153 * have to complete before we can process it
154 */
155 if ( !(cbp->b_flags & B_DONE)) {
156
157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
158 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
159
160 return 0;
161 }
162 }
163 error = 0;
164 total_size = 0;
165 total_resid = 0;
166
167 cbp = cbp_head;
168 upl_offset = cbp->b_uploffset;
169 upl = cbp->b_pagelist;
170 b_flags = cbp->b_flags;
171 real_bp = cbp->b_real_bp;
172 vp = cbp->b_vp;
173 zero_offset= cbp->b_validend;
174 iostate = (struct clios *)cbp->b_iostate;
175
176 while (cbp) {
177 if (cbp->b_vectorcount > 1)
178 _FREE(cbp->b_vectorlist, M_SEGMENT);
179
180 if ((cbp->b_flags & B_ERROR) && error == 0)
181 error = cbp->b_error;
182
183 total_resid += cbp->b_resid;
184 total_size += cbp->b_bcount;
185
186 cbp_next = cbp->b_trans_next;
187
188 free_io_buf(cbp);
189
190 cbp = cbp_next;
191 }
192 if (zero_offset)
193 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
194
195 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
196 vp->v_flag &= ~VTHROTTLED;
197 wakeup((caddr_t)&vp->v_numoutput);
198 }
199 if (iostate) {
200 /*
 201          * someone has issued multiple I/Os asynchronously
202 * and is waiting for them to complete (streaming)
203 */
204 if (error && iostate->io_error == 0)
205 iostate->io_error = error;
206
207 iostate->io_completed += total_size;
208
209 if (iostate->io_wanted) {
210 /*
211 * someone is waiting for the state of
212 * this io stream to change
213 */
214 iostate->io_wanted = 0;
215 wakeup((caddr_t)&iostate->io_wanted);
216 }
217 }
218 if ((b_flags & B_NEED_IODONE) && real_bp) {
219 if (error) {
220 real_bp->b_flags |= B_ERROR;
221 real_bp->b_error = error;
222 }
223 real_bp->b_resid = total_resid;
224
225 biodone(real_bp);
226 }
227 if (error == 0 && total_resid)
228 error = EIO;
229
230 if (b_flags & B_COMMIT_UPL) {
231 pg_offset = upl_offset & PAGE_MASK;
232 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
233
234 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
235 int upl_abort_code;
236
237 if (b_flags & B_PHYS)
238 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
239 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
240 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
241 else if (b_flags & B_PGIN)
242 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
243 else
244 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
245
246 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
247 upl_abort_code);
248
249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
250 (int)upl, upl_offset - pg_offset, commit_size,
251 0x80000000|upl_abort_code, 0);
252
253 } else {
254 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
255
256 if (b_flags & B_PHYS)
257 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
258 else if ( !(b_flags & B_PAGEOUT))
259 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
260 if (b_flags & B_AGE)
261 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
262
263 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
264 upl_commit_flags);
265
266 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
267 (int)upl, upl_offset - pg_offset, commit_size,
268 upl_commit_flags, 0);
269 }
270 } else
271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
272 (int)upl, upl_offset, 0, error, 0);
273
274 return (error);
275 }
276
277
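/*
 * added note: cluster_zero() zeroes 'size' bytes starting at 'upl_offset'
 * within the given upl.  if the caller supplied a buffer header whose data
 * is already mapped (bp->b_data), that mapping is used directly; otherwise
 * the upl is temporarily mapped into the kernel with ubc_upl_map() and
 * unmapped again when the zeroing is done.
 */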
278 static void
279 cluster_zero(upl, upl_offset, size, bp)
280 upl_t upl;
281 vm_offset_t upl_offset;
282 int size;
283 struct buf *bp;
284 {
285 vm_offset_t io_addr = 0;
286 int must_unmap = 0;
287 kern_return_t kret;
288
289 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
290 upl_offset, size, (int)bp, 0, 0);
291
292 if (bp == NULL || bp->b_data == NULL) {
293 kret = ubc_upl_map(upl, &io_addr);
294
295 if (kret != KERN_SUCCESS)
296 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
297 if (io_addr == 0)
298 panic("cluster_zero: ubc_upl_map() mapped 0");
299
300 must_unmap = 1;
301 } else
302 io_addr = (vm_offset_t)bp->b_data;
303 bzero((caddr_t)(io_addr + upl_offset), size);
304
305 if (must_unmap) {
306 kret = ubc_upl_unmap(upl);
307
308 if (kret != KERN_SUCCESS)
309 panic("cluster_zero: kernel_upl_unmap failed");
310 }
311 }
312
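/*
 * added note: cluster_io() turns a upl-backed request into one or more device
 * transfers.  for each chunk it asks the filesystem where the data lives via
 * VOP_CMAP(), zero-fills read 'holes', builds an I/O vector (falling back to
 * the vector reserved in the buffer header when a single page suffices, so a
 * failed _MALLOC can't deadlock the pager), chains the component buffers into
 * a transaction and hands them to VOP_STRATEGY().  synchronous callers wait
 * via biowait()/cluster_iodone(); CL_ASYNC callers are completed through the
 * B_CALL/b_iodone hook.
 */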
313 static int
314 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
315 struct vnode *vp;
316 upl_t upl;
317 vm_offset_t upl_offset;
318 off_t f_offset;
319 int non_rounded_size;
320 int devblocksize;
321 int flags;
322 struct buf *real_bp;
323 struct clios *iostate;
324 {
325 struct buf *cbp;
326 struct iovec *iovp;
327 u_int size;
328 u_int io_size;
329 int io_flags;
330 int error = 0;
331 int retval = 0;
332 struct buf *cbp_head = 0;
333 struct buf *cbp_tail = 0;
334 upl_page_info_t *pl;
335 int buf_count = 0;
336 int pg_count;
337 int pg_offset;
338 u_int max_iosize;
339 u_int max_vectors;
340 int priv;
341 int zero_offset = 0;
342 u_int first_lblkno;
343
344 if (flags & CL_READ) {
345 io_flags = (B_VECTORLIST | B_READ);
346
347 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
348 } else {
349 io_flags = (B_VECTORLIST | B_WRITEINPROG);
350
351 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
352 }
353 pl = ubc_upl_pageinfo(upl);
354
355 if (flags & CL_AGE)
356 io_flags |= B_AGE;
357 if (flags & CL_DUMP)
358 io_flags |= B_NOCACHE;
359 if (flags & CL_PAGEIN)
360 io_flags |= B_PGIN;
361 if (flags & CL_PAGEOUT)
362 io_flags |= B_PAGEOUT;
363 if (flags & CL_COMMIT)
364 io_flags |= B_COMMIT_UPL;
365 if (flags & CL_PRESERVE)
366 io_flags |= B_PHYS;
367
368 if (devblocksize)
369 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
370 else
371 size = non_rounded_size;
372
373
374 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
375 (int)f_offset, size, upl_offset, flags, 0);
376
377 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
378 /*
 379          * then we are going to end up
 380          * with a page that we can't complete (the file size wasn't a multiple
 381          * of PAGE_SIZE and we're trying to read to the end of the file),
 382          * so we'll go ahead and zero out the portion of the page we can't
 383          * read in from the file
384 */
385 zero_offset = upl_offset + non_rounded_size;
386 }
387 while (size) {
388 int vsize;
389 int i;
390 int pl_index;
391 int pg_resid;
392 int num_contig;
393 daddr_t lblkno;
394 daddr_t blkno;
395
396 if (size > max_iosize)
397 io_size = max_iosize;
398 else
399 io_size = size;
400
401 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
402 if (error == EOPNOTSUPP)
403 panic("VOP_CMAP Unimplemented");
404 break;
405 }
406
407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
408 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
409
410 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
411 if (flags & CL_PAGEOUT) {
412 error = EINVAL;
413 break;
414 };
415
416 /* Try paging out the page individually before
417 giving up entirely and dumping it (it could
418 be mapped in a "hole" and require allocation
 419            before the I/O)
420 */
421 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
422 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
423 error = EINVAL;
424 break;
425 };
426
427 upl_offset += PAGE_SIZE_64;
428 f_offset += PAGE_SIZE_64;
429 size -= PAGE_SIZE_64;
430 continue;
431 }
432 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
433 /*
434 * we have now figured out how much I/O we can do - this is in 'io_size'
435 * pl_index represents the first page in the 'upl' that the I/O will occur for
436 * pg_offset is the starting point in the first page for the I/O
437 * pg_count is the number of full and partial pages that 'io_size' encompasses
438 */
439 pl_index = upl_offset / PAGE_SIZE;
440 pg_offset = upl_offset & PAGE_MASK;
441 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
442
443 if (flags & CL_DEV_MEMORY) {
444 /*
445 * currently, can't deal with reading 'holes' in file
446 */
447 if ((long)blkno == -1) {
448 error = EINVAL;
449 break;
450 }
451 /*
452 * treat physical requests as one 'giant' page
453 */
454 pg_count = 1;
455 }
456 if ((flags & CL_READ) && (long)blkno == -1) {
457 int bytes_to_zero;
458
459 /*
460 * if we're reading and blkno == -1, then we've got a
461 * 'hole' in the file that we need to deal with by zeroing
462 * out the affected area in the upl
463 */
464 if (zero_offset && io_size == size) {
465 /*
466 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 467                          * then 'zero_offset' will be non-zero
468 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
469 * (indicated by the io_size finishing off the I/O request for this UPL)
 470                          * then we're not going to issue an I/O for the
471 * last page in this upl... we need to zero both the hole and the tail
472 * of the page beyond the EOF, since the delayed zero-fill won't kick in
473 */
474 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
475
476 zero_offset = 0;
477 } else
478 bytes_to_zero = io_size;
479
480 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
481
482 if (cbp_head)
483 /*
484 * if there is a current I/O chain pending
485 * then the first page of the group we just zero'd
486 * will be handled by the I/O completion if the zero
487 * fill started in the middle of the page
488 */
489 pg_count = (io_size - pg_offset) / PAGE_SIZE;
490 else {
491 /*
492 * no pending I/O to pick up that first page
493 * so, we have to make sure it gets committed
494 * here.
495 * set the pg_offset to 0 so that the upl_commit_range
496 * starts with this page
497 */
498 pg_count = (io_size + pg_offset) / PAGE_SIZE;
499 pg_offset = 0;
500 }
501 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
502 /*
503 * if we're done with the request for this UPL
504 * then we have to make sure to commit the last page
505 * even if we only partially zero-filled it
506 */
507 pg_count++;
508
509 if (pg_count) {
510 if (pg_offset)
511 pg_resid = PAGE_SIZE - pg_offset;
512 else
513 pg_resid = 0;
514
515 if (flags & CL_COMMIT)
516 ubc_upl_commit_range(upl,
517 (upl_offset + pg_resid) & ~PAGE_MASK,
518 pg_count * PAGE_SIZE,
519 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
520 }
521 upl_offset += io_size;
522 f_offset += io_size;
523 size -= io_size;
524
525 if (cbp_head && pg_count)
526 goto start_io;
527 continue;
528
529 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
530 real_bp->b_blkno = blkno;
531 }
532
533 if (pg_count > 1) {
534 if (pg_count > max_vectors) {
535 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
536
 537                          if ((int)io_size < 0) { /* io_size is u_int; cast so the clamp above can't wrap past zero */
538 io_size = PAGE_SIZE - pg_offset;
539 pg_count = 1;
540 } else
541 pg_count = max_vectors;
542 }
543 /*
544 * we need to allocate space for the vector list
545 */
546 if (pg_count > 1) {
547 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
548 M_SEGMENT, M_NOWAIT);
549
550 if (iovp == (struct iovec *) 0) {
551 /*
552 * if the allocation fails, then throttle down to a single page
553 */
554 io_size = PAGE_SIZE - pg_offset;
555 pg_count = 1;
556 }
557 }
558 }
559
560 /* Throttle the speculative IO */
561 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
562 priv = 0;
563 else
564 priv = 1;
565
566 cbp = alloc_io_buf(vp, priv);
567
568 if (pg_count == 1)
569 /*
570 * we use the io vector that's reserved in the buffer header
 571                  * this ensures we can always issue an I/O even in a low memory
572 * condition that prevents the _MALLOC from succeeding... this
573 * is necessary to prevent deadlocks with the pager
574 */
575 iovp = (struct iovec *)(&cbp->b_vects[0]);
576
577 cbp->b_vectorlist = (void *)iovp;
578 cbp->b_vectorcount = pg_count;
579
580 if (flags & CL_DEV_MEMORY) {
581
582 iovp->iov_len = io_size;
583 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
584
585 if (iovp->iov_base == (caddr_t) 0) {
586 free_io_buf(cbp);
587 error = EINVAL;
588 } else
589 iovp->iov_base += upl_offset;
590 } else {
591
592 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
593 int psize;
594
595 psize = PAGE_SIZE - pg_offset;
596
597 if (psize > vsize)
598 psize = vsize;
599
600 iovp->iov_len = psize;
601 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
602
603 if (iovp->iov_base == (caddr_t) 0) {
604 if (pg_count > 1)
605 _FREE(cbp->b_vectorlist, M_SEGMENT);
606 free_io_buf(cbp);
607
608 error = EINVAL;
609 break;
610 }
611 iovp->iov_base += pg_offset;
612 pg_offset = 0;
613
614 if (flags & CL_PAGEOUT) {
615 int s;
616 struct buf *bp;
617
618 s = splbio();
619 if (bp = incore(vp, lblkno + i)) {
620 if (!ISSET(bp->b_flags, B_BUSY)) {
621 bremfree(bp);
622 SET(bp->b_flags, (B_BUSY | B_INVAL));
623 splx(s);
624 brelse(bp);
625 } else
626 panic("BUSY bp found in cluster_io");
627 }
628 splx(s);
629 }
630 vsize -= psize;
631 }
632 }
633 if (error)
634 break;
635
636 if (flags & CL_ASYNC) {
637 cbp->b_flags |= (B_CALL | B_ASYNC);
638 cbp->b_iodone = (void *)cluster_iodone;
639 }
640 cbp->b_flags |= io_flags;
641
642 cbp->b_lblkno = lblkno;
643 cbp->b_blkno = blkno;
644 cbp->b_bcount = io_size;
645 cbp->b_pagelist = upl;
646 cbp->b_uploffset = upl_offset;
647 cbp->b_trans_next = (struct buf *)0;
648
649 if (cbp->b_iostate = (void *)iostate)
650 /*
651 * caller wants to track the state of this
652 * io... bump the amount issued against this stream
653 */
654 iostate->io_issued += io_size;
655
656 if (flags & CL_READ)
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659 else
660 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
661 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
662
663 if (cbp_head) {
664 cbp_tail->b_trans_next = cbp;
665 cbp_tail = cbp;
666 } else {
667 cbp_head = cbp;
668 cbp_tail = cbp;
669 }
670 (struct buf *)(cbp->b_trans_head) = cbp_head;
671 buf_count++;
672
673 upl_offset += io_size;
674 f_offset += io_size;
675 size -= io_size;
676
677 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
678 /*
679 * if we have no more I/O to issue or
680 * the current I/O we've prepared fully
681 * completes the last page in this request
682 * and it's either an ASYNC request or
683 * we've already accumulated more than 8 I/O's into
684 * this transaction and it's not an I/O directed to
685 * special DEVICE memory
686 * then go ahead and issue the I/O
687 */
688 start_io:
689 if (real_bp) {
690 cbp_head->b_flags |= B_NEED_IODONE;
691 cbp_head->b_real_bp = real_bp;
692 } else
693 cbp_head->b_real_bp = (struct buf *)NULL;
694
695 if (size == 0) {
696 /*
697 * we're about to issue the last I/O for this upl
698 * if this was a read to the eof and the eof doesn't
 699                          * finish on a page boundary, then we need to zero-fill
700 * the rest of the page....
701 */
702 cbp_head->b_validend = zero_offset;
703 } else
704 cbp_head->b_validend = 0;
705
706 for (cbp = cbp_head; cbp;) {
707 struct buf * cbp_next;
708
709 if (io_flags & B_WRITEINPROG)
710 cbp->b_vp->v_numoutput++;
711
712 cbp_next = cbp->b_trans_next;
713
714 (void) VOP_STRATEGY(cbp);
715 cbp = cbp_next;
716 }
717 if ( !(flags & CL_ASYNC)) {
718 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
719 biowait(cbp);
720
721 if (error = cluster_iodone(cbp_head)) {
722 if ((flags & CL_PAGEOUT) && (error == ENXIO))
723 retval = 0; /* drop the error */
724 else
725 retval = error;
726 error = 0;
727 }
728 }
729 cbp_head = (struct buf *)0;
730 cbp_tail = (struct buf *)0;
731
732 buf_count = 0;
733 }
734 }
735 if (error) {
736 int abort_size;
737
738 io_size = 0;
739
740 for (cbp = cbp_head; cbp;) {
741 struct buf * cbp_next;
742
743 if (cbp->b_vectorcount > 1)
744 _FREE(cbp->b_vectorlist, M_SEGMENT);
745 upl_offset -= cbp->b_bcount;
746 size += cbp->b_bcount;
747 io_size += cbp->b_bcount;
748
749 cbp_next = cbp->b_trans_next;
750 free_io_buf(cbp);
751 cbp = cbp_next;
752 }
753 if (iostate) {
754 /*
755 * update the error condition for this stream
756 * since we never really issued the io
757 * just go ahead and adjust it back
758 */
759 if (iostate->io_error == 0)
760 iostate->io_error = error;
761 iostate->io_issued -= io_size;
762
763 if (iostate->io_wanted) {
764 /*
765 * someone is waiting for the state of
766 * this io stream to change
767 */
768 iostate->io_wanted = 0;
769 wakeup((caddr_t)&iostate->io_wanted);
770 }
771 }
772 pg_offset = upl_offset & PAGE_MASK;
773 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
774
775 if (flags & CL_COMMIT) {
776 int upl_abort_code;
777
778 if (flags & CL_PRESERVE)
779 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
780 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
781 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
782 else if (flags & CL_PAGEIN)
783 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
784 else
785 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
786
787 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
788 upl_abort_code);
789
790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
791 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
792 }
793 if (real_bp) {
794 real_bp->b_flags |= B_ERROR;
795 real_bp->b_error = error;
796
797 biodone(real_bp);
798 }
799 if (retval == 0)
800 retval = error;
801 }
802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
803 (int)f_offset, size, upl_offset, retval, 0);
804
805 return (retval);
806 }
807
808
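/*
 * added note: cluster_rd_prefetch() clips the requested prefetch to
 * MAX_UPL_TRANSFER pages and to the end of the file, skips over pages that
 * are already resident (ubc_page_op) and issues an advisory_read() for the
 * remainder.  it returns the number of pages the request spanned so the
 * caller can advance the vnode's read-ahead state.
 */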
809 static int
810 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
811 struct vnode *vp;
812 off_t f_offset;
813 u_int size;
814 off_t filesize;
815 int devblocksize;
816 {
817 int pages_to_fetch;
818 int skipped_pages;
819
820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
821 (int)f_offset, size, (int)filesize, 0, 0);
822
823 if (f_offset >= filesize) {
824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
825 (int)f_offset, 0, 0, 0, 0);
826 return(0);
827 }
828 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
829 size = MAX_UPL_TRANSFER * PAGE_SIZE;
830 else
831 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
832
833 if ((off_t)size > (filesize - f_offset))
834 size = filesize - f_offset;
835
836 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
837
838 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
839 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
840 break;
841 f_offset += PAGE_SIZE;
842 size -= PAGE_SIZE;
843 }
844 if (skipped_pages < pages_to_fetch)
845 advisory_read(vp, filesize, f_offset, size, devblocksize);
846
847 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
848 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
849
850 return (pages_to_fetch);
851 }
852
853
854
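/*
 * added note: cluster_rd_ahead() is the sequential read-ahead heuristic.
 * v_lastr remembers the last logical block read, v_ralen is the current
 * read-ahead window (doubled on each sequential hit, capped at
 * MAX_UPL_TRANSFER pages) and v_maxra is the furthest block already
 * prefetched.  non-sequential access resets the window; otherwise the next
 * window is prefetched via cluster_rd_prefetch() starting just past
 * max(e_lblkno, v_maxra).
 */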
855 static void
856 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
857 struct vnode *vp;
858 daddr_t b_lblkno;
859 daddr_t e_lblkno;
860 off_t filesize;
861 int devblocksize;
862 {
863 daddr_t r_lblkno;
864 off_t f_offset;
865 int size_of_prefetch;
866 int max_pages;
867
868 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
869 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
870
871 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
873 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
874 return;
875 }
876
877 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
878 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
879 vp->v_ralen = 0;
880 vp->v_maxra = 0;
881
882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
883 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
884
885 return;
886 }
887 max_pages = MAX_UPL_TRANSFER;
888
889 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
890
891 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
892 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
893
894 if (e_lblkno < vp->v_maxra) {
895 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
896
897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
898 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
899 return;
900 }
901 }
902 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
903 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
904
905 if (f_offset < filesize) {
906 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
907
908 if (size_of_prefetch)
909 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
910 }
911 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
912 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
913 }
914
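/*
 * added note: cluster_pageout() is the VM pageout entry point.  it validates
 * the request (positive size, page aligned, within the file, not a read-only
 * mount), aborts the portion of the upl that lies beyond the EOF, throttles
 * the caller while ASYNC_THROTTLE or more writes are already in flight on
 * the vnode, and then passes the clipped range to cluster_io() with
 * CL_PAGEOUT set.
 */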
915 int
916 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
917 struct vnode *vp;
918 upl_t upl;
919 vm_offset_t upl_offset;
920 off_t f_offset;
921 int size;
922 off_t filesize;
923 int devblocksize;
924 int flags;
925 {
926 int io_size;
927 int pg_size;
928 off_t max_size;
929 int local_flags = CL_PAGEOUT;
930
931 if ((flags & UPL_IOSYNC) == 0)
932 local_flags |= CL_ASYNC;
933 if ((flags & UPL_NOCOMMIT) == 0)
934 local_flags |= CL_COMMIT;
935
936
937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
938 (int)f_offset, size, (int)filesize, local_flags, 0);
939
940 /*
941 * If they didn't specify any I/O, then we are done...
942 * we can't issue an abort because we don't know how
943 * big the upl really is
944 */
945 if (size <= 0)
946 return (EINVAL);
947
948 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
949 if (local_flags & CL_COMMIT)
950 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
951 return (EROFS);
952 }
953 /*
 954   * can't page-out from a negative offset
955 * or if we're starting beyond the EOF
956 * or if the file offset isn't page aligned
957 * or the size requested isn't a multiple of PAGE_SIZE
958 */
959 if (f_offset < 0 || f_offset >= filesize ||
960 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
961 if (local_flags & CL_COMMIT)
962 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
963 return (EINVAL);
964 }
965 max_size = filesize - f_offset;
966
967 if (size < max_size)
968 io_size = size;
969 else
970 io_size = max_size;
971
972 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
973
974 if (size > pg_size) {
975 if (local_flags & CL_COMMIT)
976 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
977 UPL_ABORT_FREE_ON_EMPTY);
978 }
979 while (vp->v_numoutput >= ASYNC_THROTTLE) {
980 vp->v_flag |= VTHROTTLED;
981 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
982 }
983
984 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
985 local_flags, (struct buf *)0, (struct clios *)0));
986 }
987
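/*
 * added note: cluster_pagein() is the page-in counterpart of
 * cluster_pageout().  after validating and clipping the request it calls
 * cluster_io() with CL_READ | CL_PAGEIN, and on success feeds the block range
 * just read into cluster_rd_ahead() (unless read-ahead is disabled) so that
 * sequential faults keep the read-ahead window growing.
 */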
988 int
989 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
990 struct vnode *vp;
991 upl_t upl;
992 vm_offset_t upl_offset;
993 off_t f_offset;
994 int size;
995 off_t filesize;
996 int devblocksize;
997 int flags;
998 {
999 u_int io_size;
1000 int rounded_size;
1001 off_t max_size;
1002 int retval;
1003 int local_flags = 0;
1004
1005 if (upl == NULL || size < 0)
1006 panic("cluster_pagein: NULL upl passed in");
1007
1008 if ((flags & UPL_IOSYNC) == 0)
1009 local_flags |= CL_ASYNC;
1010 if ((flags & UPL_NOCOMMIT) == 0)
1011 local_flags |= CL_COMMIT;
1012
1013
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1015 (int)f_offset, size, (int)filesize, local_flags, 0);
1016
1017 /*
1018 * can't page-in from a negative offset
1019 * or if we're starting beyond the EOF
1020 * or if the file offset isn't page aligned
1021 * or the size requested isn't a multiple of PAGE_SIZE
1022 */
1023 if (f_offset < 0 || f_offset >= filesize ||
1024 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1025 if (local_flags & CL_COMMIT)
1026 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1027 return (EINVAL);
1028 }
1029 max_size = filesize - f_offset;
1030
1031 if (size < max_size)
1032 io_size = size;
1033 else
1034 io_size = max_size;
1035
1036 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1037
1038 if (size > rounded_size && (local_flags & CL_COMMIT))
1039 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1040 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1041
1042 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1043 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1044
1045 if (retval == 0) {
1046 int b_lblkno;
1047 int e_lblkno;
1048
1049 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1050 e_lblkno = (int)
1051 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1052
1053 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1054 /*
1055                  * we haven't read in the last page of the file yet
1056 * so let's try to read ahead if we're in
1057 * a sequential access pattern
1058 */
1059 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1060 }
1061 vp->v_lastr = e_lblkno;
1062 }
1063 return (retval);
1064 }
1065
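/*
 * added note: cluster_bp() lets a filesystem push an already-built buffer
 * (with an attached upl) through cluster_io(); the logical block number is
 * converted to a file offset with ubc_blktooff() and the transfer is always
 * issued asynchronously.
 */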
1066 int
1067 cluster_bp(bp)
1068 struct buf *bp;
1069 {
1070 off_t f_offset;
1071 int flags;
1072
1073 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1074 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1075
1076 if (bp->b_pagelist == (upl_t) 0)
1077 panic("cluster_bp: can't handle NULL upl yet\n");
1078 if (bp->b_flags & B_READ)
1079 flags = CL_ASYNC | CL_READ;
1080 else
1081 flags = CL_ASYNC;
1082
1083 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1084
1085 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1086 }
1087
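/*
 * added note: cluster_write() only dispatches.  vnodes without VNOCACHE_DATA
 * (or non user-space uios) go straight to the buffered path, cluster_write_x().
 * otherwise each iovec is probed with vm_map_get_upl(): physically contiguous
 * targets use cluster_phys_write(), small or unaligned transfers fall back to
 * cluster_write_x(), and large page-aligned transfers take the nocopy path,
 * cluster_nocopy_write().
 */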
1088 int
1089 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1090 struct vnode *vp;
1091 struct uio *uio;
1092 off_t oldEOF;
1093 off_t newEOF;
1094 off_t headOff;
1095 off_t tailOff;
1096 int devblocksize;
1097 int flags;
1098 {
1099 int prev_resid;
1100 int clip_size;
1101 off_t max_io_size;
1102 struct iovec *iov;
1103 vm_offset_t upl_offset;
1104 int upl_size;
1105 int pages_in_pl;
1106 upl_page_info_t *pl;
1107 int upl_flags;
1108 upl_t upl;
1109 int retval = 0;
1110
1111
1112 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1113 {
1114 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1115 return(retval);
1116 }
1117
1118 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1119 {
1120 /* we know we have a resid, so this is safe */
1121 iov = uio->uio_iov;
1122 while (iov->iov_len == 0) {
1123 uio->uio_iov++;
1124 uio->uio_iovcnt--;
1125 iov = uio->uio_iov;
1126 }
1127
1128 /*
1129 * We check every vector target and if it is physically
1130 * contiguous space, we skip the sanity checks.
1131 */
1132
1133 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1134 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1135 pages_in_pl = 0;
1136 upl_flags = UPL_QUERY_OBJECT_TYPE;
1137 if ((vm_map_get_upl(current_map(),
1138 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1139 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1140 {
1141 /*
1142 * the user app must have passed in an invalid address
1143 */
1144 return (EFAULT);
1145 }
1146
1147 if (upl_flags & UPL_PHYS_CONTIG)
1148 {
1149 if (flags & IO_HEADZEROFILL)
1150 {
1151 flags &= ~IO_HEADZEROFILL;
1152
1153 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1154 return(retval);
1155 }
1156
1157 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1158
1159 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1160 {
1161 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1162 return(retval);
1163 }
1164 }
1165 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1166 {
1167 /*
1168            * We set a threshold of 4 pages to decide if the nocopy
1169 * write loop is worth the trouble...
1170 * we also come here if we're trying to zero the head and/or tail
1171 * of a partially written page, and the user source is not a physically contiguous region
1172 */
1173 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1174 return(retval);
1175 }
1176 else if (uio->uio_offset & PAGE_MASK_64)
1177 {
1178            /* Bring the write's file offset up to a pagesize boundary */
1179 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1180 if (uio->uio_resid < clip_size)
1181 clip_size = uio->uio_resid;
1182 /*
1183 * Fake the resid going into the cluster_write_x call
1184 * and restore it on the way out.
1185 */
1186 prev_resid = uio->uio_resid;
1187 uio->uio_resid = clip_size;
1188 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1189 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1190 }
1191 else if ((int)iov->iov_base & PAGE_MASK_64)
1192 {
1193 clip_size = iov->iov_len;
1194 prev_resid = uio->uio_resid;
1195 uio->uio_resid = clip_size;
1196 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1197 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1198 }
1199 else
1200 {
1201 /*
1202 * If we come in here, we know the offset into
1203 * the file is on a pagesize boundary
1204 */
1205
1206 max_io_size = newEOF - uio->uio_offset;
1207 clip_size = uio->uio_resid;
1208 if (iov->iov_len < clip_size)
1209 clip_size = iov->iov_len;
1210 if (max_io_size < clip_size)
1211 clip_size = max_io_size;
1212
1213 if (clip_size < PAGE_SIZE)
1214 {
1215 /*
1216 * Take care of tail end of write in this vector
1217 */
1218 prev_resid = uio->uio_resid;
1219 uio->uio_resid = clip_size;
1220 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1221 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1222 }
1223 else
1224 {
1225 /* round clip_size down to a multiple of pagesize */
1226 clip_size = clip_size & ~(PAGE_MASK);
1227 prev_resid = uio->uio_resid;
1228 uio->uio_resid = clip_size;
1229 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1230 if ((retval == 0) && uio->uio_resid)
1231 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1232 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1233 }
1234 } /* end else */
1235 } /* end while */
1236 return(retval);
1237 }
1238
1239
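/*
 * added note: cluster_nocopy_write() wires the user's buffer with
 * vm_map_get_upl() and writes directly from it, bypassing the buffer cache.
 * cached pages covering the same file range are dumped first (UPL_POP_DUMP),
 * the writes are issued asynchronously through cluster_io() with a clios to
 * track them, and the routine sleeps whenever more than
 * 2 * MAX_UPL_TRANSFER * PAGE_SIZE bytes are outstanding, as well as before
 * returning.
 */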
1240 static int
1241 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1242 struct vnode *vp;
1243 struct uio *uio;
1244 off_t newEOF;
1245 int devblocksize;
1246 int flags;
1247 {
1248 upl_t upl;
1249 upl_page_info_t *pl;
1250 off_t upl_f_offset;
1251 vm_offset_t upl_offset;
1252 off_t max_io_size;
1253 int io_size;
1254 int io_flag;
1255 int upl_size;
1256 int upl_needed_size;
1257 int pages_in_pl;
1258 int upl_flags;
1259 kern_return_t kret;
1260 struct iovec *iov;
1261 int i;
1262 int first = 1;
1263 int force_data_sync;
1264 int error = 0;
1265 struct clios iostate;
1266
1267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1268 (int)uio->uio_offset, (int)uio->uio_resid,
1269 (int)newEOF, devblocksize, 0);
1270
1271 /*
1272 * When we enter this routine, we know
1273 * -- the offset into the file is on a pagesize boundary
1274 * -- the resid is a page multiple
1275 * -- the resid will not exceed iov_len
1276 */
1277 cluster_try_push(vp, newEOF, 0, 1);
1278
1279 iostate.io_completed = 0;
1280 iostate.io_issued = 0;
1281 iostate.io_error = 0;
1282 iostate.io_wanted = 0;
1283
1284 iov = uio->uio_iov;
1285
1286 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1287 io_size = uio->uio_resid;
1288
1289 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1290 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1291
1292 if (first) {
1293 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1294 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1295 first = 0;
1296 }
1297 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1298 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1299
1300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1301 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1302
1303 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1304 pages_in_pl = 0;
1305 upl_size = upl_needed_size;
1306 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1307 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1308
1309 kret = vm_map_get_upl(current_map(),
1310 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1311 &upl_size,
1312 &upl,
1313 NULL,
1314 &pages_in_pl,
1315 &upl_flags,
1316 force_data_sync);
1317
1318 if (kret != KERN_SUCCESS) {
1319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1320 0, 0, 0, kret, 0);
1321
1322 /*
1323 * cluster_nocopy_write: failed to get pagelist
1324 *
1325 * we may have already spun some portion of this request
1326 * off as async requests... we need to wait for the I/O
1327 * to complete before returning
1328 */
1329 goto wait_for_writes;
1330 }
1331 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1332 pages_in_pl = upl_size / PAGE_SIZE;
1333
1334 for (i = 0; i < pages_in_pl; i++) {
1335 if (!upl_valid_page(pl, i))
1336 break;
1337 }
1338 if (i == pages_in_pl)
1339 break;
1340
1341 /*
1342 * didn't get all the pages back that we
1343 * needed... release this upl and try again
1344 */
1345 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1346 UPL_ABORT_FREE_ON_EMPTY);
1347 }
1348 if (force_data_sync >= 3) {
1349 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1350 i, pages_in_pl, upl_size, kret, 0);
1351
1352 /*
1353 * for some reason, we couldn't acquire a hold on all
1354 * the pages needed in the user's address space
1355 *
1356 * we may have already spun some portion of this request
1357 * off as async requests... we need to wait for the I/O
1358 * to complete before returning
1359 */
1360 goto wait_for_writes;
1361 }
1362
1363 /*
1364 * Consider the possibility that upl_size wasn't satisfied.
1365 */
1366 if (upl_size != upl_needed_size)
1367 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1368
1369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1370 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1371
1372 if (io_size == 0) {
1373 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1374 UPL_ABORT_FREE_ON_EMPTY);
1375
1376 /*
1377 * we may have already spun some portion of this request
1378 * off as async requests... we need to wait for the I/O
1379 * to complete before returning
1380 */
1381 goto wait_for_writes;
1382 }
1383 /*
1384 * Now look for pages already in the cache
1385 * and throw them away.
1386 */
1387
1388 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1389 max_io_size = io_size;
1390
1391 while (max_io_size) {
1392 /*
1393 * Flag UPL_POP_DUMP says if the page is found
1394 * in the page cache it must be thrown away.
1395 */
1396 ubc_page_op(vp,
1397 upl_f_offset,
1398 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1399 0, 0);
1400 max_io_size -= PAGE_SIZE_64;
1401 upl_f_offset += PAGE_SIZE_64;
1402 }
1403 /*
1404            * we want to push out these writes asynchronously so that we can overlap
1405 * the preparation of the next I/O
1406 * if there are already too many outstanding writes
1407 * wait until some complete before issuing the next
1408 */
1409 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1410 iostate.io_wanted = 1;
1411 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1412 }
1413 if (iostate.io_error) {
1414 /*
1415 * one of the earlier writes we issued ran into a hard error
1416 * don't issue any more writes, cleanup the UPL
1417 * that was just created but not used, then
1418 * go wait for all writes that are part of this stream
1419 * to complete before returning the error to the caller
1420 */
1421 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1422 UPL_ABORT_FREE_ON_EMPTY);
1423
1424 goto wait_for_writes;
1425 }
1426 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1427
1428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1429 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1430
1431 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1432 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1433
1434 iov->iov_len -= io_size;
1435 iov->iov_base += io_size;
1436 uio->uio_resid -= io_size;
1437 uio->uio_offset += io_size;
1438
1439 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1440 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1441
1442 } /* end while */
1443
1444 wait_for_writes:
1445 /*
1446 * make sure all async writes issued as part of this stream
1447 * have completed before we return
1448 */
1449 while (iostate.io_issued != iostate.io_completed) {
1450 iostate.io_wanted = 1;
1451 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1452 }
1453 if (iostate.io_error)
1454 error = iostate.io_error;
1455
1456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1457 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1458
1459 return (error);
1460 }
1461
1462
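/*
 * added note: cluster_phys_write() handles a single iovec whose source is
 * physically contiguous memory.  any leading or trailing fragment that isn't
 * devblocksize aligned goes through cluster_align_phys_io(); the aligned
 * middle is issued as one synchronous CL_DEV_MEMORY transfer, and the upl is
 * released without committing since no page state needs to change.
 */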
1463 static int
1464 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1465 struct vnode *vp;
1466 struct uio *uio;
1467 off_t newEOF;
1468 int devblocksize;
1469 int flags;
1470 {
1471 upl_page_info_t *pl;
1472 vm_offset_t src_paddr;
1473 upl_t upl;
1474 vm_offset_t upl_offset;
1475 int tail_size;
1476 int io_size;
1477 int upl_size;
1478 int upl_needed_size;
1479 int pages_in_pl;
1480 int upl_flags;
1481 kern_return_t kret;
1482 struct iovec *iov;
1483 int error = 0;
1484
1485 /*
1486 * When we enter this routine, we know
1487 * -- the resid will not exceed iov_len
1488    *  -- the vector target address is physically contiguous
1489 */
1490 cluster_try_push(vp, newEOF, 0, 1);
1491
1492 iov = uio->uio_iov;
1493 io_size = iov->iov_len;
1494 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1495 upl_needed_size = upl_offset + io_size;
1496
1497 pages_in_pl = 0;
1498 upl_size = upl_needed_size;
1499 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1500 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1501
1502 kret = vm_map_get_upl(current_map(),
1503 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1504 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1505
1506 if (kret != KERN_SUCCESS) {
1507 /*
1508 * cluster_phys_write: failed to get pagelist
1509 * note: return kret here
1510 */
1511 return(EINVAL);
1512 }
1513 /*
1514 * Consider the possibility that upl_size wasn't satisfied.
1515 * This is a failure in the physical memory case.
1516 */
1517 if (upl_size < upl_needed_size) {
1518 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1519 return(EINVAL);
1520 }
1521 pl = ubc_upl_pageinfo(upl);
1522
1523 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1524
1525 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1526 int head_size;
1527
1528 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1529
1530 if (head_size > io_size)
1531 head_size = io_size;
1532
1533 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1534
1535 if (error) {
1536 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1537
1538 return(EINVAL);
1539 }
1540 upl_offset += head_size;
1541 src_paddr += head_size;
1542 io_size -= head_size;
1543 }
1544 tail_size = io_size & (devblocksize - 1);
1545 io_size -= tail_size;
1546
1547 if (io_size) {
1548 /*
1549 * issue a synchronous write to cluster_io
1550 */
1551 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1552 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1553 }
1554 if (error == 0) {
1555 /*
1556 * The cluster_io write completed successfully,
1557 * update the uio structure
1558 */
1559 uio->uio_resid -= io_size;
1560 iov->iov_len -= io_size;
1561 iov->iov_base += io_size;
1562 uio->uio_offset += io_size;
1563 src_paddr += io_size;
1564
1565 if (tail_size)
1566 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1567 }
1568 /*
1569 * just release our hold on the physically contiguous
1570 * region without changing any state
1571 */
1572 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1573
1574 return (error);
1575 }
1576
1577
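/*
 * added note: cluster_write_x() is the buffered write path.  each pass maps a
 * upl covering up to MAX_UPL_TRANSFER pages, pre-reads edge pages that are
 * only partially overwritten and not already valid, copies or zero-fills the
 * data (honoring IO_HEADZEROFILL/IO_TAILZEROFILL), and then either pushes the
 * pages immediately (IO_SYNC) or records the dirty page run in the vnode's
 * cluster list for a later cluster_push.
 */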
1578 static int
1579 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1580 struct vnode *vp;
1581 struct uio *uio;
1582 off_t oldEOF;
1583 off_t newEOF;
1584 off_t headOff;
1585 off_t tailOff;
1586 int devblocksize;
1587 int flags;
1588 {
1589 upl_page_info_t *pl;
1590 upl_t upl;
1591 vm_offset_t upl_offset;
1592 int upl_size;
1593 off_t upl_f_offset;
1594 int pages_in_upl;
1595 int start_offset;
1596 int xfer_resid;
1597 int io_size;
1598 int io_flags;
1599 vm_offset_t io_address;
1600 int io_offset;
1601 int bytes_to_zero;
1602 int bytes_to_move;
1603 kern_return_t kret;
1604 int retval = 0;
1605 int uio_resid;
1606 long long total_size;
1607 long long zero_cnt;
1608 off_t zero_off;
1609 long long zero_cnt1;
1610 off_t zero_off1;
1611 daddr_t start_blkno;
1612 daddr_t last_blkno;
1613
1614 if (uio) {
1615 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1616 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1617
1618 uio_resid = uio->uio_resid;
1619 } else {
1620 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1621 0, 0, (int)oldEOF, (int)newEOF, 0);
1622
1623 uio_resid = 0;
1624 }
1625 zero_cnt = 0;
1626 zero_cnt1 = 0;
1627
1628 if (flags & IO_HEADZEROFILL) {
1629 /*
1630 * some filesystems (HFS is one) don't support unallocated holes within a file...
1631 * so we zero fill the intervening space between the old EOF and the offset
1632 * where the next chunk of real data begins.... ftruncate will also use this
1633 * routine to zero fill to the new EOF when growing a file... in this case, the
1634 * uio structure will not be provided
1635 */
1636 if (uio) {
1637 if (headOff < uio->uio_offset) {
1638 zero_cnt = uio->uio_offset - headOff;
1639 zero_off = headOff;
1640 }
1641 } else if (headOff < newEOF) {
1642 zero_cnt = newEOF - headOff;
1643 zero_off = headOff;
1644 }
1645 }
1646 if (flags & IO_TAILZEROFILL) {
1647 if (uio) {
1648 zero_off1 = uio->uio_offset + uio->uio_resid;
1649
1650 if (zero_off1 < tailOff)
1651 zero_cnt1 = tailOff - zero_off1;
1652 }
1653 }
1654 if (zero_cnt == 0 && uio == (struct uio *) 0)
1655 {
1656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1657 retval, 0, 0, 0, 0);
1658 return (0);
1659 }
1660
1661 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1662 /*
1663 * for this iteration of the loop, figure out where our starting point is
1664 */
1665 if (zero_cnt) {
1666 start_offset = (int)(zero_off & PAGE_MASK_64);
1667 upl_f_offset = zero_off - start_offset;
1668 } else if (uio_resid) {
1669 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1670 upl_f_offset = uio->uio_offset - start_offset;
1671 } else {
1672 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1673 upl_f_offset = zero_off1 - start_offset;
1674 }
1675 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1676 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1677
1678 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1679 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1680
1681 /*
1682 * compute the size of the upl needed to encompass
1683 * the requested write... limit each call to cluster_io
1684 * to the maximum UPL size... cluster_io will clip if
1685            * this exceeds the maximum io_size for the device...
1686            * make sure to account for
1687 * a starting offset that's not page aligned
1688 */
1689 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1690
1691 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1692 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1693
1694 pages_in_upl = upl_size / PAGE_SIZE;
1695 io_size = upl_size - start_offset;
1696
1697 if ((long long)io_size > total_size)
1698 io_size = total_size;
1699
1700 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1701 last_blkno = start_blkno + pages_in_upl;
1702
1703 kret = ubc_create_upl(vp,
1704 upl_f_offset,
1705 upl_size,
1706 &upl,
1707 &pl,
1708 UPL_FLAGS_NONE);
1709 if (kret != KERN_SUCCESS)
1710 panic("cluster_write: failed to get pagelist");
1711
1712 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1713 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1714
1715 if (start_offset && !upl_valid_page(pl, 0)) {
1716 int read_size;
1717
1718 /*
1719 * we're starting in the middle of the first page of the upl
1720 * and the page isn't currently valid, so we're going to have
1721 * to read it in first... this is a synchronous operation
1722 */
1723 read_size = PAGE_SIZE;
1724
1725 if ((upl_f_offset + read_size) > newEOF)
1726 read_size = newEOF - upl_f_offset;
1727
1728 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1729 CL_READ, (struct buf *)0, (struct clios *)0);
1730 if (retval) {
1731 /*
1732 * we had an error during the read which causes us to abort
1733 * the current cluster_write request... before we do, we need
1734 * to release the rest of the pages in the upl without modifying
1735                          * their state and mark the failed page in error
1736 */
1737 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1738 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1739
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1741 (int)upl, 0, 0, retval, 0);
1742 break;
1743 }
1744 }
1745 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1746 /*
1747 * the last offset we're writing to in this upl does not end on a page
1748 * boundary... if it's not beyond the old EOF, then we'll also need to
1749 * pre-read this page in if it isn't already valid
1750 */
1751 upl_offset = upl_size - PAGE_SIZE;
1752
1753 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1754 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1755 int read_size;
1756
1757 read_size = PAGE_SIZE;
1758
1759 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1760 read_size = newEOF - (upl_f_offset + upl_offset);
1761
1762 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1763 CL_READ, (struct buf *)0, (struct clios *)0);
1764 if (retval) {
1765 /*
1766 * we had an error during the read which causes us to abort
1767 * the current cluster_write request... before we do, we
1768 * need to release the rest of the pages in the upl without
1769                                  * modifying their state and mark the failed page in error
1770 */
1771 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1772 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1773
1774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1775 (int)upl, 0, 0, retval, 0);
1776 break;
1777 }
1778 }
1779 }
1780 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1781 panic("cluster_write: ubc_upl_map failed\n");
1782 xfer_resid = io_size;
1783 io_offset = start_offset;
1784
1785 while (zero_cnt && xfer_resid) {
1786
1787 if (zero_cnt < (long long)xfer_resid)
1788 bytes_to_zero = zero_cnt;
1789 else
1790 bytes_to_zero = xfer_resid;
1791
1792 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1793 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1794
1795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1796 (int)upl_f_offset + io_offset, bytes_to_zero,
1797 (int)io_offset, xfer_resid, 0);
1798 } else {
1799 int zero_pg_index;
1800
1801 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1802 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1803
1804 if ( !upl_valid_page(pl, zero_pg_index)) {
1805 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1806
1807 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1808 (int)upl_f_offset + io_offset, bytes_to_zero,
1809 (int)io_offset, xfer_resid, 0);
1810
1811 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1812 !upl_dirty_page(pl, zero_pg_index)) {
1813 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1814
1815 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1816 (int)upl_f_offset + io_offset, bytes_to_zero,
1817 (int)io_offset, xfer_resid, 0);
1818 }
1819 }
1820 xfer_resid -= bytes_to_zero;
1821 zero_cnt -= bytes_to_zero;
1822 zero_off += bytes_to_zero;
1823 io_offset += bytes_to_zero;
1824 }
1825 if (xfer_resid && uio_resid) {
1826 bytes_to_move = min(uio_resid, xfer_resid);
1827
1828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1829 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1830
1831 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1832
1833
1834 if (retval) {
1835 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1836 panic("cluster_write: kernel_upl_unmap failed\n");
1837
1838 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1839
1840 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1841 (int)upl, 0, 0, retval, 0);
1842 } else {
1843 uio_resid -= bytes_to_move;
1844 xfer_resid -= bytes_to_move;
1845 io_offset += bytes_to_move;
1846 }
1847 }
1848 while (xfer_resid && zero_cnt1 && retval == 0) {
1849
1850 if (zero_cnt1 < (long long)xfer_resid)
1851 bytes_to_zero = zero_cnt1;
1852 else
1853 bytes_to_zero = xfer_resid;
1854
1855 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1856 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1857
1858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1859 (int)upl_f_offset + io_offset,
1860 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1861 } else {
1862 int zero_pg_index;
1863
1864 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1865 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1866
1867 if ( !upl_valid_page(pl, zero_pg_index)) {
1868 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1869
1870 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1871 (int)upl_f_offset + io_offset,
1872 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1873
1874 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1875 !upl_dirty_page(pl, zero_pg_index)) {
1876 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1877
1878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1879 (int)upl_f_offset + io_offset,
1880 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1881 }
1882 }
1883 xfer_resid -= bytes_to_zero;
1884 zero_cnt1 -= bytes_to_zero;
1885 zero_off1 += bytes_to_zero;
1886 io_offset += bytes_to_zero;
1887 }
1888
1889 if (retval == 0) {
1890 int cl_index;
1891 int can_delay;
1892
1893 io_size += start_offset;
1894
1895 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1896 /*
1897 * if we're extending the file with this write
1898 * we'll zero fill the rest of the page so that
1899 * if the file gets extended again in such a way as to leave a
1900 * hole starting at this EOF, we'll have zeros in the correct spot
1901 */
1902 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1903
1904 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1905 (int)upl_f_offset + io_size,
1906 upl_size - io_size, 0, 0, 0);
1907 }
1908 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1909 panic("cluster_write: ubc_upl_unmap failed\n");
1910
1911 if (flags & IO_SYNC)
1912 /*
1913 * if the IO_SYNC flag is set then we need to
1914 * bypass any clusters and immediately issue
1915 * the I/O
1916 */
1917 goto issue_io;
1918
1919 if (vp->v_clen == 0)
1920 /*
1921 * no clusters currently present
1922 */
1923 goto start_new_cluster;
1924
1925 /*
1926 * keep track of the overall dirty page
1927 * range we've developed
1928 * in case we have to fall back to the
1929 * VHASDIRTY method of flushing
1930 */
1931 if (vp->v_flag & VHASDIRTY)
1932 goto delay_io;
1933
1934 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1935 /*
1936 * we have an existing cluster... see if this write will extend it nicely
1937 */
1938 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1939 /*
1940 * the current write starts at or after the current cluster
1941 */
1942 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1943 /*
1944 * we have a write that fits entirely
1945 * within the existing cluster limits
1946 */
1947 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1948 /*
1949 * update our idea of where the cluster ends
1950 */
1951 vp->v_clusters[cl_index].last_pg = last_blkno;
1952 break;
1953 }
1954 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1955 /*
1956 * we have a write that starts in the middle of the current cluster
1957 * but extends beyond the cluster's limit
1958 * we'll clip the current cluster if we actually
1959 * overlap with the new write
1960 * and start a new cluster with the current write
1961 */
1962 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1963 vp->v_clusters[cl_index].last_pg = start_blkno;
1964 }
1965 /*
1966 * we also get here for the case where the current write starts
1967 * beyond the limit of the existing cluster
1968 *
1969 * in either case, we'll check the remaining clusters before
1970 * starting a new one
1971 */
1972 } else {
1973 /*
1974 * the current write starts in front of the current cluster
1975 */
1976 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1977 /*
1978 * we can just merge the old cluster
1979 * with the new request and leave it
1980 * in the cache
1981 */
1982 vp->v_clusters[cl_index].start_pg = start_blkno;
1983
1984 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1985 /*
1986 * the current write completely
1987 * envelops the existing cluster
1988 */
1989 vp->v_clusters[cl_index].last_pg = last_blkno;
1990 }
1991 break;
1992 }
1993
1994 /*
1995 * if we were to combine this write with the current cluster
1996 * we would exceed the cluster size limit.... so,
1997 * let's see if there's any overlap of the new I/O with
1998 * the existing cluster...
1999 *
2000 */
2001 if (last_blkno > vp->v_clusters[cl_index].start_pg)
2002 /*
2003 * the current write extends into the existing cluster
2004 * clip the current cluster by moving the start position
2005 * to where the current write ends
2006 */
2007 vp->v_clusters[cl_index].start_pg = last_blkno;
2008 /*
2009 * if we get here, there was no way to merge
2010 * the new I/O with this cluster and
2011 * keep it under our maximum cluster length...
2012 * we'll check the remaining clusters before starting a new one
2013 */
2014 }
2015 }
2016 if (cl_index < vp->v_clen)
2017 /*
2018 * we found an existing cluster that we
2019 * could merge this I/O into
2020 */
2021 goto delay_io;
2022
2023 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2024 /*
2025 * we didn't find an existing cluster to
2026 * merge into, but there's room to start
2027 * a new one
2028 */
2029 goto start_new_cluster;
2030
2031 /*
2032 * no existing cluster to merge with and no
2033 * room to start a new one... we'll try
2034 * pushing the existing ones... if none of
2035 * them are able to be pushed, we'll have
2036 * to fall back on the VHASDIRTY mechanism
2037 * cluster_try_push will set v_clen to the
2038 * number of remaining clusters if it is
2039 * unable to push all of them
2040 */
2041 if (vp->v_flag & VNOCACHE_DATA)
2042 can_delay = 0;
2043 else
2044 can_delay = 1;
2045
2046 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2047 vp->v_flag |= VHASDIRTY;
2048 goto delay_io;
2049 }
2050 start_new_cluster:
2051 if (vp->v_clen == 0) {
2052 vp->v_ciosiz = devblocksize;
2053 vp->v_cstart = start_blkno;
2054 vp->v_lastw = last_blkno;
2055 }
2056 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2057 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2058 vp->v_clen++;
2059 delay_io:
2060 /*
2061 * make sure we keep v_cstart and v_lastw up to
2062 * date in case we have to fall back on the
2063 * VHASDIRTY mechanism (or we've already entered it)
2064 */
2065 if (start_blkno < vp->v_cstart)
2066 vp->v_cstart = start_blkno;
2067 if (last_blkno > vp->v_lastw)
2068 vp->v_lastw = last_blkno;
2069
2070 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2071 continue;
2072 issue_io:
2073 /*
2074 * in order to maintain some semblance of coherency with mapped writes
2075 * we need to write the cluster back out as a multiple of the PAGESIZE
2076 * unless the cluster encompasses the last page of the file... in this
2077 * case we'll round out to the nearest device block boundary
2078 */
2079 io_size = upl_size;
2080
2081 if ((upl_f_offset + io_size) > newEOF) {
2082 io_size = newEOF - upl_f_offset;
2083 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2084 }
2085
2086 if (flags & IO_SYNC)
2087 io_flags = CL_COMMIT | CL_AGE;
2088 else
2089 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2090
2091 if (vp->v_flag & VNOCACHE_DATA)
2092 io_flags |= CL_DUMP;
2093
2094 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2095 vp->v_flag |= VTHROTTLED;
2096 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2097 }
2098 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2099 io_flags, (struct buf *)0, (struct clios *)0);
2100 }
2101 }
2102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2103 retval, 0, 0, 0, 0);
2104
2105 return (retval);
2106 }
2107
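/*
 * top level entry point for cluster based reads...
 * decides, one iovec at a time, which strategy to use:
 * the normal cached path (cluster_read_x) unless the vnode is marked
 * VNOCACHE_DATA and the request comes from user space,
 * cluster_phys_read if the target buffer is physically contiguous,
 * and cluster_nocopy_read for large page aligned uncached requests...
 * small or unaligned pieces are clipped and fed to cluster_read_x
 */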
2108 int
2109 cluster_read(vp, uio, filesize, devblocksize, flags)
2110 struct vnode *vp;
2111 struct uio *uio;
2112 off_t filesize;
2113 int devblocksize;
2114 int flags;
2115 {
2116 int prev_resid;
2117 int clip_size;
2118 off_t max_io_size;
2119 struct iovec *iov;
2120 vm_offset_t upl_offset;
2121 int upl_size;
2122 int pages_in_pl;
2123 upl_page_info_t *pl;
2124 int upl_flags;
2125 upl_t upl;
2126 int retval = 0;
2127
2128 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2129 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2130
2131 /*
2132 * the nocopy and contiguous paths are only considered when the vnode
2133 * has VNOCACHE_DATA set and the request comes from user space...
2134 * everything else takes the normal cached read path
2135 */
2135
2136 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2137 {
2138 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2140 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2141 return(retval);
2142 }
2143
2144 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2145 {
2146 /* we know we have a resid, so this is safe */
2147 iov = uio->uio_iov;
2148 while (iov->iov_len == 0) {
2149 uio->uio_iov++;
2150 uio->uio_iovcnt--;
2151 iov = uio->uio_iov;
2152 }
2153
2154 /*
2155 * We check every vector target and if it is physically
2156 * contiguous space, we skip the sanity checks.
2157 */
2158
2159 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2160 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2161 pages_in_pl = 0;
2162 upl_flags = UPL_QUERY_OBJECT_TYPE;
2163 if((vm_map_get_upl(current_map(),
2164 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2165 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2166 {
2167 /*
2168 * the user app must have passed in an invalid address
2169 */
2170 return (EFAULT);
2171 }
2172
2173 if (upl_flags & UPL_PHYS_CONTIG)
2174 {
2175 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2176 }
2177 else if (uio->uio_resid < 4 * PAGE_SIZE)
2178 {
2179 /*
2180 * We set a threshold of 4 pages to decide if the nocopy
2181 * read loop is worth the trouble...
2182 */
2183 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2184 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2185 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2186 return(retval);
2187 }
2188 else if (uio->uio_offset & PAGE_MASK_64)
2189 {
2190 /* Bring the file offset read up to a pagesize boundary */
2191 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2192 if (uio->uio_resid < clip_size)
2193 clip_size = uio->uio_resid;
2194 /*
2195 * Fake the resid going into the cluster_read_x call
2196 * and restore it on the way out.
2197 */
2198 prev_resid = uio->uio_resid;
2199 uio->uio_resid = clip_size;
2200 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2201 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2202 }
2203 else if ((int)iov->iov_base & PAGE_MASK_64)
2204 {
2205 clip_size = iov->iov_len;
2206 prev_resid = uio->uio_resid;
2207 uio->uio_resid = clip_size;
2208 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2209 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2210 }
2211 else
2212 {
2213 /*
2214 * If we come in here, we know the offset into
2215 * the file is on a pagesize boundary
2216 */
2217
2218 max_io_size = filesize - uio->uio_offset;
2219 clip_size = uio->uio_resid;
2220 if (iov->iov_len < clip_size)
2221 clip_size = iov->iov_len;
2222 if (max_io_size < clip_size)
2223 clip_size = (int)max_io_size;
2224
2225 if (clip_size < PAGE_SIZE)
2226 {
2227 /*
2228 * Take care of the tail end of the read in this vector.
2229 */
2230 prev_resid = uio->uio_resid;
2231 uio->uio_resid = clip_size;
2232 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2233 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2234 }
2235 else
2236 {
2237 /* round clip_size down to a multiple of pagesize */
2238 clip_size = clip_size & ~(PAGE_MASK);
2239 prev_resid = uio->uio_resid;
2240 uio->uio_resid = clip_size;
2241 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2242 if ((retval==0) && uio->uio_resid)
2243 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2244 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2245 }
2246 } /* end else */
2247 } /* end while */
2248
2249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2250 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2251
2252 return(retval);
2253 }
2254
2255
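/*
 * normal buffered read path...
 * pages already resident in the cache are copied straight out via
 * ubc_page_op... otherwise a UPL is created over the request, any
 * invalid pages are filled with a single synchronous cluster_io,
 * the data is copied to the caller, and a prefetch or read-ahead is
 * issued if the access pattern looks sequential
 */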
2256 static int
2257 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2258 struct vnode *vp;
2259 struct uio *uio;
2260 off_t filesize;
2261 int devblocksize;
2262 int flags;
2263 {
2264 upl_page_info_t *pl;
2265 upl_t upl;
2266 vm_offset_t upl_offset;
2267 int upl_size;
2268 off_t upl_f_offset;
2269 int start_offset;
2270 int start_pg;
2271 int last_pg;
2272 int uio_last;
2273 int pages_in_upl;
2274 off_t max_size;
2275 int io_size;
2276 vm_offset_t io_address;
2277 kern_return_t kret;
2278 int segflg;
2279 int error = 0;
2280 int retval = 0;
2281 int b_lblkno;
2282 int e_lblkno;
2283
2284 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2285
2286 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2287 /*
2288 * compute the size of the upl needed to encompass
2289 * the requested read... limit each call to cluster_io
2290 * to the maximum UPL size... cluster_io will clip if
2291 * this exceeds the maximum io_size for the device...
2292 * make sure to account for a starting offset
2293 * that's not page aligned
2294 */
2295 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2296 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2297 max_size = filesize - uio->uio_offset;
2298
2299 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2300 io_size = uio->uio_resid;
2301 else
2302 io_size = max_size;
2303
2304 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2305 segflg = uio->uio_segflg;
2306
2307 uio->uio_segflg = UIO_PHYS_USERSPACE;
2308
2309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2310 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2311
2312 while (io_size && retval == 0) {
2313 int xsize;
2314 vm_offset_t paddr;
2315
2316 if (ubc_page_op(vp,
2317 upl_f_offset,
2318 UPL_POP_SET | UPL_POP_BUSY,
2319 &paddr, 0) != KERN_SUCCESS)
2320 break;
2321
2322 xsize = PAGE_SIZE - start_offset;
2323
2324 if (xsize > io_size)
2325 xsize = io_size;
2326
2327 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2328
2329 ubc_page_op(vp, upl_f_offset,
2330 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2331
2332 io_size -= xsize;
2333 start_offset = (int)
2334 (uio->uio_offset & PAGE_MASK_64);
2335 upl_f_offset = uio->uio_offset - start_offset;
2336 }
2337 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2338 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2339
2340 uio->uio_segflg = segflg;
2341
2342 if (retval)
2343 break;
2344
2345 if (io_size == 0) {
2346 /*
2347 * we're already finished with this read request
2348 * let's see if we should do a read-ahead
2349 */
2350 e_lblkno = (int)
2351 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2352
2353 if (!(vp->v_flag & VRAOFF))
2354 /*
2355 * let's try to read ahead if we're in
2356 * a sequential access pattern
2357 */
2358 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2359 vp->v_lastr = e_lblkno;
2360
2361 break;
2362 }
2363 max_size = filesize - uio->uio_offset;
2364 }
2365 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2366 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2367 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2368 pages_in_upl = upl_size / PAGE_SIZE;
2369
2370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2371 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2372
2373 kret = ubc_create_upl(vp,
2374 upl_f_offset,
2375 upl_size,
2376 &upl,
2377 &pl,
2378 UPL_FLAGS_NONE);
2379 if (kret != KERN_SUCCESS)
2380 panic("cluster_read: failed to get pagelist");
2381
2382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2383 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2384
2385 /*
2386 * scan from the beginning of the upl looking for the first
2387 * non-valid page.... this will become the first page in
2388 * the request we're going to make to 'cluster_io'... if all
2389 * of the pages are valid, we won't call through to 'cluster_io'
2390 */
2391 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2392 if (!upl_valid_page(pl, start_pg))
2393 break;
2394 }
2395
2396 /*
2397 * scan from the starting invalid page looking for a valid
2398 * page before the end of the upl is reached, if we
2399 * find one, then it will be the last page of the request to
2400 * 'cluster_io'
2401 */
2402 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2403 if (upl_valid_page(pl, last_pg))
2404 break;
2405 }
2406
2407 if (start_pg < last_pg) {
2408 /*
2409 * we found a range of 'invalid' pages that must be filled
2410 * if the last page in this range is the last page of the file
2411 * we may have to clip the size of it to keep from reading past
2412 * the end of the last physical block associated with the file
2413 */
2414 upl_offset = start_pg * PAGE_SIZE;
2415 io_size = (last_pg - start_pg) * PAGE_SIZE;
2416
2417 if ((upl_f_offset + upl_offset + io_size) > filesize)
2418 io_size = filesize - (upl_f_offset + upl_offset);
2419
2420 /*
2421 * issue a synchronous read to cluster_io
2422 */
2423
2424 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2425 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2426 }
2427 if (error == 0) {
2428 /*
2429 * if the read completed successfully, or there was no I/O request
2430 * issued, then map the upl into kernel address space and
2431 * move the data into user land.... we'll first add on any 'valid'
2432 * pages that were present in the upl when we acquired it.
2433 */
2434 u_int val_size;
2435 u_int size_of_prefetch;
2436
2437 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2438 if (!upl_valid_page(pl, uio_last))
2439 break;
2440 }
2441 /*
2442 * compute size to transfer this round... if uio->uio_resid is
2443 * still non-zero after this uiomove, we'll loop around and
2444 * set up for another I/O.
2445 */
2446 val_size = (uio_last * PAGE_SIZE) - start_offset;
2447
2448 if (max_size < val_size)
2449 val_size = max_size;
2450
2451 if (uio->uio_resid < val_size)
2452 val_size = uio->uio_resid;
2453
2454 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2455
2456 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2457 /*
2458 * if there's still I/O left to do for this request, then issue a
2459 * pre-fetch I/O... the I/O wait time will overlap
2460 * with the copying of the data
2461 */
2462 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2463 } else {
2464 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2465 /*
2466 * let's try to read ahead if we're in
2467 * a sequential access pattern
2468 */
2469 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2470 vp->v_lastr = e_lblkno;
2471 }
2472 if (uio->uio_segflg == UIO_USERSPACE) {
2473 int offset;
2474
2475 segflg = uio->uio_segflg;
2476
2477 uio->uio_segflg = UIO_PHYS_USERSPACE;
2478
2479
2480 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2481 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2482
2483 offset = start_offset;
2484
2485 while (val_size && retval == 0) {
2486 int csize;
2487 int i;
2488 caddr_t paddr;
2489
2490 i = offset / PAGE_SIZE;
2491 csize = min(PAGE_SIZE - start_offset, val_size);
2492
2493 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2494
2495 retval = uiomove(paddr, csize, uio);
2496
2497 val_size -= csize;
2498 offset += csize;
2499 start_offset = offset & PAGE_MASK;
2500 }
2501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2502 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2503
2504 uio->uio_segflg = segflg;
2505 }
2506 else
2507 {
2508 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2509 panic("cluster_read: ubc_upl_map() failed\n");
2510
2511 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2512
2513 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2514 panic("cluster_read: ubc_upl_unmap() failed\n");
2515 }
2516 }
2517 if (start_pg < last_pg) {
2518 /*
2519 * compute the range of pages that we actually issued an I/O for
2520 * and either commit them as valid if the I/O succeeded
2521 * or abort them if the I/O failed
2522 */
2523 io_size = (last_pg - start_pg) * PAGE_SIZE;
2524
2525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2526 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2527
2528 if (error || (vp->v_flag & VNOCACHE_DATA))
2529 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2530 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2531 else
2532 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2533 UPL_COMMIT_CLEAR_DIRTY
2534 | UPL_COMMIT_FREE_ON_EMPTY
2535 | UPL_COMMIT_INACTIVATE);
2536
2537 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2538 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2539 }
2540 if ((last_pg - start_pg) < pages_in_upl) {
2541 int cur_pg;
2542 int commit_flags;
2543
2544 /*
2545 * the set of pages that we issued an I/O for did not encompass
2546 * the entire upl... so just release these without modifying
2547 * their state
2548 */
2549 if (error)
2550 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2551 else {
2552 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2553 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2554
2555 if (start_pg) {
2556 /*
2557 * we found some already valid pages at the beginning of
2558 * the upl... commit these back to the inactive list with
2559 * reference cleared
2560 */
2561 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2562 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2563 | UPL_COMMIT_INACTIVATE;
2564
2565 if (upl_dirty_page(pl, cur_pg))
2566 commit_flags |= UPL_COMMIT_SET_DIRTY;
2567
2568 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2569 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2570 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2571 else
2572 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2573 PAGE_SIZE, commit_flags);
2574 }
2575 }
2576 if (last_pg < uio_last) {
2577 /*
2578 * we found some already valid pages immediately after the
2579 * pages we issued I/O for, commit these back to the
2580 * inactive list with reference cleared
2581 */
2582 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2583 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2584 | UPL_COMMIT_INACTIVATE;
2585
2586 if (upl_dirty_page(pl, cur_pg))
2587 commit_flags |= UPL_COMMIT_SET_DIRTY;
2588
2589 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2590 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2591 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2592 else
2593 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2594 PAGE_SIZE, commit_flags);
2595 }
2596 }
2597 if (uio_last < pages_in_upl) {
2598 /*
2599 * there were some invalid pages beyond the valid pages
2600 * that we didn't issue an I/O for, just release them
2601 * unchanged
2602 */
2603 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2604 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2605 }
2606
2607 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2608 (int)upl, -1, -1, 0, 0);
2609 }
2610 }
2611 if (retval == 0)
2612 retval = error;
2613 }
2614
2615 return (retval);
2616 }
2617
2618
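/*
 * uncached read path... data is transferred directly into the user's
 * buffer without a copy through the kernel... any pages already in
 * the cache are copied out first, then the user pages are wired via
 * vm_map_get_upl and reads are issued asynchronously against them,
 * with 'iostate' tracking completion of the whole stream
 */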
2619 static int
2620 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2621 struct vnode *vp;
2622 struct uio *uio;
2623 off_t filesize;
2624 int devblocksize;
2625 int flags;
2626 {
2627 upl_t upl;
2628 upl_page_info_t *pl;
2629 off_t upl_f_offset;
2630 vm_offset_t upl_offset;
2631 off_t start_upl_f_offset;
2632 off_t max_io_size;
2633 int io_size;
2634 int upl_size;
2635 int upl_needed_size;
2636 int pages_in_pl;
2637 vm_offset_t paddr;
2638 int upl_flags;
2639 kern_return_t kret;
2640 int segflg;
2641 struct iovec *iov;
2642 int i;
2643 int force_data_sync;
2644 int retval = 0;
2645 int first = 1;
2646 struct clios iostate;
2647
2648 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2649 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2650
2651 /*
2652 * When we enter this routine, we know
2653 * -- the offset into the file is on a pagesize boundary
2654 * -- the resid is a page multiple
2655 * -- the resid will not exceed iov_len
2656 */
2657
2658 iostate.io_completed = 0;
2659 iostate.io_issued = 0;
2660 iostate.io_error = 0;
2661 iostate.io_wanted = 0;
2662
2663 iov = uio->uio_iov;
2664
2665 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2666
2667 max_io_size = filesize - uio->uio_offset;
2668
2669 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2670 io_size = max_io_size;
2671 else
2672 io_size = uio->uio_resid;
2673
2674 /*
2675 * We don't come into this routine unless
2676 * UIO_USERSPACE is set.
2677 */
2678 segflg = uio->uio_segflg;
2679
2680 uio->uio_segflg = UIO_PHYS_USERSPACE;
2681
2682 /*
2683 * First look for pages already in the cache
2684 * and move them to user space.
2685 */
2686 while (io_size && (retval == 0)) {
2687 upl_f_offset = uio->uio_offset;
2688
2689 /*
2690 * If this call fails, it means the page is not
2691 * in the page cache.
2692 */
2693 if (ubc_page_op(vp, upl_f_offset,
2694 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2695 break;
2696
2697 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2698
2699 ubc_page_op(vp, upl_f_offset,
2700 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2701
2702 io_size -= PAGE_SIZE;
2703 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2704 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2705 }
2706 uio->uio_segflg = segflg;
2707
2708 if (retval) {
2709 /*
2710 * we may have already spun some portion of this request
2711 * off as async requests... we need to wait for the I/O
2712 * to complete before returning
2713 */
2714 goto wait_for_reads;
2715 }
2716 /*
2717 * If we are already finished with this read, then return
2718 */
2719 if (io_size == 0) {
2720 /*
2721 * we may have already spun some portion of this request
2722 * off as async requests... we need to wait for the I/O
2723 * to complete before returning
2724 */
2725 goto wait_for_reads;
2726 }
2727 max_io_size = io_size;
2728
2729 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2730 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
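/*
 * on the first pass, a large request is clamped to 1/8 of the
 * maximum UPL size (note the test is against 1/4)... presumably
 * this gets the first chunk of data back to the caller sooner,
 * with subsequent passes using full sized transfers
 */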
2731 if (first) {
2732 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2733 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2734 first = 0;
2735 }
2736 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2737 upl_f_offset = start_upl_f_offset;
2738 io_size = 0;
2739
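/*
 * scan forward from the current file offset, one page at a time,
 * extending the request for as long as the pages are absent from
 * the cache... stop at the first resident page so this pass only
 * reads data that isn't already cached
 */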
2740 while (io_size < max_io_size) {
2741 if (ubc_page_op(vp, upl_f_offset,
2742 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2743 ubc_page_op(vp, upl_f_offset,
2744 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2745 break;
2746 }
2747 /*
2748 * Build up the io request parameters.
2749 */
2750 io_size += PAGE_SIZE_64;
2751 upl_f_offset += PAGE_SIZE_64;
2752 }
2753 if (io_size == 0)
2754 /*
2755 * we may have already spun some portion of this request
2756 * off as async requests... we need to wait for the I/O
2757 * to complete before returning
2758 */
2759 goto wait_for_reads;
2760
2761 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2762 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2763
2764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2765 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2766
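/*
 * try up to 3 times to get a UPL in which every page of the user's
 * buffer is valid, escalating force_data_sync on each attempt...
 * if any page comes back invalid, the UPL is aborted and the map
 * is queried again
 */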
2767 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2768 pages_in_pl = 0;
2769 upl_size = upl_needed_size;
2770 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2771
2772 kret = vm_map_get_upl(current_map(),
2773 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2774 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2775
2776 if (kret != KERN_SUCCESS) {
2777 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2778 (int)upl_offset, upl_size, io_size, kret, 0);
2779
2780 /*
2781 * cluster_nocopy_read: failed to get pagelist
2782 *
2783 * we may have already spun some portion of this request
2784 * off as async requests... we need to wait for the I/O
2785 * to complete before returning
2786 */
2787 goto wait_for_reads;
2788 }
2789 pages_in_pl = upl_size / PAGE_SIZE;
2790 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2791
2792 for (i = 0; i < pages_in_pl; i++) {
2793 if (!upl_valid_page(pl, i))
2794 break;
2795 }
2796 if (i == pages_in_pl)
2797 break;
2798
2799 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2800 UPL_ABORT_FREE_ON_EMPTY);
2801 }
2802 if (force_data_sync >= 3) {
2803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2804 (int)upl_offset, upl_size, io_size, kret, 0);
2805
2806 goto wait_for_reads;
2807 }
2808 /*
2809 * Consider the possibility that upl_size wasn't satisfied.
2810 */
2811 if (upl_size != upl_needed_size)
2812 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2813
2814 if (io_size == 0) {
2815 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2816 UPL_ABORT_FREE_ON_EMPTY);
2817 goto wait_for_reads;
2818 }
2819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2820 (int)upl_offset, upl_size, io_size, kret, 0);
2821
2822 /*
2823 * request asynchronously so that we can overlap
2824 * the preparation of the next I/O
2825 * if there are already too many outstanding reads
2826 * wait until some have completed before issuing the next read
2827 */
2828 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2829 iostate.io_wanted = 1;
2830 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2831 }
2832 if (iostate.io_error) {
2833 /*
2834 * one of the earlier reads we issued ran into a hard error
2835 * don't issue any more reads, cleanup the UPL
2836 * that was just created but not used, then
2837 * go wait for any other reads to complete before
2838 * returning the error to the caller
2839 */
2840 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2841 UPL_ABORT_FREE_ON_EMPTY);
2842
2843 goto wait_for_reads;
2844 }
2845 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2846 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2847
2848 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2849 io_size, devblocksize,
2850 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2851 (struct buf *)0, &iostate);
2852
2853 /*
2854 * update the uio structure
2855 */
2856 iov->iov_base += io_size;
2857 iov->iov_len -= io_size;
2858 uio->uio_resid -= io_size;
2859 uio->uio_offset += io_size;
2860
2861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2862 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2863
2864 } /* end while */
2865
2866 wait_for_reads:
2867 /*
2868 * make sure all async reads that are part of this stream
2869 * have completed before we return
2870 */
2871 while (iostate.io_issued != iostate.io_completed) {
2872 iostate.io_wanted = 1;
2873 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2874 }
2875 if (iostate.io_error)
2876 retval = iostate.io_error;
2877
2878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2879 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2880
2881 return (retval);
2882 }
2883
2884
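/*
 * read into a physically contiguous user buffer...
 * any head or tail fragment that isn't a multiple of the device
 * block size is handled by cluster_align_phys_io, while the block
 * aligned middle is issued asynchronously in MAX_UPL_TRANSFER sized
 * chunks directly against the wired user pages... the UPL is simply
 * released at the end since no page state needs to change
 */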
2885 static int
2886 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2887 struct vnode *vp;
2888 struct uio *uio;
2889 off_t filesize;
2890 int devblocksize;
2891 int flags;
2892 {
2893 upl_page_info_t *pl;
2894 upl_t upl;
2895 vm_offset_t upl_offset;
2896 vm_offset_t dst_paddr;
2897 off_t max_size;
2898 int io_size;
2899 int tail_size;
2900 int upl_size;
2901 int upl_needed_size;
2902 int pages_in_pl;
2903 int upl_flags;
2904 kern_return_t kret;
2905 struct iovec *iov;
2906 struct clios iostate;
2907 int error;
2908
2909 /*
2910 * When we enter this routine, we know
2911 * -- the resid will not exceed iov_len
2912 * -- the target address is physically contiguous
2913 */
2914
2915 iov = uio->uio_iov;
2916
2917 max_size = filesize - uio->uio_offset;
2918
2919 if (max_size > (off_t)((unsigned int)iov->iov_len))
2920 io_size = iov->iov_len;
2921 else
2922 io_size = max_size;
2923
2924 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2925 upl_needed_size = upl_offset + io_size;
2926
2927 error = 0;
2928 pages_in_pl = 0;
2929 upl_size = upl_needed_size;
2930 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2931
2932 kret = vm_map_get_upl(current_map(),
2933 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2934 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2935
2936 if (kret != KERN_SUCCESS) {
2937 /*
2938 * cluster_phys_read: failed to get pagelist
2939 */
2940 return(EINVAL);
2941 }
2942 if (upl_size < upl_needed_size) {
2943 /*
2944 * The upl_size wasn't satisfied.
2945 */
2946 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2947
2948 return(EINVAL);
2949 }
2950 pl = ubc_upl_pageinfo(upl);
2951
2952 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2953
2954 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2955 int head_size;
2956
2957 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2958
2959 if (head_size > io_size)
2960 head_size = io_size;
2961
2962 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2963
2964 if (error) {
2965 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2966
2967 return(EINVAL);
2968 }
2969 upl_offset += head_size;
2970 dst_paddr += head_size;
2971 io_size -= head_size;
2972 }
2973 tail_size = io_size & (devblocksize - 1);
2974 io_size -= tail_size;
2975
2976 iostate.io_completed = 0;
2977 iostate.io_issued = 0;
2978 iostate.io_error = 0;
2979 iostate.io_wanted = 0;
2980
2981 while (io_size && error == 0) {
2982 int xsize;
2983
2984 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2985 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2986 else
2987 xsize = io_size;
2988 /*
2989 * request asynchronously so that we can overlap
2990 * the preparation of the next I/O... we'll do
2991 * the commit after all the I/O has completed
2992 * since it's all issued against the same UPL
2993 * if there are already too many outstanding reads
2994 * wait until some have completed before issuing the next
2995 */
2996 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2997 iostate.io_wanted = 1;
2998 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2999 }
3000
3001 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3002 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3003 (struct buf *)0, &iostate);
3004 /*
3005 * The cluster_io read was issued successfully,
3006 * update the uio structure
3007 */
3008 if (error == 0) {
3009 uio->uio_resid -= xsize;
3010 iov->iov_len -= xsize;
3011 iov->iov_base += xsize;
3012 uio->uio_offset += xsize;
3013 dst_paddr += xsize;
3014 upl_offset += xsize;
3015 io_size -= xsize;
3016 }
3017 }
3018 /*
3019 * make sure all async reads that are part of this stream
3020 * have completed before we proceed
3021 */
3022 while (iostate.io_issued != iostate.io_completed) {
3023 iostate.io_wanted = 1;
3024 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3025 }
3026 if (iostate.io_error) {
3027 error = iostate.io_error;
3028 }
3029 if (error == 0 && tail_size)
3030 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3031
3032 /*
3033 * just release our hold on the physically contiguous
3034 * region without changing any state
3035 */
3036 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3037
3038 return (error);
3039 }
3040
3041
3042 /*
3043 * generate advisory I/O's in the largest chunks possible
3044 * the completed pages will be released into the VM cache
3045 */
3046 int
3047 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3048 struct vnode *vp;
3049 off_t filesize;
3050 off_t f_offset;
3051 int resid;
3052 int devblocksize;
3053 {
3054 upl_page_info_t *pl;
3055 upl_t upl;
3056 vm_offset_t upl_offset;
3057 int upl_size;
3058 off_t upl_f_offset;
3059 int start_offset;
3060 int start_pg;
3061 int last_pg;
3062 int pages_in_upl;
3063 off_t max_size;
3064 int io_size;
3065 kern_return_t kret;
3066 int retval = 0;
3067 int issued_io;
3068
3069 if (!UBCINFOEXISTS(vp))
3070 return(EINVAL);
3071
3072 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3073 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3074
3075 while (resid && f_offset < filesize && retval == 0) {
3076 /*
3077 * compute the size of the upl needed to encompass
3078 * the requested read... limit each call to cluster_io
3079 * to the maximum UPL size... cluster_io will clip if
3080 * this exceeds the maximum io_size for the device...
3081 * make sure to account for a starting offset
3082 * that's not page aligned
3083 */
3084 start_offset = (int)(f_offset & PAGE_MASK_64);
3085 upl_f_offset = f_offset - (off_t)start_offset;
3086 max_size = filesize - f_offset;
3087
3088 if (resid < max_size)
3089 io_size = resid;
3090 else
3091 io_size = max_size;
3092
3093 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3094 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3095 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3096 pages_in_upl = upl_size / PAGE_SIZE;
3097
3098 kret = ubc_create_upl(vp,
3099 upl_f_offset,
3100 upl_size,
3101 &upl,
3102 &pl,
3103 UPL_RET_ONLY_ABSENT);
3104 if (kret != KERN_SUCCESS)
3105 return(retval);
3106 issued_io = 0;
3107
3108 /*
3109 * before we start marching forward, we must make sure we end on
3110 * a present page, otherwise we will be working with a freed
3111 * upl
3112 */
3113 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3114 if (upl_page_present(pl, last_pg))
3115 break;
3116 }
3117 pages_in_upl = last_pg + 1;
3118
3119
3120 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3121 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3122
3123
3124 for (last_pg = 0; last_pg < pages_in_upl; ) {
3125 /*
3126 * scan from the beginning of the upl looking for the first
3127 * page that is present.... this will become the first page in
3128 * the request we're going to make to 'cluster_io'... if all
3129 * of the pages are absent, we won't call through to 'cluster_io'
3130 */
3131 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3132 if (upl_page_present(pl, start_pg))
3133 break;
3134 }
3135
3136 /*
3137 * scan from the starting present page looking for an absent
3138 * page before the end of the upl is reached, if we
3139 * find one, then it will terminate the range of pages being
3140 * presented to 'cluster_io'
3141 */
3142 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3143 if (!upl_page_present(pl, last_pg))
3144 break;
3145 }
3146
3147 if (last_pg > start_pg) {
3148 /*
3149 * we found a range of pages that must be filled
3150 * if the last page in this range is the last page of the file
3151 * we may have to clip the size of it to keep from reading past
3152 * the end of the last physical block associated with the file
3153 */
3154 upl_offset = start_pg * PAGE_SIZE;
3155 io_size = (last_pg - start_pg) * PAGE_SIZE;
3156
3157 if ((upl_f_offset + upl_offset + io_size) > filesize)
3158 io_size = filesize - (upl_f_offset + upl_offset);
3159
3160 /*
3161 * issue an asynchronous read to cluster_io
3162 */
3163 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3164 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3165
3166 issued_io = 1;
3167 }
3168 }
3169 if (issued_io == 0)
3170 ubc_upl_abort(upl, 0);
3171
3172 io_size = upl_size - start_offset;
3173
3174 if (io_size > resid)
3175 io_size = resid;
3176 f_offset += io_size;
3177 resid -= io_size;
3178 }
3179
3180 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3181 (int)f_offset, resid, retval, 0, 0);
3182
3183 return(retval);
3184 }
3185
3186
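/*
 * flush any delayed-write clusters held on the vnode...
 * if we've fallen back to the VHASDIRTY mechanism, the entire dirty
 * range (v_cstart to v_lastw) is pushed in MAX_UPL_TRANSFER sized
 * pieces... otherwise cluster_try_push is asked to push all of the
 * currently recorded clusters
 */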
3187 int
3188 cluster_push(vp)
3189 struct vnode *vp;
3190 {
3191 int retval;
3192
3193 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3194 vp->v_flag &= ~VHASDIRTY;
3195 return(0);
3196 }
3197
3198 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3199 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3200
3201 if (vp->v_flag & VHASDIRTY) {
3202 daddr_t start_pg;
3203 daddr_t last_pg;
3204 daddr_t end_pg;
3205
3206 start_pg = vp->v_cstart;
3207 end_pg = vp->v_lastw;
3208
3209 vp->v_flag &= ~VHASDIRTY;
3210 vp->v_clen = 0;
3211
3212 while (start_pg < end_pg) {
3213 last_pg = start_pg + MAX_UPL_TRANSFER;
3214
3215 if (last_pg > end_pg)
3216 last_pg = end_pg;
3217
3218 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3219
3220 start_pg = last_pg;
3221 }
3222 return (1);
3223 }
3224 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3225
3226 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3227 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3228
3229 return (retval);
3230 }
3231
3232
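/*
 * sort a local copy of the vnode's clusters by starting page and try
 * to push each one via cluster_push_x... if push_all is zero we stop
 * after the first cluster that gets pushed... any clusters that
 * couldn't be pushed are merged back into v_clusters if there's room,
 * otherwise the vnode falls back to the VHASDIRTY mechanism...
 * returns the number of free cluster slots remaining
 */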
3233 static int
3234 cluster_try_push(vp, EOF, can_delay, push_all)
3235 struct vnode *vp;
3236 off_t EOF;
3237 int can_delay;
3238 int push_all;
3239 {
3240 int cl_index;
3241 int cl_index1;
3242 int min_index;
3243 int cl_len;
3244 int cl_total;
3245 int cl_pushed;
3246 struct v_cluster l_clusters[MAX_CLUSTERS];
3247
3248 /*
3249 * make a local 'sorted' copy of the clusters
3250 * and clear vp->v_clen so that new clusters can
3251 * be developed
3252 */
3253 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3254 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3255 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3256 continue;
3257 if (min_index == -1)
3258 min_index = cl_index1;
3259 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3260 min_index = cl_index1;
3261 }
3262 if (min_index == -1)
3263 break;
3264 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3265 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3266
3267 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3268 }
3269 cl_len = cl_index;
3270 vp->v_clen = 0;
3271
3272 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3273 /*
3274 * try to push each cluster in turn... cluster_push_x may not
3275 * push the cluster if can_delay is TRUE and the cluster doesn't
3276 * meet the criteria for an immediate push
3277 */
3278 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3279 l_clusters[cl_index].start_pg = 0;
3280 l_clusters[cl_index].last_pg = 0;
3281
3282 cl_pushed++;
3283
3284 if (push_all == 0)
3285 break;
3286 }
3287 }
3288 if (cl_len > cl_pushed) {
3289 /*
3290 * we didn't push all of the clusters, so
3291 * lets try to merge them back in to the vnode
3292 */
3293 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3294 /*
3295 * we picked up some new clusters while we were trying to
3296 * push the old ones (I don't think this can happen because
3297 * I'm holding the lock, but just in case)... the sum of the
3298 * leftovers plus the new cluster count exceeds our ability
3299 * to represent them, so fall back to the VHASDIRTY mechanism
3300 */
3301 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3302 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3303 continue;
3304
3305 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3306 vp->v_cstart = l_clusters[cl_index].start_pg;
3307 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3308 vp->v_lastw = l_clusters[cl_index].last_pg;
3309 }
3310 vp->v_flag |= VHASDIRTY;
3311 } else {
3312 /*
3313 * we've got room to merge the leftovers back in
3314 * just append them starting at the next 'hole'
3315 * represented by vp->v_clen
3316 */
3317 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3318 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3319 continue;
3320
3321 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3322 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3323
3324 if (cl_index1 == 0) {
3325 vp->v_cstart = l_clusters[cl_index].start_pg;
3326 vp->v_lastw = l_clusters[cl_index].last_pg;
3327 } else {
3328 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3329 vp->v_cstart = l_clusters[cl_index].start_pg;
3330 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3331 vp->v_lastw = l_clusters[cl_index].last_pg;
3332 }
3333 cl_index1++;
3334 }
3335 /*
3336 * update the cluster count
3337 */
3338 vp->v_clen = cl_index1;
3339 }
3340 }
3341 return(MAX_CLUSTERS - vp->v_clen);
3342 }
3343
3344
3345
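/*
 * push the dirty pages in the page range [first, last) out to disk...
 * if can_delay is set and the cluster is less than half the maximum
 * size, or less than half of its pages are dirty, the push is
 * declined (return 0) so the cluster can continue to grow...
 * otherwise each run of valid dirty pages is written via cluster_io
 * and 1 is returned
 */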
3346 static int
3347 cluster_push_x(vp, EOF, first, last, can_delay)
3348 struct vnode *vp;
3349 off_t EOF;
3350 daddr_t first;
3351 daddr_t last;
3352 int can_delay;
3353 {
3354 upl_page_info_t *pl;
3355 upl_t upl;
3356 vm_offset_t upl_offset;
3357 int upl_size;
3358 off_t upl_f_offset;
3359 int pages_in_upl;
3360 int start_pg;
3361 int last_pg;
3362 int io_size;
3363 int io_flags;
3364 int size;
3365 kern_return_t kret;
3366
3367
3368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3369 vp->v_clen, first, last, EOF, 0);
3370
3371 if ((pages_in_upl = last - first) == 0) {
3372 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3373
3374 return (1);
3375 }
3376 upl_size = pages_in_upl * PAGE_SIZE;
3377 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3378
3379 if (upl_f_offset + upl_size >= EOF) {
3380
3381 if (upl_f_offset >= EOF) {
3382 /*
3383 * must have truncated the file and missed
3384 * clearing a dangling cluster (i.e. it's completely
3385 * beyond the new EOF)
3386 */
3387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3388
3389 return(1);
3390 }
3391 size = EOF - upl_f_offset;
3392
3393 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3394 pages_in_upl = upl_size / PAGE_SIZE;
3395 } else {
3396 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3397 return(0);
3398 size = upl_size;
3399 }
3400 kret = ubc_create_upl(vp,
3401 upl_f_offset,
3402 upl_size,
3403 &upl,
3404 &pl,
3405 UPL_RET_ONLY_DIRTY);
3406 if (kret != KERN_SUCCESS)
3407 panic("cluster_push: failed to get pagelist");
3408
3409 if (can_delay) {
3410 int num_of_dirty;
3411
3412 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3413 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3414 num_of_dirty++;
3415 }
3416 if (num_of_dirty < pages_in_upl / 2) {
3417 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3418
3419 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3420
3421 return(0);
3422 }
3423 }
3424 last_pg = 0;
3425
3426 while (size) {
3427
3428 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3429 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3430 break;
3431 }
3432 if (start_pg > last_pg) {
3433 io_size = (start_pg - last_pg) * PAGE_SIZE;
3434
3435 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3436 UPL_ABORT_FREE_ON_EMPTY);
3437
3438 if (io_size < size)
3439 size -= io_size;
3440 else
3441 break;
3442 }
3443 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3444 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3445 break;
3446 }
3447 upl_offset = start_pg * PAGE_SIZE;
3448
3449 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3450
3451 if (vp->v_flag & VNOCACHE_DATA)
3452 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3453 else
3454 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3455
3456 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3457 vp->v_flag |= VTHROTTLED;
3458 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3459 }
3460 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3461
3462 size -= io_size;
3463 }
3464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3465
3466 return(1);
3467 }
3468
3469
3470
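/*
 * handle the non block aligned head or tail of a physically
 * contiguous transfer... the file page containing uio_offset is
 * brought into a single page UPL (read in if not already valid),
 * the fragment is copied between that page and the user's physical
 * address with copyp2p, and for writes (or if the page was already
 * dirty) the page is written back out synchronously
 */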
3471 static int
3472 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3473 {
3474 struct iovec *iov;
3475 upl_page_info_t *pl;
3476 upl_t upl;
3477 vm_offset_t ubc_paddr;
3478 kern_return_t kret;
3479 int error = 0;
3480
3481 iov = uio->uio_iov;
3482
3483 kret = ubc_create_upl(vp,
3484 uio->uio_offset & ~PAGE_MASK_64,
3485 PAGE_SIZE,
3486 &upl,
3487 &pl,
3488 UPL_FLAGS_NONE);
3489
3490 if (kret != KERN_SUCCESS)
3491 return(EINVAL);
3492
3493 if (!upl_valid_page(pl, 0)) {
3494 /*
3495 * issue a synchronous read to cluster_io
3496 */
3497 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3498 CL_READ, (struct buf *)0, (struct clios *)0);
3499 if (error) {
3500 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3501
3502 return(error);
3503 }
3504 }
3505 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3506
3507 if (flags & CL_READ)
3508 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3509 else
3510 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3511
3512 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3513 /*
3514 * issue a synchronous write to cluster_io
3515 */
3516 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3517 0, (struct buf *)0, (struct clios *)0);
3518 }
3519 if (error == 0) {
3520 uio->uio_offset += xsize;
3521 iov->iov_base += xsize;
3522 iov->iov_len -= xsize;
3523 uio->uio_resid -= xsize;
3524 }
3525 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3526
3527 return (error);
3528 }