1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_PAGEOUT 0x10
77 #define CL_AGE 0x20
78 #define CL_DUMP 0x40
79 #define CL_NOZERO 0x80
80 #define CL_PAGEIN 0x100
81 #define CL_DEV_MEMORY 0x200
82 #define CL_PRESERVE 0x400
83
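/*
 * per-stream accounting shared between an issuer and cluster_iodone():
 * io_issued / io_completed track bytes handed to and finished by the
 * device, io_error / io_offset record the first failure, and io_wanted
 * lets an issuer sleeping on &io_wanted be woken as completions catch
 * up (see the wakeup in cluster_iodone()).
 */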
84 struct clios {
85 u_int io_completed;
86 u_int io_issued;
87 off_t io_offset;
88 int io_error;
89 int io_wanted;
90 };
91
92
93 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
94 int size, struct buf *bp);
95 static int cluster_read_x(struct vnode *vp, struct uio *uio,
96 off_t filesize, int devblocksize, int flags);
97 static int cluster_write_x(struct vnode *vp, struct uio *uio,
98 off_t oldEOF, off_t newEOF, off_t headOff,
99 off_t tailOff, int devblocksize, int flags);
100 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
101 off_t filesize, int devblocksize, int flags);
102 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
103 off_t newEOF, int devblocksize, int flags);
104 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
105 off_t filesize, int devblocksize, int flags);
106 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
107 off_t newEOF, int devblocksize, int flags);
108 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
109 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
110 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
111 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
112
113
114 /*
115 * throttle the number of async writes that
116 * can be outstanding on a single vnode
117 * before we issue a synchronous write
118 */
119 #define ASYNC_THROTTLE 9
120
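/*
 * writers that would exceed this limit set VTHROTTLED and sleep on
 * &vp->v_numoutput (see the tsleep() loop in cluster_pageout() below);
 * cluster_iodone() clears the flag and wakes them once v_numoutput
 * drains to ASYNC_THROTTLE / 3.
 */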
121 static int
122 cluster_iodone(bp)
123 struct buf *bp;
124 {
125 int b_flags;
126 int error;
127 int total_size;
128 int total_resid;
129 int upl_offset;
130 int zero_offset;
131 int l_blkno;
132 upl_t upl;
133 struct buf *cbp;
134 struct buf *cbp_head;
135 struct buf *cbp_next;
136 struct buf *real_bp;
137 struct vnode *vp;
138 struct clios *iostate;
139 int commit_size;
140 int pg_offset;
141
142
143 cbp_head = (struct buf *)(bp->b_trans_head);
144
145 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
146 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
147
148 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
149 /*
150 * all I/O requests that are part of this transaction
151 * have to complete before we can process it
152 */
153 if ( !(cbp->b_flags & B_DONE)) {
154
155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
156 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
157
158 return 0;
159 }
160 }
161 error = 0;
162 total_size = 0;
163 total_resid = 0;
164
165 cbp = cbp_head;
166 upl_offset = cbp->b_uploffset;
167 upl = cbp->b_pagelist;
168 b_flags = cbp->b_flags;
169 real_bp = cbp->b_real_bp;
170 vp = cbp->b_vp;
171 zero_offset= cbp->b_validend;
172 l_blkno = cbp->b_lblkno;
173 iostate = (struct clios *)cbp->b_iostate;
174
175 while (cbp) {
176 if (cbp->b_vectorcount > 1)
177 _FREE(cbp->b_vectorlist, M_SEGMENT);
178
179 if ((cbp->b_flags & B_ERROR) && error == 0)
180 error = cbp->b_error;
181
182 total_resid += cbp->b_resid;
183 total_size += cbp->b_bcount;
184
185 cbp_next = cbp->b_trans_next;
186
187 free_io_buf(cbp);
188
189 cbp = cbp_next;
190 }
191 if (zero_offset)
192 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
193
194 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
195 vp->v_flag &= ~VTHROTTLED;
196 wakeup((caddr_t)&vp->v_numoutput);
197 }
198 if (iostate) {
199 if (error) {
200 off_t error_offset;
201
202 error_offset = (off_t)l_blkno * PAGE_SIZE_64;
203
204 if (iostate->io_error == 0) {
205 iostate->io_error = error;
206 iostate->io_offset = error_offset;
207 } else {
208 if (error_offset < iostate->io_offset)
209 iostate->io_offset = error_offset;
210 }
211 }
212 iostate->io_completed += total_size;
213
214 if (iostate->io_wanted) {
215 iostate->io_wanted = 0;
216 wakeup((caddr_t)&iostate->io_wanted);
217 }
218 }
219 if ((b_flags & B_NEED_IODONE) && real_bp) {
220 if (error) {
221 real_bp->b_flags |= B_ERROR;
222 real_bp->b_error = error;
223 }
224 real_bp->b_resid = total_resid;
225
226 biodone(real_bp);
227 }
228 if (error == 0 && total_resid)
229 error = EIO;
230
231 if (b_flags & B_COMMIT_UPL) {
232 pg_offset = upl_offset & PAGE_MASK;
233 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
234
235 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
236 int upl_abort_code;
237
238 if (b_flags & B_PHYS)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
240 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
242 else if (b_flags & B_PGIN)
243 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
244 else
245 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
246
247 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
248 upl_abort_code);
249
250 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
251 (int)upl, upl_offset - pg_offset, commit_size,
252 0x80000000|upl_abort_code, 0);
253
254 } else {
255 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
256
257 if (b_flags & B_PHYS)
258 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
259 else if ( !(b_flags & B_PAGEOUT))
260 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
261 if (b_flags & B_AGE)
262 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
263
264 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags);
266
267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
268 (int)upl, upl_offset - pg_offset, commit_size,
269 upl_commit_flags, 0);
270 }
271 } else
272 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
273 (int)upl, upl_offset, 0, error, 0);
274
275 return (error);
276 }
277
278
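/*
 * cluster_zero: zero 'size' bytes starting at 'upl_offset' within 'upl'.
 * If the caller supplied a buffer header that already has a kernel
 * mapping (bp->b_data), that mapping is used; otherwise the upl is
 * mapped with ubc_upl_map() for the duration of the bzero() and then
 * unmapped again.
 */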
279 static void
280 cluster_zero(upl, upl_offset, size, bp)
281 upl_t upl;
282 vm_offset_t upl_offset;
283 int size;
284 struct buf *bp;
285 {
286 vm_offset_t io_addr = 0;
287 int must_unmap = 0;
288 kern_return_t kret;
289
290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
291 upl_offset, size, (int)bp, 0, 0);
292
293 if (bp == NULL || bp->b_data == NULL) {
294 kret = ubc_upl_map(upl, &io_addr);
295
296 if (kret != KERN_SUCCESS)
297 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
298 if (io_addr == 0)
299 panic("cluster_zero: ubc_upl_map() mapped 0");
300
301 must_unmap = 1;
302 } else
303 io_addr = (vm_offset_t)bp->b_data;
304 bzero((caddr_t)(io_addr + upl_offset), size);
305
306 if (must_unmap) {
307 kret = ubc_upl_unmap(upl);
308
309 if (kret != KERN_SUCCESS)
310 panic("cluster_zero: kernel_upl_unmap failed");
311 }
312 }
313
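/*
 * cluster_io: common I/O engine for the cluster layer.  The request is
 * carved into chunks no larger than the device's max_iosize, each chunk
 * is mapped to a physical block with VOP_CMAP(), wrapped in an io buf
 * (with a scatter/gather iovec list when it spans multiple pages) and
 * chained through b_trans_next with b_trans_head pointing at the first
 * buf, then handed to VOP_STRATEGY().  Completion funnels through
 * cluster_iodone(); synchronous callers biowait() on each buf.  The CL_*
 * flags select read vs. write, async vs. sync, and whether this routine
 * commits/aborts the upl itself (CL_COMMIT).  A typical caller looks
 * like the cluster_pagein() call further down, e.g.
 *
 *   cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
 *              local_flags | CL_READ | CL_PAGEIN,
 *              (struct buf *)0, (struct clios *)0);
 */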
314 static int
315 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
316 struct vnode *vp;
317 upl_t upl;
318 vm_offset_t upl_offset;
319 off_t f_offset;
320 int non_rounded_size;
321 int devblocksize;
322 int flags;
323 struct buf *real_bp;
324 struct clios *iostate;
325 {
326 struct buf *cbp;
327 struct iovec *iovp;
328 u_int size;
329 u_int io_size;
330 int io_flags;
331 int error = 0;
332 int retval = 0;
333 struct buf *cbp_head = 0;
334 struct buf *cbp_tail = 0;
335 upl_page_info_t *pl;
336 int buf_count = 0;
337 int pg_count;
338 int pg_offset;
339 u_int max_iosize;
340 u_int max_vectors;
341 int priv;
342 int zero_offset = 0;
343 u_int first_lblkno;
344
345 if (flags & CL_READ) {
346 io_flags = (B_VECTORLIST | B_READ);
347
348 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
349 } else {
350 io_flags = (B_VECTORLIST | B_WRITEINPROG);
351
352 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
353 }
354 pl = ubc_upl_pageinfo(upl);
355
356 if (flags & CL_AGE)
357 io_flags |= B_AGE;
358 if (flags & CL_DUMP)
359 io_flags |= B_NOCACHE;
360 if (flags & CL_PAGEIN)
361 io_flags |= B_PGIN;
362 if (flags & CL_PAGEOUT)
363 io_flags |= B_PAGEOUT;
364 if (flags & CL_COMMIT)
365 io_flags |= B_COMMIT_UPL;
366 if (flags & CL_PRESERVE)
367 io_flags |= B_PHYS;
368
369 if (devblocksize)
370 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
371 else
372 size = non_rounded_size;
373
374
375 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
376 (int)f_offset, size, upl_offset, flags, 0);
377
378 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
379 /*
380                  * the read ends partway through a page, so we are going to end up
381                  * with a page that we can't complete (the file size wasn't a multiple
382                  * of PAGE_SIZE and we're trying to read to the end of the file),
383                  * so we'll go ahead and zero out the portion of the page we can't
384                  * read in from the file
385 */
386 zero_offset = upl_offset + non_rounded_size;
387 }
388 while (size) {
389 int vsize;
390 int i;
391 int pl_index;
392 int pg_resid;
393 int num_contig;
394 daddr_t lblkno;
395 daddr_t blkno;
396
397 if (size > max_iosize)
398 io_size = max_iosize;
399 else
400 io_size = size;
401
402 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
403 if (error == EOPNOTSUPP)
404 panic("VOP_CMAP Unimplemented");
405 break;
406 }
407
408 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
409 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
410
411 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
412 if (flags & CL_PAGEOUT) {
413 error = EINVAL;
414 break;
415 };
416
417 /* Try paging out the page individually before
418 giving up entirely and dumping it (it could
419 be mapped in a "hole" and require allocation
420                            before the I/O)
421 */
422 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
423 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
424 error = EINVAL;
425 break;
426 };
427
428 upl_offset += PAGE_SIZE_64;
429 f_offset += PAGE_SIZE_64;
430 size -= PAGE_SIZE_64;
431 continue;
432 }
433 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
434 /*
435 * we have now figured out how much I/O we can do - this is in 'io_size'
436 * pl_index represents the first page in the 'upl' that the I/O will occur for
437 * pg_offset is the starting point in the first page for the I/O
438 * pg_count is the number of full and partial pages that 'io_size' encompasses
439 */
440 pl_index = upl_offset / PAGE_SIZE;
441 pg_offset = upl_offset & PAGE_MASK;
442 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
443
444 if (flags & CL_DEV_MEMORY) {
445 /*
446 * currently, can't deal with reading 'holes' in file
447 */
448 if ((long)blkno == -1) {
449 error = EINVAL;
450 break;
451 }
452 /*
453 * treat physical requests as one 'giant' page
454 */
455 pg_count = 1;
456 }
457 if ((flags & CL_READ) && (long)blkno == -1) {
458 int bytes_to_zero;
459
460 /*
461 * if we're reading and blkno == -1, then we've got a
462 * 'hole' in the file that we need to deal with by zeroing
463 * out the affected area in the upl
464 */
465 if (zero_offset && io_size == size) {
466 /*
467 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
468                          * then 'zero_offset' will be non-zero
469 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
470 * (indicated by the io_size finishing off the I/O request for this UPL)
471                          * then we're not going to issue an I/O for the
472 * last page in this upl... we need to zero both the hole and the tail
473 * of the page beyond the EOF, since the delayed zero-fill won't kick in
474 */
475 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
476
477 zero_offset = 0;
478 } else
479 bytes_to_zero = io_size;
480
481 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
482
483 if (cbp_head)
484 /*
485 * if there is a current I/O chain pending
486 * then the first page of the group we just zero'd
487 * will be handled by the I/O completion if the zero
488 * fill started in the middle of the page
489 */
490 pg_count = (io_size - pg_offset) / PAGE_SIZE;
491 else {
492 /*
493 * no pending I/O to pick up that first page
494 * so, we have to make sure it gets committed
495 * here.
496 * set the pg_offset to 0 so that the upl_commit_range
497 * starts with this page
498 */
499 pg_count = (io_size + pg_offset) / PAGE_SIZE;
500 pg_offset = 0;
501 }
502 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
503 /*
504 * if we're done with the request for this UPL
505 * then we have to make sure to commit the last page
506 * even if we only partially zero-filled it
507 */
508 pg_count++;
509
510 if (pg_count) {
511 if (pg_offset)
512 pg_resid = PAGE_SIZE - pg_offset;
513 else
514 pg_resid = 0;
515
516 if (flags & CL_COMMIT)
517 ubc_upl_commit_range(upl,
518 (upl_offset + pg_resid) & ~PAGE_MASK,
519 pg_count * PAGE_SIZE,
520 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
521 }
522 upl_offset += io_size;
523 f_offset += io_size;
524 size -= io_size;
525
526 if (cbp_head && pg_count)
527 goto start_io;
528 continue;
529
530 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
531 real_bp->b_blkno = blkno;
532 }
533
534 if (pg_count > 1) {
535 if (pg_count > max_vectors) {
536 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
537
538 if (io_size < 0) {
539 io_size = PAGE_SIZE - pg_offset;
540 pg_count = 1;
541 } else
542 pg_count = max_vectors;
543 }
544 /*
545 * we need to allocate space for the vector list
546 */
547 if (pg_count > 1) {
548 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
549 M_SEGMENT, M_NOWAIT);
550
551 if (iovp == (struct iovec *) 0) {
552 /*
553 * if the allocation fails, then throttle down to a single page
554 */
555 io_size = PAGE_SIZE - pg_offset;
556 pg_count = 1;
557 }
558 }
559 }
560
561 /* Throttle the speculative IO */
562 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
563 priv = 0;
564 else
565 priv = 1;
566
567 cbp = alloc_io_buf(vp, priv);
568
569 if (pg_count == 1)
570 /*
571 * we use the io vector that's reserved in the buffer header
572                  * this ensures we can always issue an I/O even in a low memory
573 * condition that prevents the _MALLOC from succeeding... this
574 * is necessary to prevent deadlocks with the pager
575 */
576 iovp = (struct iovec *)(&cbp->b_vects[0]);
577
578 cbp->b_vectorlist = (void *)iovp;
579 cbp->b_vectorcount = pg_count;
580
581 if (flags & CL_DEV_MEMORY) {
582
583 iovp->iov_len = io_size;
584 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
585
586 if (iovp->iov_base == (caddr_t) 0) {
587 free_io_buf(cbp);
588 error = EINVAL;
589 } else
590 iovp->iov_base += upl_offset;
591 } else {
592
593 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
594 int psize;
595
596 psize = PAGE_SIZE - pg_offset;
597
598 if (psize > vsize)
599 psize = vsize;
600
601 iovp->iov_len = psize;
602 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
603
604 if (iovp->iov_base == (caddr_t) 0) {
605 if (pg_count > 1)
606 _FREE(cbp->b_vectorlist, M_SEGMENT);
607 free_io_buf(cbp);
608
609 error = EINVAL;
610 break;
611 }
612 iovp->iov_base += pg_offset;
613 pg_offset = 0;
614
615 if (flags & CL_PAGEOUT) {
616 int s;
617 struct buf *bp;
618
619 s = splbio();
620 if (bp = incore(vp, lblkno + i)) {
621 if (!ISSET(bp->b_flags, B_BUSY)) {
622 bremfree(bp);
623 SET(bp->b_flags, (B_BUSY | B_INVAL));
624 splx(s);
625 brelse(bp);
626 } else
627 panic("BUSY bp found in cluster_io");
628 }
629 splx(s);
630 }
631 vsize -= psize;
632 }
633 }
634 if (error)
635 break;
636
637 if (flags & CL_ASYNC) {
638 cbp->b_flags |= (B_CALL | B_ASYNC);
639 cbp->b_iodone = (void *)cluster_iodone;
640 }
641 cbp->b_flags |= io_flags;
642
643 cbp->b_lblkno = lblkno;
644 cbp->b_blkno = blkno;
645 cbp->b_bcount = io_size;
646 cbp->b_pagelist = upl;
647 cbp->b_uploffset = upl_offset;
648 cbp->b_trans_next = (struct buf *)0;
649
650 if (cbp->b_iostate = (void *)iostate)
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685 start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696                          * finish on a page boundary, then we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 for (cbp = cbp_head; cbp;) {
704 struct buf * cbp_next;
705
706 if (io_flags & B_WRITEINPROG)
707 cbp->b_vp->v_numoutput++;
708
709 cbp_next = cbp->b_trans_next;
710
711 (void) VOP_STRATEGY(cbp);
712 cbp = cbp_next;
713 }
714 if ( !(flags & CL_ASYNC)) {
715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
716 biowait(cbp);
717
718 if (error = cluster_iodone(cbp_head)) {
719 if ((flags & CL_PAGEOUT) && (error == ENXIO))
720 retval = 0; /* drop the error */
721 else
722 retval = error;
723 error = 0;
724 }
725 }
726 cbp_head = (struct buf *)0;
727 cbp_tail = (struct buf *)0;
728
729 buf_count = 0;
730 }
731 }
732 if (error) {
733 int abort_size;
734
735 io_size = 0;
736
737 for (cbp = cbp_head; cbp;) {
738 struct buf * cbp_next;
739
740 if (cbp->b_vectorcount > 1)
741 _FREE(cbp->b_vectorlist, M_SEGMENT);
742 upl_offset -= cbp->b_bcount;
743 size += cbp->b_bcount;
744 io_size += cbp->b_bcount;
745
746 cbp_next = cbp->b_trans_next;
747 free_io_buf(cbp);
748 cbp = cbp_next;
749 }
750 if (iostate) {
751 if (iostate->io_error == 0) {
752 iostate->io_error = error;
753 iostate->io_offset = f_offset - (off_t)io_size;
754 }
755 iostate->io_issued -= io_size;
756
757 if (iostate->io_wanted) {
758 iostate->io_wanted = 0;
759 wakeup((caddr_t)&iostate->io_wanted);
760 }
761 }
762 pg_offset = upl_offset & PAGE_MASK;
763 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
764
765 if (flags & CL_COMMIT) {
766 int upl_abort_code;
767
768 if (flags & CL_PRESERVE)
769 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
770 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
771 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
772 else if (flags & CL_PAGEIN)
773 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
774 else
775 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
776
777 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
778 upl_abort_code);
779
780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
781 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
782 }
783 if (real_bp) {
784 real_bp->b_flags |= B_ERROR;
785 real_bp->b_error = error;
786
787 biodone(real_bp);
788 }
789 if (retval == 0)
790 retval = error;
791 }
792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
793 (int)f_offset, size, upl_offset, retval, 0);
794
795 return (retval);
796 }
797
798
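/*
 * cluster_rd_prefetch: issue an advisory read of up to MAX_UPL_TRANSFER
 * pages starting at f_offset, clipped to the end of the file.  Pages
 * already resident (probed with ubc_page_op()) are skipped before the
 * remainder is handed to advisory_read().  Returns the number of pages
 * spanned so cluster_rd_ahead() can advance v_maxra.
 */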
799 static int
800 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
801 struct vnode *vp;
802 off_t f_offset;
803 u_int size;
804 off_t filesize;
805 int devblocksize;
806 {
807 int pages_to_fetch;
808 int skipped_pages;
809
810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
811 (int)f_offset, size, (int)filesize, 0, 0);
812
813 if (f_offset >= filesize) {
814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
815 (int)f_offset, 0, 0, 0, 0);
816 return(0);
817 }
818 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
819 size = MAX_UPL_TRANSFER * PAGE_SIZE;
820 else
821 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
822
823 if ((off_t)size > (filesize - f_offset))
824 size = filesize - f_offset;
825
826 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
827
828 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
829 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
830 break;
831 f_offset += PAGE_SIZE;
832 size -= PAGE_SIZE;
833 }
834 if (skipped_pages < pages_to_fetch)
835 advisory_read(vp, filesize, f_offset, size, devblocksize);
836
837 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
838 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
839
840 return (pages_to_fetch);
841 }
842
843
844
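/*
 * cluster_rd_ahead: sequential read-ahead heuristic.  If the current
 * request doesn't follow on from v_lastr / v_maxra, the window (v_ralen)
 * is reset; otherwise it is doubled, up to MAX_UPL_TRANSFER pages, and a
 * prefetch is issued starting just past max(e_lblkno, v_maxra), unless
 * enough previously prefetched pages are still outstanding.
 */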
845 static void
846 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
847 struct vnode *vp;
848 daddr_t b_lblkno;
849 daddr_t e_lblkno;
850 off_t filesize;
851 int devblocksize;
852 {
853 daddr_t r_lblkno;
854 off_t f_offset;
855 int size_of_prefetch;
856 int max_pages;
857
858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
859 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
860
861 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
862 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
863 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
864 return;
865 }
866
867 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
868 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
869 vp->v_ralen = 0;
870 vp->v_maxra = 0;
871
872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
873 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
874
875 return;
876 }
877 max_pages = MAX_UPL_TRANSFER;
878
879 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
880
881 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
882 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
883
884 if (e_lblkno < vp->v_maxra) {
885 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
886
887 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
888 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
889 return;
890 }
891 }
892 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
893 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
894
895 if (f_offset < filesize) {
896 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
897
898 if (size_of_prefetch)
899 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
900 }
901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
902 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
903 }
904
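/*
 * cluster_pageout: VM/UBC pageout entry point.  The request is validated
 * and clipped to the end of the file, any pages of the upl beyond that
 * point are aborted, the caller is throttled against ASYNC_THROTTLE
 * outstanding writes, and the remainder is issued through cluster_io()
 * with CL_PAGEOUT.  A filesystem's VOP_PAGEOUT typically just forwards
 * its arguments here, roughly (illustrative only):
 *
 *   return (cluster_pageout(vp, pl, pl_offset, f_offset, size,
 *                           filesize, devblocksize, flags));
 */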
905 int
906 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
907 struct vnode *vp;
908 upl_t upl;
909 vm_offset_t upl_offset;
910 off_t f_offset;
911 int size;
912 off_t filesize;
913 int devblocksize;
914 int flags;
915 {
916 int io_size;
917 int pg_size;
918 off_t max_size;
919 int local_flags = CL_PAGEOUT;
920
921 if ((flags & UPL_IOSYNC) == 0)
922 local_flags |= CL_ASYNC;
923 if ((flags & UPL_NOCOMMIT) == 0)
924 local_flags |= CL_COMMIT;
925
926
927 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
928 (int)f_offset, size, (int)filesize, local_flags, 0);
929
930 /*
931 * If they didn't specify any I/O, then we are done...
932 * we can't issue an abort because we don't know how
933 * big the upl really is
934 */
935 if (size <= 0)
936 return (EINVAL);
937
938 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
939 if (local_flags & CL_COMMIT)
940 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
941 return (EROFS);
942 }
943 /*
944          * can't page-out from a negative offset
945 * or if we're starting beyond the EOF
946 * or if the file offset isn't page aligned
947 * or the size requested isn't a multiple of PAGE_SIZE
948 */
949 if (f_offset < 0 || f_offset >= filesize ||
950 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
951 if (local_flags & CL_COMMIT)
952 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
953 return (EINVAL);
954 }
955 max_size = filesize - f_offset;
956
957 if (size < max_size)
958 io_size = size;
959 else
960 io_size = max_size;
961
962 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
963
964 if (size > pg_size) {
965 if (local_flags & CL_COMMIT)
966 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
967 UPL_ABORT_FREE_ON_EMPTY);
968 }
969 while (vp->v_numoutput >= ASYNC_THROTTLE) {
970 vp->v_flag |= VTHROTTLED;
971 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
972 }
973
974 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
975 local_flags, (struct buf *)0, (struct clios *)0));
976 }
977
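/*
 * cluster_pagein: VM/UBC pagein entry point.  After validating alignment
 * and clipping the request to the end of the file (aborting any excess
 * upl pages with UPL_ABORT_ERROR), the read is issued through
 * cluster_io() with CL_READ | CL_PAGEIN.  On success the block range is
 * fed into the sequential read-ahead state via cluster_rd_ahead() and
 * v_lastr.
 */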
978 int
979 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
980 struct vnode *vp;
981 upl_t upl;
982 vm_offset_t upl_offset;
983 off_t f_offset;
984 int size;
985 off_t filesize;
986 int devblocksize;
987 int flags;
988 {
989 u_int io_size;
990 int rounded_size;
991 off_t max_size;
992 int retval;
993 int local_flags = 0;
994
995 if (upl == NULL || size < 0)
996 panic("cluster_pagein: NULL upl passed in");
997
998 if ((flags & UPL_IOSYNC) == 0)
999 local_flags |= CL_ASYNC;
1000 if ((flags & UPL_NOCOMMIT) == 0)
1001 local_flags |= CL_COMMIT;
1002
1003
1004 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1005 (int)f_offset, size, (int)filesize, local_flags, 0);
1006
1007 /*
1008 * can't page-in from a negative offset
1009 * or if we're starting beyond the EOF
1010 * or if the file offset isn't page aligned
1011 * or the size requested isn't a multiple of PAGE_SIZE
1012 */
1013 if (f_offset < 0 || f_offset >= filesize ||
1014 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1015 if (local_flags & CL_COMMIT)
1016 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1017 return (EINVAL);
1018 }
1019 max_size = filesize - f_offset;
1020
1021 if (size < max_size)
1022 io_size = size;
1023 else
1024 io_size = max_size;
1025
1026 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1027
1028 if (size > rounded_size && (local_flags & CL_COMMIT))
1029 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1030 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1031
1032 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1033 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1034
1035 if (retval == 0) {
1036 int b_lblkno;
1037 int e_lblkno;
1038
1039 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1040 e_lblkno = (int)
1041 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1042
1043 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1044 /*
1045                          * we haven't read in the last page of the file yet
1046 * so let's try to read ahead if we're in
1047 * a sequential access pattern
1048 */
1049 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1050 }
1051 vp->v_lastr = e_lblkno;
1052 }
1053 return (retval);
1054 }
1055
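/*
 * cluster_bp: adapter that lets a conventional struct buf which already
 * carries a upl in b_pagelist be issued through cluster_io() as an async
 * transfer; the file offset is recovered from the buf's logical block
 * number with ubc_blktooff().
 */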
1056 int
1057 cluster_bp(bp)
1058 struct buf *bp;
1059 {
1060 off_t f_offset;
1061 int flags;
1062
1063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1064 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1065
1066 if (bp->b_pagelist == (upl_t) 0)
1067 panic("cluster_bp: can't handle NULL upl yet\n");
1068 if (bp->b_flags & B_READ)
1069 flags = CL_ASYNC | CL_READ;
1070 else
1071 flags = CL_ASYNC;
1072
1073 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1074
1075 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1076 }
1077
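/*
 * cluster_write: top-level write entry point.  Writes on vnodes that are
 * caching data (no VNOCACHE_DATA), or whose uio isn't from user space,
 * go straight to the buffered path, cluster_write_x().  Otherwise each
 * iovec is probed with vm_map_get_upl(UPL_QUERY_OBJECT_TYPE) and routed
 * to cluster_phys_write() for physically contiguous sources,
 * cluster_write_x() for small, unaligned or head/tail zero-fill cases,
 * or cluster_nocopy_write() for large page-aligned transfers, clipping
 * uio_resid around each sub-call so only the intended portion is
 * consumed.
 */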
1078 int
1079 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1080 struct vnode *vp;
1081 struct uio *uio;
1082 off_t oldEOF;
1083 off_t newEOF;
1084 off_t headOff;
1085 off_t tailOff;
1086 int devblocksize;
1087 int flags;
1088 {
1089 int prev_resid;
1090 int clip_size;
1091 off_t max_io_size;
1092 struct iovec *iov;
1093 vm_offset_t upl_offset;
1094 int upl_size;
1095 int pages_in_pl;
1096 upl_page_info_t *pl;
1097 int upl_flags;
1098 upl_t upl;
1099 int retval = 0;
1100
1101
1102 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1103 {
1104 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1105 return(retval);
1106 }
1107
1108 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1109 {
1110 /* we know we have a resid, so this is safe */
1111 iov = uio->uio_iov;
1112 while (iov->iov_len == 0) {
1113 uio->uio_iov++;
1114 uio->uio_iovcnt--;
1115 iov = uio->uio_iov;
1116 }
1117
1118 /*
1119 * We check every vector target and if it is physically
1120 * contiguous space, we skip the sanity checks.
1121 */
1122
1123 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1124 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1125 pages_in_pl = 0;
1126 upl_flags = UPL_QUERY_OBJECT_TYPE;
1127 if ((vm_map_get_upl(current_map(),
1128 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1129 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1130 {
1131 /*
1132 * the user app must have passed in an invalid address
1133 */
1134 return (EFAULT);
1135 }
1136
1137 if (upl_flags & UPL_PHYS_CONTIG)
1138 {
1139 if (flags & IO_HEADZEROFILL)
1140 {
1141 flags &= ~IO_HEADZEROFILL;
1142
1143 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1144 return(retval);
1145 }
1146
1147 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1148
1149 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1150 {
1151 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1152 return(retval);
1153 }
1154 }
1155 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1156 {
1157 /*
1158                  * We set a threshold of 4 pages to decide if the nocopy
1159 * write loop is worth the trouble...
1160 * we also come here if we're trying to zero the head and/or tail
1161 * of a partially written page, and the user source is not a physically contiguous region
1162 */
1163 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1164 return(retval);
1165 }
1166 else if (uio->uio_offset & PAGE_MASK_64)
1167 {
1168                         /* Bring the file write offset up to a pagesize boundary */
1169 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1170 if (uio->uio_resid < clip_size)
1171 clip_size = uio->uio_resid;
1172 /*
1173 * Fake the resid going into the cluster_write_x call
1174 * and restore it on the way out.
1175 */
1176 prev_resid = uio->uio_resid;
1177 uio->uio_resid = clip_size;
1178 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1179 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1180 }
1181 else if ((int)iov->iov_base & PAGE_MASK_64)
1182 {
1183 clip_size = iov->iov_len;
1184 prev_resid = uio->uio_resid;
1185 uio->uio_resid = clip_size;
1186 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1187 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1188 }
1189 else
1190 {
1191 /*
1192 * If we come in here, we know the offset into
1193 * the file is on a pagesize boundary
1194 */
1195
1196 max_io_size = newEOF - uio->uio_offset;
1197 clip_size = uio->uio_resid;
1198 if (iov->iov_len < clip_size)
1199 clip_size = iov->iov_len;
1200 if (max_io_size < clip_size)
1201 clip_size = max_io_size;
1202
1203 if (clip_size < PAGE_SIZE)
1204 {
1205 /*
1206 * Take care of tail end of write in this vector
1207 */
1208 prev_resid = uio->uio_resid;
1209 uio->uio_resid = clip_size;
1210 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1211 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1212 }
1213 else
1214 {
1215 /* round clip_size down to a multiple of pagesize */
1216 clip_size = clip_size & ~(PAGE_MASK);
1217 prev_resid = uio->uio_resid;
1218 uio->uio_resid = clip_size;
1219 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1220 if ((retval == 0) && uio->uio_resid)
1221 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1222 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1223 }
1224 } /* end else */
1225 } /* end while */
1226 return(retval);
1227 }
1228
1229
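/*
 * cluster_nocopy_write: write directly from the user's buffer without
 * copying through the cache.  Any delayed clusters are pushed first,
 * then for each chunk the user pages are wired with vm_map_get_upl()
 * (retried with increasing force_data_sync), any overlapping pages in
 * the cache are dumped via ubc_page_op(UPL_POP_DUMP), and a synchronous
 * cluster_io() is issued straight from the user pages.  The upl is
 * released with an abort so the dirty state of those pages is left
 * unchanged.
 */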
1230 static int
1231 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1232 struct vnode *vp;
1233 struct uio *uio;
1234 off_t newEOF;
1235 int devblocksize;
1236 int flags;
1237 {
1238 upl_t upl;
1239 upl_page_info_t *pl;
1240 off_t upl_f_offset;
1241 vm_offset_t upl_offset;
1242 off_t max_io_size;
1243 int io_size;
1244 int upl_size;
1245 int upl_needed_size;
1246 int pages_in_pl;
1247 int upl_flags;
1248 kern_return_t kret;
1249 struct iovec *iov;
1250 int i;
1251 int force_data_sync;
1252 int error = 0;
1253
1254 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1255 (int)uio->uio_offset, (int)uio->uio_resid,
1256 (int)newEOF, devblocksize, 0);
1257
1258 /*
1259 * When we enter this routine, we know
1260 * -- the offset into the file is on a pagesize boundary
1261 * -- the resid is a page multiple
1262 * -- the resid will not exceed iov_len
1263 */
1264 cluster_try_push(vp, newEOF, 0, 1);
1265
1266 iov = uio->uio_iov;
1267
1268 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1269 io_size = uio->uio_resid;
1270
1271 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1272 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1273
1274 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1275 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1276
1277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1278 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1279
1280 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1281 {
1282 pages_in_pl = 0;
1283 upl_size = upl_needed_size;
1284 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1285 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1286
1287 kret = vm_map_get_upl(current_map(),
1288 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1289 &upl_size,
1290 &upl,
1291 NULL,
1292 &pages_in_pl,
1293 &upl_flags,
1294 force_data_sync);
1295
1296 if (kret != KERN_SUCCESS)
1297 {
1298 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1299 0, 0, 0, kret, 0);
1300
1301 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1302 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1303
1304 /* cluster_nocopy_write: failed to get pagelist */
1305 /* do not return kret here */
1306 return(0);
1307 }
1308
1309 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1310 pages_in_pl = upl_size / PAGE_SIZE;
1311
1312 for(i=0; i < pages_in_pl; i++)
1313 {
1314 if (!upl_valid_page(pl, i))
1315 break;
1316 }
1317
1318 if (i == pages_in_pl)
1319 break;
1320
1321 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1322 UPL_ABORT_FREE_ON_EMPTY);
1323 }
1324
1325 if (force_data_sync >= 3)
1326 {
1327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1328 i, pages_in_pl, upl_size, kret, 0);
1329
1330 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1331 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1332 return(0);
1333 }
1334
1335 /*
1336 * Consider the possibility that upl_size wasn't satisfied.
1337 */
1338 if (upl_size != upl_needed_size)
1339 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1342 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1343
1344 if (io_size == 0)
1345 {
1346 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1347 UPL_ABORT_FREE_ON_EMPTY);
1348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1349 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1350
1351 return(0);
1352 }
1353
1354 /*
1355 * Now look for pages already in the cache
1356 * and throw them away.
1357 */
1358
1359 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1360 max_io_size = io_size;
1361
1362 while (max_io_size) {
1363
1364 /*
1365 * Flag UPL_POP_DUMP says if the page is found
1366 * in the page cache it must be thrown away.
1367 */
1368 ubc_page_op(vp,
1369 upl_f_offset,
1370 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1371 0, 0);
1372 max_io_size -= PAGE_SIZE;
1373 upl_f_offset += PAGE_SIZE;
1374 }
1375
1376 /*
1377 * issue a synchronous write to cluster_io
1378 */
1379
1380 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1381 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1382
1383 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1384 io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0);
1385
1386 if (error == 0) {
1387 /*
1388 * The cluster_io write completed successfully,
1389 * update the uio structure.
1390 */
1391 iov->iov_base += io_size;
1392 iov->iov_len -= io_size;
1393 uio->uio_resid -= io_size;
1394 uio->uio_offset += io_size;
1395 }
1396 /*
1397 * always 'commit' the I/O via the abort primitive whether the I/O
1398          * succeeded cleanly or not... this is necessary to ensure that
1399 * we preserve the state of the DIRTY flag on the pages used to
1400 * provide the data for the I/O... the state of this flag SHOULD
1401 * NOT be changed by a write
1402 */
1403 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1404 UPL_ABORT_FREE_ON_EMPTY);
1405
1406
1407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1408 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1409
1410 } /* end while */
1411
1412
1413 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1414 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1415
1416 return (error);
1417 }
1418
1419
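/*
 * cluster_phys_write: write path for a physically contiguous user
 * buffer.  The region is wired with vm_map_get_upl(), any head or tail
 * that isn't devblocksize-aligned is handled through
 * cluster_align_phys_io(), and the aligned middle is pushed with a
 * single CL_DEV_MEMORY cluster_io().  The upl is released with an abort
 * since no page state should change.
 */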
1420 static int
1421 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1422 struct vnode *vp;
1423 struct uio *uio;
1424 off_t newEOF;
1425 int devblocksize;
1426 int flags;
1427 {
1428 upl_page_info_t *pl;
1429 vm_offset_t src_paddr;
1430 upl_t upl;
1431 vm_offset_t upl_offset;
1432 int tail_size;
1433 int io_size;
1434 int upl_size;
1435 int upl_needed_size;
1436 int pages_in_pl;
1437 int upl_flags;
1438 kern_return_t kret;
1439 struct iovec *iov;
1440 int error = 0;
1441
1442 /*
1443 * When we enter this routine, we know
1444 * -- the resid will not exceed iov_len
1445      *  -- the vector target address is physically contiguous
1446 */
1447 cluster_try_push(vp, newEOF, 0, 1);
1448
1449 iov = uio->uio_iov;
1450 io_size = iov->iov_len;
1451 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1452 upl_needed_size = upl_offset + io_size;
1453
1454 pages_in_pl = 0;
1455 upl_size = upl_needed_size;
1456 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1457 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1458
1459 kret = vm_map_get_upl(current_map(),
1460 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1461 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1462
1463 if (kret != KERN_SUCCESS) {
1464 /*
1465 * cluster_phys_write: failed to get pagelist
1466           * note: unlike the nocopy path, the failure is returned to the caller
1467 */
1468 return(EINVAL);
1469 }
1470 /*
1471 * Consider the possibility that upl_size wasn't satisfied.
1472 * This is a failure in the physical memory case.
1473 */
1474 if (upl_size < upl_needed_size) {
1475 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1476 return(EINVAL);
1477 }
1478 pl = ubc_upl_pageinfo(upl);
1479
1480 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1481
1482 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1483 int head_size;
1484
1485 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1486
1487 if (head_size > io_size)
1488 head_size = io_size;
1489
1490 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1491
1492 if (error) {
1493 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1494
1495 return(EINVAL);
1496 }
1497 upl_offset += head_size;
1498 src_paddr += head_size;
1499 io_size -= head_size;
1500 }
1501 tail_size = io_size & (devblocksize - 1);
1502 io_size -= tail_size;
1503
1504 if (io_size) {
1505 /*
1506 * issue a synchronous write to cluster_io
1507 */
1508 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1509 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1510 }
1511 if (error == 0) {
1512 /*
1513 * The cluster_io write completed successfully,
1514 * update the uio structure
1515 */
1516 uio->uio_resid -= io_size;
1517 iov->iov_len -= io_size;
1518 iov->iov_base += io_size;
1519 uio->uio_offset += io_size;
1520 src_paddr += io_size;
1521
1522 if (tail_size)
1523 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1524 }
1525 /*
1526 * just release our hold on the physically contiguous
1527 * region without changing any state
1528 */
1529 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1530
1531 return (error);
1532 }
1533
1534
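/*
 * cluster_write_x: the buffered (cached) write path.  Each pass creates
 * a upl covering the affected range, pre-reads any partially valid
 * first/last pages, zero-fills head and tail ranges when
 * IO_HEADZEROFILL / IO_TAILZEROFILL are set, and copies the user data in
 * with uiomove().  The dirtied page run is then either written
 * immediately (IO_SYNC) or recorded in the vnode's cluster list
 * (v_clusters) for a later push, falling back to the VHASDIRTY
 * mechanism when no cluster slot can be made available.
 */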
1535 static int
1536 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1537 struct vnode *vp;
1538 struct uio *uio;
1539 off_t oldEOF;
1540 off_t newEOF;
1541 off_t headOff;
1542 off_t tailOff;
1543 int devblocksize;
1544 int flags;
1545 {
1546 upl_page_info_t *pl;
1547 upl_t upl;
1548 vm_offset_t upl_offset;
1549 int upl_size;
1550 off_t upl_f_offset;
1551 int pages_in_upl;
1552 int start_offset;
1553 int xfer_resid;
1554 int io_size;
1555 int io_flags;
1556 vm_offset_t io_address;
1557 int io_offset;
1558 int bytes_to_zero;
1559 int bytes_to_move;
1560 kern_return_t kret;
1561 int retval = 0;
1562 int uio_resid;
1563 long long total_size;
1564 long long zero_cnt;
1565 off_t zero_off;
1566 long long zero_cnt1;
1567 off_t zero_off1;
1568 daddr_t start_blkno;
1569 daddr_t last_blkno;
1570
1571 if (uio) {
1572 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1573 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1574
1575 uio_resid = uio->uio_resid;
1576 } else {
1577 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1578 0, 0, (int)oldEOF, (int)newEOF, 0);
1579
1580 uio_resid = 0;
1581 }
1582 zero_cnt = 0;
1583 zero_cnt1 = 0;
1584
1585 if (flags & IO_HEADZEROFILL) {
1586 /*
1587 * some filesystems (HFS is one) don't support unallocated holes within a file...
1588 * so we zero fill the intervening space between the old EOF and the offset
1589 * where the next chunk of real data begins.... ftruncate will also use this
1590 * routine to zero fill to the new EOF when growing a file... in this case, the
1591 * uio structure will not be provided
1592 */
1593 if (uio) {
1594 if (headOff < uio->uio_offset) {
1595 zero_cnt = uio->uio_offset - headOff;
1596 zero_off = headOff;
1597 }
1598 } else if (headOff < newEOF) {
1599 zero_cnt = newEOF - headOff;
1600 zero_off = headOff;
1601 }
1602 }
1603 if (flags & IO_TAILZEROFILL) {
1604 if (uio) {
1605 zero_off1 = uio->uio_offset + uio->uio_resid;
1606
1607 if (zero_off1 < tailOff)
1608 zero_cnt1 = tailOff - zero_off1;
1609 }
1610 }
1611 if (zero_cnt == 0 && uio == (struct uio *) 0)
1612 {
1613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1614 retval, 0, 0, 0, 0);
1615 return (0);
1616 }
1617
1618 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1619 /*
1620 * for this iteration of the loop, figure out where our starting point is
1621 */
1622 if (zero_cnt) {
1623 start_offset = (int)(zero_off & PAGE_MASK_64);
1624 upl_f_offset = zero_off - start_offset;
1625 } else if (uio_resid) {
1626 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1627 upl_f_offset = uio->uio_offset - start_offset;
1628 } else {
1629 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1630 upl_f_offset = zero_off1 - start_offset;
1631 }
1632 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1633 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1634
1635 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1636 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1637
1638 /*
1639 * compute the size of the upl needed to encompass
1640 * the requested write... limit each call to cluster_io
1641 * to the maximum UPL size... cluster_io will clip if
1642                  * this exceeds the maximum io_size for the device...
1643 * make sure to account for
1644 * a starting offset that's not page aligned
1645 */
1646 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1647
1648 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1649 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1650
1651 pages_in_upl = upl_size / PAGE_SIZE;
1652 io_size = upl_size - start_offset;
1653
1654 if ((long long)io_size > total_size)
1655 io_size = total_size;
1656
1657 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1658 last_blkno = start_blkno + pages_in_upl;
1659
1660 kret = ubc_create_upl(vp,
1661 upl_f_offset,
1662 upl_size,
1663 &upl,
1664 &pl,
1665 UPL_FLAGS_NONE);
1666 if (kret != KERN_SUCCESS)
1667 panic("cluster_write: failed to get pagelist");
1668
1669 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1670 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1671
1672 if (start_offset && !upl_valid_page(pl, 0)) {
1673 int read_size;
1674
1675 /*
1676 * we're starting in the middle of the first page of the upl
1677 * and the page isn't currently valid, so we're going to have
1678 * to read it in first... this is a synchronous operation
1679 */
1680 read_size = PAGE_SIZE;
1681
1682 if ((upl_f_offset + read_size) > newEOF)
1683 read_size = newEOF - upl_f_offset;
1684
1685 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1686 CL_READ, (struct buf *)0, (struct clios *)0);
1687 if (retval) {
1688 /*
1689 * we had an error during the read which causes us to abort
1690 * the current cluster_write request... before we do, we need
1691 * to release the rest of the pages in the upl without modifying
1692                          * their state and mark the failed page in error
1693 */
1694 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1695 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1696
1697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1698 (int)upl, 0, 0, retval, 0);
1699 break;
1700 }
1701 }
1702 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1703 /*
1704 * the last offset we're writing to in this upl does not end on a page
1705 * boundary... if it's not beyond the old EOF, then we'll also need to
1706 * pre-read this page in if it isn't already valid
1707 */
1708 upl_offset = upl_size - PAGE_SIZE;
1709
1710 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1711 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1712 int read_size;
1713
1714 read_size = PAGE_SIZE;
1715
1716 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1717 read_size = newEOF - (upl_f_offset + upl_offset);
1718
1719 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1720 CL_READ, (struct buf *)0, (struct clios *)0);
1721 if (retval) {
1722 /*
1723 * we had an error during the read which causes us to abort
1724 * the current cluster_write request... before we do, we
1725 * need to release the rest of the pages in the upl without
1726                                  * modifying their state and mark the failed page in error
1727 */
1728 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1729 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1730
1731 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1732 (int)upl, 0, 0, retval, 0);
1733 break;
1734 }
1735 }
1736 }
1737 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1738 panic("cluster_write: ubc_upl_map failed\n");
1739 xfer_resid = io_size;
1740 io_offset = start_offset;
1741
1742 while (zero_cnt && xfer_resid) {
1743
1744 if (zero_cnt < (long long)xfer_resid)
1745 bytes_to_zero = zero_cnt;
1746 else
1747 bytes_to_zero = xfer_resid;
1748
1749 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1750 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1751
1752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1753 (int)upl_f_offset + io_offset, bytes_to_zero,
1754 (int)io_offset, xfer_resid, 0);
1755 } else {
1756 int zero_pg_index;
1757
1758 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1759 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1760
1761 if ( !upl_valid_page(pl, zero_pg_index)) {
1762 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1763
1764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1765 (int)upl_f_offset + io_offset, bytes_to_zero,
1766 (int)io_offset, xfer_resid, 0);
1767
1768 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1769 !upl_dirty_page(pl, zero_pg_index)) {
1770 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1771
1772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1773 (int)upl_f_offset + io_offset, bytes_to_zero,
1774 (int)io_offset, xfer_resid, 0);
1775 }
1776 }
1777 xfer_resid -= bytes_to_zero;
1778 zero_cnt -= bytes_to_zero;
1779 zero_off += bytes_to_zero;
1780 io_offset += bytes_to_zero;
1781 }
1782 if (xfer_resid && uio_resid) {
1783 bytes_to_move = min(uio_resid, xfer_resid);
1784
1785 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1786 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1787
1788 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1789
1790
1791 if (retval) {
1792 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1793 panic("cluster_write: kernel_upl_unmap failed\n");
1794
1795 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1796
1797 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1798 (int)upl, 0, 0, retval, 0);
1799 } else {
1800 uio_resid -= bytes_to_move;
1801 xfer_resid -= bytes_to_move;
1802 io_offset += bytes_to_move;
1803 }
1804 }
1805 while (xfer_resid && zero_cnt1 && retval == 0) {
1806
1807 if (zero_cnt1 < (long long)xfer_resid)
1808 bytes_to_zero = zero_cnt1;
1809 else
1810 bytes_to_zero = xfer_resid;
1811
1812 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1813 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1814
1815 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1816 (int)upl_f_offset + io_offset,
1817 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1818 } else {
1819 int zero_pg_index;
1820
1821 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1822 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1823
1824 if ( !upl_valid_page(pl, zero_pg_index)) {
1825 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1826
1827 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1828 (int)upl_f_offset + io_offset,
1829 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1830
1831 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1832 !upl_dirty_page(pl, zero_pg_index)) {
1833 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1834
1835 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1836 (int)upl_f_offset + io_offset,
1837 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1838 }
1839 }
1840 xfer_resid -= bytes_to_zero;
1841 zero_cnt1 -= bytes_to_zero;
1842 zero_off1 += bytes_to_zero;
1843 io_offset += bytes_to_zero;
1844 }
1845
1846 if (retval == 0) {
1847 int cl_index;
1848 int can_delay;
1849
1850 io_size += start_offset;
1851
1852 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1853 /*
1854 * if we're extending the file with this write
1855 * we'll zero fill the rest of the page so that
1856 * if the file gets extended again in such a way as to leave a
1857                          * hole starting at this EOF, we'll have zeros in the correct spot
1858 */
1859 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1860
1861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1862 (int)upl_f_offset + io_size,
1863 upl_size - io_size, 0, 0, 0);
1864 }
1865 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1866 panic("cluster_write: kernel_upl_unmap failed\n");
1867
1868 if (flags & IO_SYNC)
1869 /*
1870                          * if the IO_SYNC flag is set then we need to
1871 * bypass any clusters and immediately issue
1872 * the I/O
1873 */
1874 goto issue_io;
1875
1876 if (vp->v_clen == 0)
1877 /*
1878 * no clusters currently present
1879 */
1880 goto start_new_cluster;
1881
1882 /*
1883 * keep track of the overall dirty page
1884 * range we've developed
1885 * in case we have to fall back to the
1886 * VHASDIRTY method of flushing
1887 */
1888 if (vp->v_flag & VHASDIRTY)
1889 goto delay_io;
1890
1891 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1892 /*
1893 * we have an existing cluster... see if this write will extend it nicely
1894 */
1895 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1896 /*
1897 * the current write starts at or after the current cluster
1898 */
1899 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1900 /*
1901 * we have a write that fits entirely
1902 * within the existing cluster limits
1903 */
1904 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1905 /*
1906 * update our idea of where the cluster ends
1907 */
1908 vp->v_clusters[cl_index].last_pg = last_blkno;
1909 break;
1910 }
1911 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1912 /*
1913 * we have a write that starts in the middle of the current cluster
1914 * but extends beyond the cluster's limit
1915 * we'll clip the current cluster if we actually
1916 * overlap with the new write
1917 * and start a new cluster with the current write
1918 */
1919 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1920 vp->v_clusters[cl_index].last_pg = start_blkno;
1921 }
1922 /*
1923 * we also get here for the case where the current write starts
1924 * beyond the limit of the existing cluster
1925 *
1926 * in either case, we'll check the remaining clusters before
1927 * starting a new one
1928 */
1929 } else {
1930 /*
1931 * the current write starts in front of the current cluster
1932 */
1933 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1934 /*
1935 * we can just merge the old cluster
1936 * with the new request and leave it
1937 * in the cache
1938 */
1939 vp->v_clusters[cl_index].start_pg = start_blkno;
1940
1941 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1942 /*
1943 * the current write completely
1944 * envelops the existing cluster
1945 */
1946 vp->v_clusters[cl_index].last_pg = last_blkno;
1947 }
1948 break;
1949 }
1950
1951 /*
1952 * if we were to combine this write with the current cluster
1953 * we would exceed the cluster size limit.... so,
1954 * let's see if there's any overlap of the new I/O with
1955 * the existing cluster...
1956 *
1957 */
1958 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1959 /*
1960 * the current write extends into the existing cluster
1961 * clip the current cluster by moving the start position
1962 * to where the current write ends
1963 */
1964 vp->v_clusters[cl_index].start_pg = last_blkno;
1965 /*
1966 * if we get here, there was no way to merge
1967 * the new I/O with this cluster and
1968 * keep it under our maximum cluster length
1969 * we'll check the remaining clusters before starting a new one
1970 */
1971 }
1972 }
1973 if (cl_index < vp->v_clen)
1974 /*
1975 * we found an existing cluster that we
1976 	                 * could merge this I/O into
1977 */
1978 goto delay_io;
1979
1980 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1981 /*
1982 * we didn't find an existing cluster to
1983 * merge into, but there's room to start
1984 * a new one
1985 */
1986 goto start_new_cluster;
1987
1988 /*
1989 	         * no existing cluster to merge with and no
1990 * room to start a new one... we'll try
1991 * pushing the existing ones... if none of
1992 * them are able to be pushed, we'll have
1993 * to fall back on the VHASDIRTY mechanism
1994 * cluster_try_push will set v_clen to the
1995 * number of remaining clusters if it is
1996 * unable to push all of them
1997 */
1998 if (vp->v_flag & VNOCACHE_DATA)
1999 can_delay = 0;
2000 else
2001 can_delay = 1;
2002
2003 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2004 vp->v_flag |= VHASDIRTY;
2005 goto delay_io;
2006 }
2007 start_new_cluster:
2008 if (vp->v_clen == 0) {
2009 vp->v_ciosiz = devblocksize;
2010 vp->v_cstart = start_blkno;
2011 vp->v_lastw = last_blkno;
2012 }
2013 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2014 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2015 vp->v_clen++;
2016 delay_io:
2017 /*
2018 * make sure we keep v_cstart and v_lastw up to
2019 * date in case we have to fall back on the
2020 	         * VHASDIRTY mechanism (or we've already entered it)
2021 */
2022 if (start_blkno < vp->v_cstart)
2023 vp->v_cstart = start_blkno;
2024 if (last_blkno > vp->v_lastw)
2025 vp->v_lastw = last_blkno;
2026
2027 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2028 continue;
2029 issue_io:
2030 /*
2031 * in order to maintain some semblance of coherency with mapped writes
2032 * we need to write the cluster back out as a multiple of the PAGESIZE
2033 * unless the cluster encompasses the last page of the file... in this
2034 * case we'll round out to the nearest device block boundary
2035 */
2036 io_size = upl_size;
2037
2038 if ((upl_f_offset + io_size) > newEOF) {
2039 io_size = newEOF - upl_f_offset;
2040 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2041 }
2042
2043 if (flags & IO_SYNC)
2044 io_flags = CL_COMMIT | CL_AGE;
2045 else
2046 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2047
2048 if (vp->v_flag & VNOCACHE_DATA)
2049 io_flags |= CL_DUMP;
2050
2051 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2052 vp->v_flag |= VTHROTTLED;
2053 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2054 }
2055 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2056 io_flags, (struct buf *)0, (struct clios *)0);
2057 }
2058 }
2059 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2060 retval, 0, 0, 0, 0);
2061
2062 return (retval);
2063 }
2064
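/*
 * cluster_read is the top-level read entry point... it either hands the
 * request straight to the cached path (cluster_read_x), or, for un-cached
 * (VNOCACHE_DATA) user-space requests, carves the uio up so that page
 * aligned, page multiple pieces go through cluster_nocopy_read and any
 * physically contiguous targets go through cluster_phys_read
 */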
2065 int
2066 cluster_read(vp, uio, filesize, devblocksize, flags)
2067 struct vnode *vp;
2068 struct uio *uio;
2069 off_t filesize;
2070 int devblocksize;
2071 int flags;
2072 {
2073 int prev_resid;
2074 int clip_size;
2075 off_t max_io_size;
2076 struct iovec *iov;
2077 vm_offset_t upl_offset;
2078 int upl_size;
2079 int pages_in_pl;
2080 upl_page_info_t *pl;
2081 int upl_flags;
2082 upl_t upl;
2083 int retval = 0;
2084
2085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2086 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2087
2088 	  /*
2089 	   * take the normal cached read path unless this is an
2090 	   * un-cached (VNOCACHE_DATA) request coming from user space
2091 	   */
2092
2093 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2094 {
2095 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2097 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2098 return(retval);
2099 }
2100
2101 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2102 {
2103 /* we know we have a resid, so this is safe */
2104 iov = uio->uio_iov;
2105 while (iov->iov_len == 0) {
2106 uio->uio_iov++;
2107 uio->uio_iovcnt--;
2108 iov = uio->uio_iov;
2109 }
2110
2111 /*
2112 * We check every vector target and if it is physically
2113 * contiguous space, we skip the sanity checks.
2114 */
2115
2116 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2117 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2118 pages_in_pl = 0;
2119 upl_flags = UPL_QUERY_OBJECT_TYPE;
2120 if((vm_map_get_upl(current_map(),
2121 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2122 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2123 {
2124 /*
2125 * the user app must have passed in an invalid address
2126 */
2127 return (EFAULT);
2128 }
2129
2130 if (upl_flags & UPL_PHYS_CONTIG)
2131 {
2132 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2133 }
2134 else if (uio->uio_resid < 4 * PAGE_SIZE)
2135 {
2136 /*
2137 	       * We set a threshold of 4 pages to decide if the nocopy
2138 * read loop is worth the trouble...
2139 */
2140 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2142 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2143 return(retval);
2144 }
2145 else if (uio->uio_offset & PAGE_MASK_64)
2146 {
2147 /* Bring the file offset read up to a pagesize boundary */
2148 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2149 if (uio->uio_resid < clip_size)
2150 clip_size = uio->uio_resid;
2151 /*
2152 * Fake the resid going into the cluster_read_x call
2153 * and restore it on the way out.
2154 */
2155 prev_resid = uio->uio_resid;
2156 uio->uio_resid = clip_size;
2157 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2158 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2159 }
2160 else if ((int)iov->iov_base & PAGE_MASK_64)
2161 {
2162 clip_size = iov->iov_len;
2163 prev_resid = uio->uio_resid;
2164 uio->uio_resid = clip_size;
2165 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2166 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2167 }
2168 else
2169 {
2170 /*
2171 * If we come in here, we know the offset into
2172 * the file is on a pagesize boundary
2173 */
2174
2175 max_io_size = filesize - uio->uio_offset;
2176 clip_size = uio->uio_resid;
2177 if (iov->iov_len < clip_size)
2178 clip_size = iov->iov_len;
2179 if (max_io_size < clip_size)
2180 clip_size = (int)max_io_size;
2181
2182 if (clip_size < PAGE_SIZE)
2183 {
2184 /*
2185 * Take care of the tail end of the read in this vector.
2186 */
2187 prev_resid = uio->uio_resid;
2188 uio->uio_resid = clip_size;
2189 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2190 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2191 }
2192 else
2193 {
2194 /* round clip_size down to a multiple of pagesize */
2195 clip_size = clip_size & ~(PAGE_MASK);
2196 prev_resid = uio->uio_resid;
2197 uio->uio_resid = clip_size;
2198 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2199 if ((retval==0) && uio->uio_resid)
2200 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2201 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2202 }
2203 } /* end else */
2204 } /* end while */
2205
2206 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2207 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2208
2209 return(retval);
2210 }
2211
2212
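/*
 * cluster_read_x handles the normal cached read path... it builds a upl
 * over the requested range, issues a single synchronous cluster_io for any
 * invalid pages, copies the data out to the caller via uiomove, kicks off
 * read-ahead / pre-fetch when appropriate, and then commits or aborts the
 * upl pages depending on the outcome and the VNOCACHE_DATA setting
 */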
2213 static int
2214 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2215 struct vnode *vp;
2216 struct uio *uio;
2217 off_t filesize;
2218 int devblocksize;
2219 int flags;
2220 {
2221 upl_page_info_t *pl;
2222 upl_t upl;
2223 vm_offset_t upl_offset;
2224 int upl_size;
2225 off_t upl_f_offset;
2226 int start_offset;
2227 int start_pg;
2228 int last_pg;
2229 int uio_last;
2230 int pages_in_upl;
2231 off_t max_size;
2232 int io_size;
2233 vm_offset_t io_address;
2234 kern_return_t kret;
2235 int segflg;
2236 int error = 0;
2237 int retval = 0;
2238 int b_lblkno;
2239 int e_lblkno;
2240
2241 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2242
2243 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2244 /*
2245 * compute the size of the upl needed to encompass
2246 * the requested read... limit each call to cluster_io
2247 * to the maximum UPL size... cluster_io will clip if
2248 	         * this exceeds the maximum io_size for the device...
2249 	         * also make sure to account for a starting offset
2250 	         * that's not page aligned
2251 */
2252 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2253 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2254 max_size = filesize - uio->uio_offset;
2255
2256 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2257 io_size = uio->uio_resid;
2258 else
2259 io_size = max_size;
2260
2261 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2262 segflg = uio->uio_segflg;
2263
2264 uio->uio_segflg = UIO_PHYS_USERSPACE;
2265
2266 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2267 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2268
2269 while (io_size && retval == 0) {
2270 int xsize;
2271 vm_offset_t paddr;
2272
2273 if (ubc_page_op(vp,
2274 upl_f_offset,
2275 UPL_POP_SET | UPL_POP_BUSY,
2276 &paddr, 0) != KERN_SUCCESS)
2277 break;
2278
2279 xsize = PAGE_SIZE - start_offset;
2280
2281 if (xsize > io_size)
2282 xsize = io_size;
2283
2284 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2285
2286 ubc_page_op(vp, upl_f_offset,
2287 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2288
2289 io_size -= xsize;
2290 start_offset = (int)
2291 (uio->uio_offset & PAGE_MASK_64);
2292 upl_f_offset = uio->uio_offset - start_offset;
2293 }
2294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2295 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2296
2297 uio->uio_segflg = segflg;
2298
2299 if (retval)
2300 break;
2301
2302 if (io_size == 0) {
2303 /*
2304 * we're already finished with this read request
2305 * let's see if we should do a read-ahead
2306 */
2307 e_lblkno = (int)
2308 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2309
2310 if (!(vp->v_flag & VRAOFF))
2311 /*
2312 * let's try to read ahead if we're in
2313 * a sequential access pattern
2314 */
2315 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2316 vp->v_lastr = e_lblkno;
2317
2318 break;
2319 }
2320 max_size = filesize - uio->uio_offset;
2321 }
2322 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2323 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2324 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2325 pages_in_upl = upl_size / PAGE_SIZE;
2326
2327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2328 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2329
2330 kret = ubc_create_upl(vp,
2331 upl_f_offset,
2332 upl_size,
2333 &upl,
2334 &pl,
2335 UPL_FLAGS_NONE);
2336 if (kret != KERN_SUCCESS)
2337 panic("cluster_read: failed to get pagelist");
2338
2339 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2340 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2341
2342 /*
2343 * scan from the beginning of the upl looking for the first
2344 * non-valid page.... this will become the first page in
2345 * the request we're going to make to 'cluster_io'... if all
2346 * of the pages are valid, we won't call through to 'cluster_io'
2347 */
2348 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2349 if (!upl_valid_page(pl, start_pg))
2350 break;
2351 }
2352
2353 /*
2354 * scan from the starting invalid page looking for a valid
2355 * page before the end of the upl is reached, if we
2356 * find one, then it will be the last page of the request to
2357 * 'cluster_io'
2358 */
2359 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2360 if (upl_valid_page(pl, last_pg))
2361 break;
2362 }
2363
2364 if (start_pg < last_pg) {
2365 /*
2366 * we found a range of 'invalid' pages that must be filled
2367 * if the last page in this range is the last page of the file
2368 * we may have to clip the size of it to keep from reading past
2369 * the end of the last physical block associated with the file
2370 */
2371 upl_offset = start_pg * PAGE_SIZE;
2372 io_size = (last_pg - start_pg) * PAGE_SIZE;
2373
2374 if ((upl_f_offset + upl_offset + io_size) > filesize)
2375 io_size = filesize - (upl_f_offset + upl_offset);
2376
2377 /*
2378 * issue a synchronous read to cluster_io
2379 */
2380
2381 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2382 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2383 }
2384 if (error == 0) {
2385 /*
2386 * if the read completed successfully, or there was no I/O request
2387 	         * issued, then map the upl into kernel address space and
2388 * move the data into user land.... we'll first add on any 'valid'
2389 * pages that were present in the upl when we acquired it.
2390 */
2391 u_int val_size;
2392 u_int size_of_prefetch;
2393
2394 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2395 if (!upl_valid_page(pl, uio_last))
2396 break;
2397 }
2398 /*
2399 * compute size to transfer this round, if uio->uio_resid is
2400 * still non-zero after this uiomove, we'll loop around and
2401 * set up for another I/O.
2402 */
2403 val_size = (uio_last * PAGE_SIZE) - start_offset;
2404
2405 if (max_size < val_size)
2406 val_size = max_size;
2407
2408 if (uio->uio_resid < val_size)
2409 val_size = uio->uio_resid;
2410
2411 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2412
2413 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2414 /*
2415 * if there's still I/O left to do for this request, then issue a
2416 * pre-fetch I/O... the I/O wait time will overlap
2417 * with the copying of the data
2418 */
2419 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2420 } else {
2421 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2422 /*
2423 * let's try to read ahead if we're in
2424 * a sequential access pattern
2425 */
2426 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2427 vp->v_lastr = e_lblkno;
2428 }
2429 if (uio->uio_segflg == UIO_USERSPACE) {
2430 int offset;
2431
2432 segflg = uio->uio_segflg;
2433
2434 uio->uio_segflg = UIO_PHYS_USERSPACE;
2435
2436
2437 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2438 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2439
2440 offset = start_offset;
2441
2442 while (val_size && retval == 0) {
2443 int csize;
2444 int i;
2445 caddr_t paddr;
2446
2447 i = offset / PAGE_SIZE;
2448 csize = min(PAGE_SIZE - start_offset, val_size);
2449
2450 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2451
2452 retval = uiomove(paddr, csize, uio);
2453
2454 val_size -= csize;
2455 offset += csize;
2456 start_offset = offset & PAGE_MASK;
2457 }
2458 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2459 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2460
2461 uio->uio_segflg = segflg;
2462 }
2463 else
2464 {
2465 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2466 panic("cluster_read: ubc_upl_map() failed\n");
2467
2468 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2469
2470 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2471 panic("cluster_read: ubc_upl_unmap() failed\n");
2472 }
2473 }
2474 if (start_pg < last_pg) {
2475 /*
2476 * compute the range of pages that we actually issued an I/O for
2477 * and either commit them as valid if the I/O succeeded
2478 * or abort them if the I/O failed
2479 */
2480 io_size = (last_pg - start_pg) * PAGE_SIZE;
2481
2482 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2483 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2484
2485 if (error || (vp->v_flag & VNOCACHE_DATA))
2486 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2487 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2488 else
2489 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2490 UPL_COMMIT_CLEAR_DIRTY
2491 | UPL_COMMIT_FREE_ON_EMPTY
2492 | UPL_COMMIT_INACTIVATE);
2493
2494 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2495 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2496 }
2497 if ((last_pg - start_pg) < pages_in_upl) {
2498 int cur_pg;
2499 int commit_flags;
2500
2501 /*
2502 * the set of pages that we issued an I/O for did not encompass
2503 * the entire upl... so just release these without modifying
2504 	         * their state
2505 */
2506 if (error)
2507 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2508 else {
2509 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2510 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2511
2512 if (start_pg) {
2513 /*
2514 * we found some already valid pages at the beginning of
2515 	                         * the upl... commit these back to the inactive list with
2516 * reference cleared
2517 */
2518 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2519 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2520 | UPL_COMMIT_INACTIVATE;
2521
2522 if (upl_dirty_page(pl, cur_pg))
2523 commit_flags |= UPL_COMMIT_SET_DIRTY;
2524
2525 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2526 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2527 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2528 else
2529 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2530 PAGE_SIZE, commit_flags);
2531 }
2532 }
2533 if (last_pg < uio_last) {
2534 /*
2535 * we found some already valid pages immediately after the
2536 * pages we issued I/O for, commit these back to the
2537 * inactive list with reference cleared
2538 */
2539 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2540 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2541 | UPL_COMMIT_INACTIVATE;
2542
2543 if (upl_dirty_page(pl, cur_pg))
2544 commit_flags |= UPL_COMMIT_SET_DIRTY;
2545
2546 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2547 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2548 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2549 else
2550 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2551 PAGE_SIZE, commit_flags);
2552 }
2553 }
2554 if (uio_last < pages_in_upl) {
2555 /*
2556 * there were some invalid pages beyond the valid pages
2557 * that we didn't issue an I/O for, just release them
2558 * unchanged
2559 */
2560 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2561 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2562 }
2563
2564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2565 (int)upl, -1, -1, 0, 0);
2566 }
2567 }
2568 if (retval == 0)
2569 retval = error;
2570 }
2571
2572 return (retval);
2573 }
2574
2575
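/*
 * cluster_nocopy_read reads directly into the user's buffer without
 * copying through the cache... any pages already resident in the cache
 * are moved out via uiomove first, then the user pages are wired down
 * with vm_map_get_upl and a synchronous cluster_io (CL_READ | CL_NOZERO)
 * is issued against them
 */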
2576 static int
2577 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2578 struct vnode *vp;
2579 struct uio *uio;
2580 off_t filesize;
2581 int devblocksize;
2582 int flags;
2583 {
2584 upl_t upl;
2585 upl_page_info_t *pl;
2586 off_t upl_f_offset;
2587 vm_offset_t upl_offset;
2588 off_t start_upl_f_offset;
2589 off_t max_io_size;
2590 int io_size;
2591 int upl_size;
2592 int upl_needed_size;
2593 int pages_in_pl;
2594 vm_offset_t paddr;
2595 int upl_flags;
2596 kern_return_t kret;
2597 int segflg;
2598 struct iovec *iov;
2599 int i;
2600 int force_data_sync;
2601 int error = 0;
2602 int retval = 0;
2603
2604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2605 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2606
2607 /*
2608 * When we enter this routine, we know
2609 * -- the offset into the file is on a pagesize boundary
2610 * -- the resid is a page multiple
2611 * -- the resid will not exceed iov_len
2612 */
2613
2614 iov = uio->uio_iov;
2615 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2616
2617 max_io_size = filesize - uio->uio_offset;
2618
2619 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2620 io_size = max_io_size;
2621 else
2622 io_size = uio->uio_resid;
2623
2624 /*
2625 * We don't come into this routine unless
2626 * UIO_USERSPACE is set.
2627 */
2628 segflg = uio->uio_segflg;
2629
2630 uio->uio_segflg = UIO_PHYS_USERSPACE;
2631
2632 /*
2633 * First look for pages already in the cache
2634 * and move them to user space.
2635 */
2636 while (io_size && (retval == 0)) {
2637 upl_f_offset = uio->uio_offset;
2638
2639 /*
2640 * If this call fails, it means the page is not
2641 * in the page cache.
2642 */
2643 if (ubc_page_op(vp, upl_f_offset,
2644 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2645 break;
2646
2647 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2648
2649 ubc_page_op(vp, upl_f_offset,
2650 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2651
2652 io_size -= PAGE_SIZE;
2653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2654 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2655 }
2656
2657 uio->uio_segflg = segflg;
2658
2659 if (retval)
2660 {
2661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2662 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2663 return(retval);
2664 }
2665
2666 /* If we are already finished with this read, then return */
2667 if (io_size == 0)
2668 {
2669
2670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2671 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2672 return(0);
2673 }
2674
2675 max_io_size = io_size;
2676 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2677 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2678
2679 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2680 upl_f_offset = start_upl_f_offset;
2681 io_size = 0;
2682
2683 while(io_size < max_io_size)
2684 {
2685
2686 if(ubc_page_op(vp, upl_f_offset,
2687 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2688 {
2689 ubc_page_op(vp, upl_f_offset,
2690 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2691 break;
2692 }
2693
2694 /*
2695 * Build up the io request parameters.
2696 */
2697
2698 io_size += PAGE_SIZE;
2699 upl_f_offset += PAGE_SIZE;
2700 }
2701
2702 if (io_size == 0)
2703 return(retval);
2704
2705 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2706 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2707
2708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2709 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2710
2711 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2712 {
2713 pages_in_pl = 0;
2714 upl_size = upl_needed_size;
2715 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2716
2717 kret = vm_map_get_upl(current_map(),
2718 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2719 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2720
2721 if (kret != KERN_SUCCESS)
2722 {
2723 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2724 (int)upl_offset, upl_size, io_size, kret, 0);
2725
2726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2727 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2728
2729 /* cluster_nocopy_read: failed to get pagelist */
2730 /* do not return kret here */
2731 return(retval);
2732 }
2733
2734 pages_in_pl = upl_size / PAGE_SIZE;
2735 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2736
2737 for(i=0; i < pages_in_pl; i++)
2738 {
2739 if (!upl_valid_page(pl, i))
2740 break;
2741 }
2742 if (i == pages_in_pl)
2743 break;
2744
2745 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2746 UPL_ABORT_FREE_ON_EMPTY);
2747 }
2748
2749 if (force_data_sync >= 3)
2750 {
2751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2752 (int)upl_offset, upl_size, io_size, kret, 0);
2753
2754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2755 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2756 return(retval);
2757 }
2758 /*
2759 * Consider the possibility that upl_size wasn't satisfied.
2760 */
2761 if (upl_size != upl_needed_size)
2762 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2763
2764 if (io_size == 0)
2765 {
2766 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2767 UPL_ABORT_FREE_ON_EMPTY);
2768 return(retval);
2769 }
2770
2771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2772 (int)upl_offset, upl_size, io_size, kret, 0);
2773
2774 /*
2775 * issue a synchronous read to cluster_io
2776 */
2777
2778 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2779 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2780
2781 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2782 io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0);
2783
2784 if (error == 0) {
2785 /*
2786 * The cluster_io read completed successfully,
2787 * update the uio structure and commit.
2788 */
2789
2790 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2791 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2792
2793 iov->iov_base += io_size;
2794 iov->iov_len -= io_size;
2795 uio->uio_resid -= io_size;
2796 uio->uio_offset += io_size;
2797 }
2798 else {
2799 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2800 UPL_ABORT_FREE_ON_EMPTY);
2801 }
2802
2803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2804 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2805
2806 if (retval == 0)
2807 retval = error;
2808
2809 } /* end while */
2810
2811
2812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2813 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2814
2815 return (retval);
2816 }
2817
2818
2819
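/*
 * cluster_phys_read handles a read into a physically contiguous user
 * buffer... unaligned head and tail fragments are handled through
 * cluster_align_phys_io, while the device block aligned middle is issued
 * as throttled asynchronous I/O (CL_DEV_MEMORY) tracked by a clios state
 * block
 */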
2820 static int
2821 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2822 struct vnode *vp;
2823 struct uio *uio;
2824 off_t filesize;
2825 int devblocksize;
2826 int flags;
2827 {
2828 upl_page_info_t *pl;
2829 upl_t upl;
2830 vm_offset_t upl_offset;
2831 vm_offset_t dst_paddr;
2832 off_t max_size;
2833 int io_size;
2834 int tail_size;
2835 int upl_size;
2836 int upl_needed_size;
2837 int pages_in_pl;
2838 int upl_flags;
2839 kern_return_t kret;
2840 struct iovec *iov;
2841 struct clios iostate;
2842 int error;
2843
2844 /*
2845 * When we enter this routine, we know
2846 * -- the resid will not exceed iov_len
2847 * -- the target address is physically contiguous
2848 */
2849
2850 iov = uio->uio_iov;
2851
2852 max_size = filesize - uio->uio_offset;
2853
2854 if (max_size > (off_t)((unsigned int)iov->iov_len))
2855 io_size = iov->iov_len;
2856 else
2857 io_size = max_size;
2858
2859 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2860 upl_needed_size = upl_offset + io_size;
2861
2862 error = 0;
2863 pages_in_pl = 0;
2864 upl_size = upl_needed_size;
2865 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2866
2867 kret = vm_map_get_upl(current_map(),
2868 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2869 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2870
2871 if (kret != KERN_SUCCESS) {
2872 /*
2873 * cluster_phys_read: failed to get pagelist
2874 */
2875 return(EINVAL);
2876 }
2877 if (upl_size < upl_needed_size) {
2878 /*
2879 * The upl_size wasn't satisfied.
2880 */
2881 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2882
2883 return(EINVAL);
2884 }
2885 pl = ubc_upl_pageinfo(upl);
2886
2887 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2888
2889 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2890 int head_size;
2891
2892 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2893
2894 if (head_size > io_size)
2895 head_size = io_size;
2896
2897 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2898
2899 if (error) {
2900 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2901
2902 return(EINVAL);
2903 }
2904 upl_offset += head_size;
2905 dst_paddr += head_size;
2906 io_size -= head_size;
2907 }
2908 tail_size = io_size & (devblocksize - 1);
2909 io_size -= tail_size;
2910
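	/*
	 * set up the clios state used to throttle the async reads issued
	 * below and to wait for them all to complete before returning
	 */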
2911 iostate.io_completed = 0;
2912 iostate.io_issued = 0;
2913 iostate.io_error = 0;
2914 iostate.io_wanted = 0;
2915
2916 while (io_size && error == 0) {
2917 int xsize;
2918
2919 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2920 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2921 else
2922 xsize = io_size;
2923 /*
2924 * request asynchronously so that we can overlap
2925 	         * the preparation of the next I/O... we'll release the
2926 	         * upl once all the I/O has completed, since it's all
2927 	         * issued against the same UPL...
2928 * if there are already too many outstanding reads
2929 * throttle back until we reach a more reasonable level
2930 */
2931 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2932 iostate.io_wanted = 1;
2933 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2934 }
2935
2936 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2937 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
2938 (struct buf *)0, &iostate);
2939 /*
2940 * The cluster_io read was issued successfully,
2941 * update the uio structure
2942 */
2943 if (error == 0) {
2944 uio->uio_resid -= xsize;
2945 iov->iov_len -= xsize;
2946 iov->iov_base += xsize;
2947 uio->uio_offset += xsize;
2948 dst_paddr += xsize;
2949 upl_offset += xsize;
2950 io_size -= xsize;
2951 }
2952 }
2953 /*
2954 * make sure any async reads have completed before
2955 * we proceed
2956 */
2957 while (iostate.io_issued != iostate.io_completed) {
2958 iostate.io_wanted = 1;
2959 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2960 }
2961 if (iostate.io_error) {
2962 error = iostate.io_error;
2963 }
2964 if (error == 0 && tail_size)
2965 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
2966
2967 /*
2968 * just release our hold on the physically contiguous
2969 * region without changing any state
2970 */
2971 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2972
2973 return (error);
2974 }
2975
2976
2977 /*
2978 * generate advisory I/O's in the largest chunks possible
2979 * the completed pages will be released into the VM cache
2980 */
2981 int
2982 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2983 struct vnode *vp;
2984 off_t filesize;
2985 off_t f_offset;
2986 int resid;
2987 int devblocksize;
2988 {
2989 upl_page_info_t *pl;
2990 upl_t upl;
2991 vm_offset_t upl_offset;
2992 int upl_size;
2993 off_t upl_f_offset;
2994 int start_offset;
2995 int start_pg;
2996 int last_pg;
2997 int pages_in_upl;
2998 off_t max_size;
2999 int io_size;
3000 kern_return_t kret;
3001 int retval = 0;
3002 int issued_io;
3003
3004 if (!UBCINFOEXISTS(vp))
3005 return(EINVAL);
3006
3007 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3008 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3009
3010 while (resid && f_offset < filesize && retval == 0) {
3011 /*
3012 * compute the size of the upl needed to encompass
3013 * the requested read... limit each call to cluster_io
3014 * to the maximum UPL size... cluster_io will clip if
3015 	         * this exceeds the maximum io_size for the device...
3016 	         * also make sure to account for a starting offset
3017 	         * that's not page aligned
3018 */
3019 start_offset = (int)(f_offset & PAGE_MASK_64);
3020 upl_f_offset = f_offset - (off_t)start_offset;
3021 max_size = filesize - f_offset;
3022
3023 if (resid < max_size)
3024 io_size = resid;
3025 else
3026 io_size = max_size;
3027
3028 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3029 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3030 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3031 pages_in_upl = upl_size / PAGE_SIZE;
3032
3033 kret = ubc_create_upl(vp,
3034 upl_f_offset,
3035 upl_size,
3036 &upl,
3037 &pl,
3038 UPL_RET_ONLY_ABSENT);
3039 if (kret != KERN_SUCCESS)
3040 return(retval);
3041 issued_io = 0;
3042
3043 /*
3044 * before we start marching forward, we must make sure we end on
3045 * a present page, otherwise we will be working with a freed
3046 * upl
3047 */
3048 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3049 if (upl_page_present(pl, last_pg))
3050 break;
3051 }
3052 pages_in_upl = last_pg + 1;
3053
3054
3055 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3056 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3057
3058
3059 for (last_pg = 0; last_pg < pages_in_upl; ) {
3060 /*
3061 * scan from the beginning of the upl looking for the first
3062 * page that is present.... this will become the first page in
3063 * the request we're going to make to 'cluster_io'... if all
3064 * of the pages are absent, we won't call through to 'cluster_io'
3065 */
3066 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3067 if (upl_page_present(pl, start_pg))
3068 break;
3069 }
3070
3071 /*
3072 * scan from the starting present page looking for an absent
3073 * page before the end of the upl is reached, if we
3074 * find one, then it will terminate the range of pages being
3075 * presented to 'cluster_io'
3076 */
3077 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3078 if (!upl_page_present(pl, last_pg))
3079 break;
3080 }
3081
3082 if (last_pg > start_pg) {
3083 /*
3084 * we found a range of pages that must be filled
3085 * if the last page in this range is the last page of the file
3086 * we may have to clip the size of it to keep from reading past
3087 * the end of the last physical block associated with the file
3088 */
3089 upl_offset = start_pg * PAGE_SIZE;
3090 io_size = (last_pg - start_pg) * PAGE_SIZE;
3091
3092 if ((upl_f_offset + upl_offset + io_size) > filesize)
3093 io_size = filesize - (upl_f_offset + upl_offset);
3094
3095 /*
3096 * issue an asynchronous read to cluster_io
3097 */
3098 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3099 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3100
3101 issued_io = 1;
3102 }
3103 }
3104 if (issued_io == 0)
3105 ubc_upl_abort(upl, 0);
3106
3107 io_size = upl_size - start_offset;
3108
3109 if (io_size > resid)
3110 io_size = resid;
3111 f_offset += io_size;
3112 resid -= io_size;
3113 }
3114
3115 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3116 (int)f_offset, resid, retval, 0, 0);
3117
3118 return(retval);
3119 }
3120
3121
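/*
 * cluster_push flushes the delayed-write state for a vnode... if the
 * vnode has fallen into VHASDIRTY mode, the entire v_cstart..v_lastw
 * range is pushed in MAX_UPL_TRANSFER sized chunks, otherwise the
 * individual clusters are pushed via cluster_try_push
 */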
3122 int
3123 cluster_push(vp)
3124 struct vnode *vp;
3125 {
3126 int retval;
3127
3128 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3129 vp->v_flag &= ~VHASDIRTY;
3130 return(0);
3131 }
3132
3133 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3134 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3135
3136 if (vp->v_flag & VHASDIRTY) {
3137 daddr_t start_pg;
3138 daddr_t last_pg;
3139 daddr_t end_pg;
3140
3141 start_pg = vp->v_cstart;
3142 end_pg = vp->v_lastw;
3143
3144 vp->v_flag &= ~VHASDIRTY;
3145 vp->v_clen = 0;
3146
3147 while (start_pg < end_pg) {
3148 last_pg = start_pg + MAX_UPL_TRANSFER;
3149
3150 if (last_pg > end_pg)
3151 last_pg = end_pg;
3152
3153 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3154
3155 start_pg = last_pg;
3156 }
3157 return (1);
3158 }
3159 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3160
3161 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3162 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3163
3164 return (retval);
3165 }
3166
3167
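/*
 * cluster_try_push takes a sorted snapshot of the vnode's clusters,
 * attempts to push each one through cluster_push_x, and merges any
 * clusters it couldn't push back into the vnode (falling back to the
 * VHASDIRTY mechanism if they no longer fit)... the return value is the
 * number of free cluster slots left on the vnode
 */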
3168 static int
3169 cluster_try_push(vp, EOF, can_delay, push_all)
3170 struct vnode *vp;
3171 off_t EOF;
3172 int can_delay;
3173 int push_all;
3174 {
3175 int cl_index;
3176 int cl_index1;
3177 int min_index;
3178 int cl_len;
3179 int cl_total;
3180 int cl_pushed;
3181 struct v_cluster l_clusters[MAX_CLUSTERS];
3182
3183 /*
3184 * make a local 'sorted' copy of the clusters
3185 * and clear vp->v_clen so that new clusters can
3186 * be developed
3187 */
3188 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3189 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3190 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3191 continue;
3192 if (min_index == -1)
3193 min_index = cl_index1;
3194 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3195 min_index = cl_index1;
3196 }
3197 if (min_index == -1)
3198 break;
3199 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3200 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3201
3202 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3203 }
3204 cl_len = cl_index;
3205 vp->v_clen = 0;
3206
3207 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3208 /*
3209 * try to push each cluster in turn... cluster_push_x may not
3210 * push the cluster if can_delay is TRUE and the cluster doesn't
3211 	         * meet the criteria for an immediate push
3212 */
3213 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3214 l_clusters[cl_index].start_pg = 0;
3215 l_clusters[cl_index].last_pg = 0;
3216
3217 cl_pushed++;
3218
3219 if (push_all == 0)
3220 break;
3221 }
3222 }
3223 if (cl_len > cl_pushed) {
3224 /*
3225 * we didn't push all of the clusters, so
3226 	         * let's try to merge them back into the vnode
3227 */
3228 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3229 /*
3230 * we picked up some new clusters while we were trying to
3231 * push the old ones (I don't think this can happen because
3232 * I'm holding the lock, but just in case)... the sum of the
3233 * leftovers plus the new cluster count exceeds our ability
3234 * to represent them, so fall back to the VHASDIRTY mechanism
3235 */
3236 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3237 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3238 continue;
3239
3240 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3241 vp->v_cstart = l_clusters[cl_index].start_pg;
3242 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3243 vp->v_lastw = l_clusters[cl_index].last_pg;
3244 }
3245 vp->v_flag |= VHASDIRTY;
3246 } else {
3247 /*
3248 * we've got room to merge the leftovers back in
3249 * just append them starting at the next 'hole'
3250 * represented by vp->v_clen
3251 */
3252 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3253 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3254 continue;
3255
3256 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3257 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3258
3259 if (cl_index1 == 0) {
3260 vp->v_cstart = l_clusters[cl_index].start_pg;
3261 vp->v_lastw = l_clusters[cl_index].last_pg;
3262 } else {
3263 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3264 vp->v_cstart = l_clusters[cl_index].start_pg;
3265 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3266 vp->v_lastw = l_clusters[cl_index].last_pg;
3267 }
3268 cl_index1++;
3269 }
3270 /*
3271 * update the cluster count
3272 */
3273 vp->v_clen = cl_index1;
3274 }
3275 }
3276 return(MAX_CLUSTERS - vp->v_clen);
3277 }
3278
3279
3280
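/*
 * cluster_push_x writes out the dirty pages covering the page range
 * [first, last)... if can_delay is set and the cluster is small or
 * mostly clean, the push is declined (return 0), otherwise the dirty
 * runs are issued to cluster_io and 1 is returned
 */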
3281 static int
3282 cluster_push_x(vp, EOF, first, last, can_delay)
3283 struct vnode *vp;
3284 off_t EOF;
3285 daddr_t first;
3286 daddr_t last;
3287 int can_delay;
3288 {
3289 upl_page_info_t *pl;
3290 upl_t upl;
3291 vm_offset_t upl_offset;
3292 int upl_size;
3293 off_t upl_f_offset;
3294 int pages_in_upl;
3295 int start_pg;
3296 int last_pg;
3297 int io_size;
3298 int io_flags;
3299 int size;
3300 kern_return_t kret;
3301
3302
3303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3304 vp->v_clen, first, last, EOF, 0);
3305
3306 if ((pages_in_upl = last - first) == 0) {
3307 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3308
3309 return (1);
3310 }
3311 upl_size = pages_in_upl * PAGE_SIZE;
3312 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3313
3314 if (upl_f_offset + upl_size >= EOF) {
3315
3316 if (upl_f_offset >= EOF) {
3317 /*
3318 * must have truncated the file and missed
3319 * clearing a dangling cluster (i.e. it's completely
3320 	                 * beyond the new EOF)
3321 */
3322 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3323
3324 return(1);
3325 }
3326 size = EOF - upl_f_offset;
3327
3328 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3329 pages_in_upl = upl_size / PAGE_SIZE;
3330 } else {
3331 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3332 return(0);
3333 size = upl_size;
3334 }
3335 kret = ubc_create_upl(vp,
3336 upl_f_offset,
3337 upl_size,
3338 &upl,
3339 &pl,
3340 UPL_RET_ONLY_DIRTY);
3341 if (kret != KERN_SUCCESS)
3342 panic("cluster_push: failed to get pagelist");
3343
3344 if (can_delay) {
3345 int num_of_dirty;
3346
3347 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3348 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3349 num_of_dirty++;
3350 }
3351 if (num_of_dirty < pages_in_upl / 2) {
3352 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3353
3354 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3355
3356 return(0);
3357 }
3358 }
3359 last_pg = 0;
3360
3361 while (size) {
3362
3363 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3364 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3365 break;
3366 }
3367 if (start_pg > last_pg) {
3368 io_size = (start_pg - last_pg) * PAGE_SIZE;
3369
3370 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3371 UPL_ABORT_FREE_ON_EMPTY);
3372
3373 if (io_size < size)
3374 size -= io_size;
3375 else
3376 break;
3377 }
3378 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3379 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3380 break;
3381 }
3382 upl_offset = start_pg * PAGE_SIZE;
3383
3384 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3385
3386 if (vp->v_flag & VNOCACHE_DATA)
3387 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3388 else
3389 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3390
3391 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3392 vp->v_flag |= VTHROTTLED;
3393 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3394 }
3395 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3396
3397 size -= io_size;
3398 }
3399 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3400
3401 return(1);
3402 }
3403
3404
3405
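/*
 * cluster_align_phys_io handles a head or tail fragment of a physically
 * contiguous transfer that isn't device block aligned... the covering
 * page is read into the cache if necessary, the fragment is copied
 * between the user's physical buffer and the cached page with copyp2p,
 * and the page is written back out when this is a write or the page was
 * already dirty
 */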
3406 static int
3407 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3408 {
3409 struct iovec *iov;
3410 upl_page_info_t *pl;
3411 upl_t upl;
3412 vm_offset_t ubc_paddr;
3413 kern_return_t kret;
3414 int error = 0;
3415
3416 iov = uio->uio_iov;
3417
3418 kret = ubc_create_upl(vp,
3419 uio->uio_offset & ~PAGE_MASK_64,
3420 PAGE_SIZE,
3421 &upl,
3422 &pl,
3423 UPL_FLAGS_NONE);
3424
3425 if (kret != KERN_SUCCESS)
3426 return(EINVAL);
3427
3428 if (!upl_valid_page(pl, 0)) {
3429 /*
3430 * issue a synchronous read to cluster_io
3431 */
3432 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3433 CL_READ, (struct buf *)0, (struct clios *)0);
3434 if (error) {
3435 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3436
3437 return(error);
3438 }
3439 }
3440 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3441
3442 if (flags & CL_READ)
3443 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3444 else
3445 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3446
3447 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3448 /*
3449 * issue a synchronous write to cluster_io
3450 */
3451 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3452 0, (struct buf *)0, (struct clios *)0);
3453 }
3454 if (error == 0) {
3455 uio->uio_offset += xsize;
3456 iov->iov_base += xsize;
3457 iov->iov_len -= xsize;
3458 uio->uio_resid -= xsize;
3459 }
3460 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3461
3462 return (error);
3463 }