1
2 /*
3 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
12 *
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
19 * under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*
25 * Copyright (c) 1993
26 * The Regents of the University of California. All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in the
35 * documentation and/or other materials provided with the distribution.
36 * 3. All advertising materials mentioning features or use of this software
37 * must display the following acknowledgement:
38 * This product includes software developed by the University of
39 * California, Berkeley and its contributors.
40 * 4. Neither the name of the University nor the names of its contributors
41 * may be used to endorse or promote products derived from this software
42 * without specific prior written permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
45 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
46 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
48 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
50 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
51 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
52 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
53 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 *
56 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
57 */
58
59 #include <sys/param.h>
60 #include <sys/proc.h>
61 #include <sys/buf.h>
62 #include <sys/vnode.h>
63 #include <sys/mount.h>
64 #include <sys/trace.h>
65 #include <sys/malloc.h>
66 #include <sys/resourcevar.h>
67 #include <libkern/libkern.h>
68
69 #include <sys/ubc.h>
70 #include <vm/vm_pageout.h>
71
72 #include <sys/kdebug.h>
73
74 #define CL_READ 0x01
75 #define CL_ASYNC 0x02
76 #define CL_COMMIT 0x04
77 #define CL_PAGEOUT 0x10
78 #define CL_AGE 0x20
79 #define CL_DUMP 0x40
80 #define CL_NOZERO 0x80
81 #define CL_PAGEIN 0x100
82 #define CL_DEV_MEMORY 0x200
83
84 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
85 int size, struct buf *bp);
86 static int cluster_read_x(struct vnode *vp, struct uio *uio,
87 off_t filesize, int devblocksize, int flags);
88 static int cluster_write_x(struct vnode *vp, struct uio *uio,
89 off_t oldEOF, off_t newEOF, off_t headOff,
90 off_t tailOff, int devblocksize, int flags);
91 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
92 off_t filesize, int devblocksize, int flags);
93 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
94 off_t newEOF, int devblocksize, int flags);
95 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
96 off_t filesize);
97 static int cluster_phys_write(struct vnode *vp, struct uio *uio);
98 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
99 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
100
101
102 /*
103 * throttle the number of async writes that
104 * can be outstanding on a single vnode
105 * before we issue a synchronous write
106 */
107 #define ASYNC_THROTTLE 9
108
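/*
 * cluster_iodone - completion routine for a chain of component buffers
 * that together make up one cluster transaction.  It returns immediately
 * unless every buffer in the chain is marked B_DONE; once the whole chain
 * has completed it accumulates the error and resid totals, frees the
 * vector lists and component buffers, wakes any writer throttled on
 * v_numoutput, zero-fills the tail of the last page for a read that ended
 * at a non-page-aligned EOF, finishes the original buffer (if any) via
 * biodone, and commits or aborts the associated UPL range as dictated by
 * the error state and the b_flags set up by cluster_io.
 */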
109 static int
110 cluster_iodone(bp)
111 struct buf *bp;
112 {
113 int b_flags;
114 int error;
115 int total_size;
116 int total_resid;
117 int upl_offset;
118 int zero_offset;
119 upl_t upl;
120 struct buf *cbp;
121 struct buf *cbp_head;
122 struct buf *cbp_next;
123 struct buf *real_bp;
124 struct vnode *vp;
125 int commit_size;
126 int pg_offset;
127
128
129 cbp_head = (struct buf *)(bp->b_trans_head);
130
131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
132 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
133
134 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
135 /*
136 * all I/O requests that are part of this transaction
137 * have to complete before we can process it
138 */
139 if ( !(cbp->b_flags & B_DONE)) {
140
141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
142 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
143
144 return 0;
145 }
146 }
147 error = 0;
148 total_size = 0;
149 total_resid = 0;
150
151 cbp = cbp_head;
152 upl_offset = cbp->b_uploffset;
153 upl = cbp->b_pagelist;
154 b_flags = cbp->b_flags;
155 real_bp = cbp->b_real_bp;
156 vp = cbp->b_vp;
157 zero_offset= cbp->b_validend;
158
159 while (cbp) {
160 if (cbp->b_vectorcount > 1)
161 _FREE(cbp->b_vectorlist, M_SEGMENT);
162
163 if ((cbp->b_flags & B_ERROR) && error == 0)
164 error = cbp->b_error;
165
166 total_resid += cbp->b_resid;
167 total_size += cbp->b_bcount;
168
169 cbp_next = cbp->b_trans_next;
170
171 free_io_buf(cbp);
172
173 cbp = cbp_next;
174 }
175 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
176 vp->v_flag &= ~VTHROTTLED;
177 wakeup((caddr_t)&vp->v_numoutput);
178 }
179 if (zero_offset)
180 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
181
182 if ((b_flags & B_NEED_IODONE) && real_bp) {
183 if (error) {
184 real_bp->b_flags |= B_ERROR;
185 real_bp->b_error = error;
186 }
187 real_bp->b_resid = total_resid;
188
189 biodone(real_bp);
190 }
191 if (error == 0 && total_resid)
192 error = EIO;
193
194 if (b_flags & B_COMMIT_UPL) {
195 pg_offset = upl_offset & PAGE_MASK;
196 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
197
198 if (error || (b_flags & B_NOCACHE)) {
199 int upl_abort_code;
200
201 if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
202 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
203 else if (b_flags & B_PGIN)
204 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
205 else
206 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
207
208 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
209 upl_abort_code);
210
211 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
212 (int)upl, upl_offset - pg_offset, commit_size,
213 0x80000000|upl_abort_code, 0);
214
215 } else {
216 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
217
218 if ( !(b_flags & B_PAGEOUT))
219 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
220 if (b_flags & B_AGE)
221 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
222
223 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
224 upl_commit_flags);
225
226 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
227 (int)upl, upl_offset - pg_offset, commit_size,
228 upl_commit_flags, 0);
229 }
230 } else
231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
232 (int)upl, upl_offset, 0, error, 0);
233
234 return (error);
235 }
236
237
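/*
 * cluster_zero - zero 'size' bytes starting at 'upl_offset' within the
 * given upl.  If the caller's buffer already has a kernel mapping
 * (bp->b_data), that mapping is used directly; otherwise the upl is
 * temporarily mapped with ubc_upl_map for the bzero and unmapped again.
 */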
238 static void
239 cluster_zero(upl, upl_offset, size, bp)
240 upl_t upl;
241 vm_offset_t upl_offset;
242 int size;
243 struct buf *bp;
244 {
245 vm_offset_t io_addr = 0;
246 int must_unmap = 0;
247 kern_return_t kret;
248
249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
250 upl_offset, size, (int)bp, 0, 0);
251
252 if (bp == NULL || bp->b_data == NULL) {
253 kret = ubc_upl_map(upl, &io_addr);
254
255 if (kret != KERN_SUCCESS)
256 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
257 if (io_addr == 0)
258 panic("cluster_zero: ubc_upl_map() mapped 0");
259
260 must_unmap = 1;
261 } else
262 io_addr = (vm_offset_t)bp->b_data;
263 bzero((caddr_t)(io_addr + upl_offset), size);
264
265 if (must_unmap) {
266 kret = ubc_upl_unmap(upl);
267
268 if (kret != KERN_SUCCESS)
269 panic("cluster_zero: kernel_upl_unmap failed");
270 }
271 }
272
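/*
 * cluster_io - work horse for the routines in this file.  The byte range
 * described by (upl, upl_offset, f_offset, non_rounded_size) is carved
 * into device-sized transfers: VOP_CMAP translates file offsets into
 * device block numbers, a chain of component buffers with per-page I/O
 * vectors is built (CL_DEV_MEMORY requests are treated as one physically
 * contiguous "page"), holes encountered on reads are zero-filled in the
 * upl, and the chains are issued through VOP_STRATEGY.  Unless CL_ASYNC
 * is set, the routine waits for the I/O and reports its status; on error
 * any unissued buffers are torn down and, if CL_COMMIT was requested,
 * the remaining upl range is aborted.
 */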
273 static int
274 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
275 struct vnode *vp;
276 upl_t upl;
277 vm_offset_t upl_offset;
278 off_t f_offset;
279 int non_rounded_size;
280 int devblocksize;
281 int flags;
282 struct buf *real_bp;
283 {
284 struct buf *cbp;
285 struct iovec *iovp;
286 u_int size;
287 int io_flags;
288 int error = 0;
289 int retval = 0;
290 struct buf *cbp_head = 0;
291 struct buf *cbp_tail = 0;
292 upl_page_info_t *pl;
293 int buf_count = 0;
294 int pg_count;
295 int pg_offset;
296 u_int max_iosize;
297 u_int max_vectors;
298 int priv;
299 int zero_offset = 0;
300
301 if (flags & CL_READ) {
302 io_flags = (B_VECTORLIST | B_READ);
303
304 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
305 } else {
306 io_flags = (B_VECTORLIST | B_WRITEINPROG);
307
308 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
309 }
310 pl = ubc_upl_pageinfo(upl);
311
312 if (flags & CL_ASYNC)
313 io_flags |= (B_CALL | B_ASYNC);
314 if (flags & CL_AGE)
315 io_flags |= B_AGE;
316 if (flags & CL_DUMP)
317 io_flags |= B_NOCACHE;
318 if (flags & CL_PAGEIN)
319 io_flags |= B_PGIN;
320
321 if (devblocksize)
322 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
323 else
324 size = non_rounded_size;
325
326
327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
328 (int)f_offset, size, upl_offset, flags, 0);
329
330 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
331 /*
332 * this is a read that extends to the end of a file whose size
333 * isn't a multiple of PAGE_SIZE, so we're going to end up with
334 * a page that we can't completely fill from the file... go ahead
335 * and zero out the portion of the page we can't read in from
336 * the file
337 */
338 zero_offset = upl_offset + non_rounded_size;
339 }
340 while (size) {
341 size_t io_size;
342 int vsize;
343 int i;
344 int pl_index;
345 int pg_resid;
346 int num_contig;
347 daddr_t lblkno;
348 daddr_t blkno;
349
350 if (size > max_iosize)
351 io_size = max_iosize;
352 else
353 io_size = size;
354
355 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
356 if (error == EOPNOTSUPP)
357 panic("VOP_CMAP Unimplemented");
358 break;
359 }
360
361 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
362 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
363
364 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
365 if (flags & CL_PAGEOUT) {
366 error = EINVAL;
367 break;
368 };
369
370 /* Try paging out the page individually before
371 giving up entirely and dumping it (it could
372 be mapped in a "hole" and require allocation
373 before the I/O can be issued)
374 */
375 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
376 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
377 error = EINVAL;
378 break;
379 };
380
381 upl_offset += PAGE_SIZE_64;
382 f_offset += PAGE_SIZE_64;
383 size -= PAGE_SIZE_64;
384 continue;
385 }
386 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
387 /*
388 * we have now figured out how much I/O we can do - this is in 'io_size'
389 * pl_index represents the first page in the 'upl' that the I/O will occur for
390 * pg_offset is the starting point in the first page for the I/O
391 * pg_count is the number of full and partial pages that 'io_size' encompasses
392 */
393 pl_index = upl_offset / PAGE_SIZE;
394 pg_offset = upl_offset & PAGE_MASK;
395 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
396
397 if (flags & CL_DEV_MEMORY) {
398 /*
399 * currently, can't deal with reading 'holes' in file
400 */
401 if ((long)blkno == -1) {
402 error = EINVAL;
403 break;
404 }
405 /*
406 * treat physical requests as one 'giant' page
407 */
408 pg_count = 1;
409 }
410 if ((flags & CL_READ) && (long)blkno == -1) {
411 int bytes_to_zero;
412
413 /*
414 * if we're reading and blkno == -1, then we've got a
415 * 'hole' in the file that we need to deal with by zeroing
416 * out the affected area in the upl
417 */
418 if (zero_offset && io_size == size) {
419 /*
420 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
421 * then 'zero_offset' will be non-zero
422 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
423 * (indicated by the io_size finishing off the I/O request for this UPL)
424 * then we're not going to issue an I/O for the
425 * last page in this upl... we need to zero both the hole and the tail
426 * of the page beyond the EOF, since the delayed zero-fill won't kick in
427 */
428 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
429
430 zero_offset = 0;
431 } else
432 bytes_to_zero = io_size;
433
434 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
435
436 if (cbp_head)
437 /*
438 * if there is a current I/O chain pending
439 * then the first page of the group we just zero'd
440 * will be handled by the I/O completion if the zero
441 * fill started in the middle of the page
442 */
443 pg_count = (io_size - pg_offset) / PAGE_SIZE;
444 else {
445 /*
446 * no pending I/O to pick up that first page
447 * so, we have to make sure it gets committed
448 * here.
449 * set the pg_offset to 0 so that the upl_commit_range
450 * starts with this page
451 */
452 pg_count = (io_size + pg_offset) / PAGE_SIZE;
453 pg_offset = 0;
454 }
455 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
456 /*
457 * if we're done with the request for this UPL
458 * then we have to make sure to commit the last page
459 * even if we only partially zero-filled it
460 */
461 pg_count++;
462
463 if (pg_count) {
464 if (pg_offset)
465 pg_resid = PAGE_SIZE - pg_offset;
466 else
467 pg_resid = 0;
468
469 if (flags & CL_COMMIT)
470 ubc_upl_commit_range(upl,
471 (upl_offset + pg_resid) & ~PAGE_MASK,
472 pg_count * PAGE_SIZE,
473 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
474 }
475 upl_offset += io_size;
476 f_offset += io_size;
477 size -= io_size;
478
479 if (cbp_head && pg_count)
480 goto start_io;
481 continue;
482
483 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
484 real_bp->b_blkno = blkno;
485 }
486
487 if (pg_count > 1) {
488 if (pg_count > max_vectors) {
489 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
490
491 if (io_size < 0) {
492 io_size = PAGE_SIZE - pg_offset;
493 pg_count = 1;
494 } else
495 pg_count = max_vectors;
496 }
497 /*
498 * we need to allocate space for the vector list
499 */
500 if (pg_count > 1) {
501 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
502 M_SEGMENT, M_NOWAIT);
503
504 if (iovp == (struct iovec *) 0) {
505 /*
506 * if the allocation fails, then throttle down to a single page
507 */
508 io_size = PAGE_SIZE - pg_offset;
509 pg_count = 1;
510 }
511 }
512 }
513
514 /* Throttle the speculative IO */
515 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
516 priv = 0;
517 else
518 priv = 1;
519
520 cbp = alloc_io_buf(vp, priv);
521
522 if (pg_count == 1)
523 /*
524 * we use the io vector that's reserved in the buffer header
525 * this ensures we can always issue an I/O even in a low memory
526 * condition that prevents the _MALLOC from succeeding... this
527 * is necessary to prevent deadlocks with the pager
528 */
529 iovp = (struct iovec *)(&cbp->b_vects[0]);
530
531 cbp->b_vectorlist = (void *)iovp;
532 cbp->b_vectorcount = pg_count;
533
534 if (flags & CL_DEV_MEMORY) {
535
536 iovp->iov_len = io_size;
537 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
538
539 if (iovp->iov_base == (caddr_t) 0) {
540 free_io_buf(cbp);
541 error = EINVAL;
542 } else
543 iovp->iov_base += upl_offset;
544 } else {
545
546 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
547 int psize;
548
549 psize = PAGE_SIZE - pg_offset;
550
551 if (psize > vsize)
552 psize = vsize;
553
554 iovp->iov_len = psize;
555 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
556
557 if (iovp->iov_base == (caddr_t) 0) {
558 if (pg_count > 1)
559 _FREE(cbp->b_vectorlist, M_SEGMENT);
560 free_io_buf(cbp);
561
562 error = EINVAL;
563 break;
564 }
565 iovp->iov_base += pg_offset;
566 pg_offset = 0;
567
568 if (flags & CL_PAGEOUT) {
569 int s;
570 struct buf *bp;
571
572 s = splbio();
573 if (bp = incore(vp, lblkno + i)) {
574 if (!ISSET(bp->b_flags, B_BUSY)) {
575 bremfree(bp);
576 SET(bp->b_flags, (B_BUSY | B_INVAL));
577 splx(s);
578 brelse(bp);
579 } else
580 panic("BUSY bp found in cluster_io");
581 }
582 splx(s);
583 }
584 vsize -= psize;
585 }
586 }
587 if (error)
588 break;
589
590 if (flags & CL_ASYNC)
591 cbp->b_iodone = (void *)cluster_iodone;
592 cbp->b_flags |= io_flags;
593
594 cbp->b_lblkno = lblkno;
595 cbp->b_blkno = blkno;
596 cbp->b_bcount = io_size;
597 cbp->b_pagelist = upl;
598 cbp->b_uploffset = upl_offset;
599 cbp->b_trans_next = (struct buf *)0;
600
601 if (flags & CL_READ)
602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
603 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
604 else
605 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
606 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
607
608 if (cbp_head) {
609 cbp_tail->b_trans_next = cbp;
610 cbp_tail = cbp;
611 } else {
612 cbp_head = cbp;
613 cbp_tail = cbp;
614 }
615 (struct buf *)(cbp->b_trans_head) = cbp_head;
616 buf_count++;
617
618 upl_offset += io_size;
619 f_offset += io_size;
620 size -= io_size;
621
622 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
623 /*
624 * if we have no more I/O to issue, or
625 * the current I/O we've prepared fully
626 * completes the last page in this request
627 * and it's either an ASYNC request or
628 * we've already accumulated more than 8 I/O's into
629 * this transaction, and it's not an I/O directed to
630 * special DEVICE memory,
631 * then go ahead and issue the I/O
632 */
633 start_io:
634 if (flags & CL_COMMIT)
635 cbp_head->b_flags |= B_COMMIT_UPL;
636 if (flags & CL_PAGEOUT)
637 cbp_head->b_flags |= B_PAGEOUT;
638 if (flags & CL_PAGEIN)
639 cbp_head->b_flags |= B_PGIN;
640
641 if (real_bp) {
642 cbp_head->b_flags |= B_NEED_IODONE;
643 cbp_head->b_real_bp = real_bp;
644 } else
645 cbp_head->b_real_bp = (struct buf *)NULL;
646
647 if (size == 0) {
648 /*
649 * we're about to issue the last I/O for this upl
650 * if this was a read to the eof and the eof doesn't
651 * finish on a page boundary, then we need to zero-fill
652 * the rest of the page....
653 */
654 cbp_head->b_validend = zero_offset;
655 } else
656 cbp_head->b_validend = 0;
657
658 for (cbp = cbp_head; cbp;) {
659 struct buf * cbp_next;
660
661 if (io_flags & B_WRITEINPROG)
662 cbp->b_vp->v_numoutput++;
663
664 cbp_next = cbp->b_trans_next;
665
666 (void) VOP_STRATEGY(cbp);
667 cbp = cbp_next;
668 }
669 if ( !(flags & CL_ASYNC)) {
670 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
671 biowait(cbp);
672
673 if (error = cluster_iodone(cbp_head)) {
674 if ((flags & CL_PAGEOUT) && (error == ENXIO))
675 retval = 0; /* drop the error */
676 else
677 retval = error;
678 error = 0;
679 }
680 }
681 cbp_head = (struct buf *)0;
682 cbp_tail = (struct buf *)0;
683
684 buf_count = 0;
685 }
686 }
687 if (error) {
688 int abort_size;
689
690 for (cbp = cbp_head; cbp;) {
691 struct buf * cbp_next;
692
693 if (cbp->b_vectorcount > 1)
694 _FREE(cbp->b_vectorlist, M_SEGMENT);
695 upl_offset -= cbp->b_bcount;
696 size += cbp->b_bcount;
697
698 cbp_next = cbp->b_trans_next;
699 free_io_buf(cbp);
700 cbp = cbp_next;
701 }
702 pg_offset = upl_offset & PAGE_MASK;
703 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
704
705 if (flags & CL_COMMIT) {
706 int upl_abort_code;
707
708 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
709 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
710 else if (flags & CL_PAGEIN)
711 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
712 else
713 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
714
715 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
716 upl_abort_code);
717
718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
719 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
720 }
721 if (real_bp) {
722 real_bp->b_flags |= B_ERROR;
723 real_bp->b_error = error;
724
725 biodone(real_bp);
726 }
727 if (retval == 0)
728 retval = error;
729 }
730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
731 (int)f_offset, size, upl_offset, retval, 0);
732
733 return (retval);
734 }
735
736
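/*
 * cluster_rd_prefetch - issue an advisory read of up to
 * MAX_UPL_TRANSFER pages starting at f_offset, clipped to the end of
 * the file.  Pages at the front of the range that are already resident
 * (per ubc_page_op) are skipped; if anything remains, advisory_read
 * brings it in.  Returns the number of pages spanned by the (clipped)
 * request so the caller can advance its read-ahead window.
 */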
737 static int
738 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
739 struct vnode *vp;
740 off_t f_offset;
741 u_int size;
742 off_t filesize;
743 int devblocksize;
744 {
745 int pages_to_fetch;
746 int skipped_pages;
747
748 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
749 (int)f_offset, size, (int)filesize, 0, 0);
750
751 if (f_offset >= filesize) {
752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
753 (int)f_offset, 0, 0, 0, 0);
754 return(0);
755 }
756 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
757 size = MAX_UPL_TRANSFER * PAGE_SIZE;
758 else
759 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
760
761 if ((off_t)size > (filesize - f_offset))
762 size = filesize - f_offset;
763
764 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
765
766 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
767 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
768 break;
769 f_offset += PAGE_SIZE;
770 size -= PAGE_SIZE;
771 }
772 if (skipped_pages < pages_to_fetch)
773 advisory_read(vp, filesize, f_offset, size, devblocksize);
774
775 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
776 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
777
778 return (pages_to_fetch);
779 }
780
781
782
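/*
 * cluster_rd_ahead - sequential read-ahead.  If the current read does
 * not follow on from v_lastr (or the previously prefetched range), the
 * read-ahead window v_ralen is reset and nothing is done.  Otherwise the
 * window is doubled (capped at MAX_UPL_TRANSFER pages) and, unless
 * enough pages beyond e_lblkno have already been prefetched,
 * cluster_rd_prefetch is called for the pages following
 * max(e_lblkno, v_maxra), with v_maxra updated to the last page covered.
 */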
783 static void
784 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
785 struct vnode *vp;
786 daddr_t b_lblkno;
787 daddr_t e_lblkno;
788 off_t filesize;
789 int devblocksize;
790 {
791 daddr_t r_lblkno;
792 off_t f_offset;
793 int size_of_prefetch;
794 int max_pages;
795
796 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
797 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
798
799 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
801 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
802 return;
803 }
804
805 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
806 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
807 vp->v_ralen = 0;
808 vp->v_maxra = 0;
809
810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
811 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
812
813 return;
814 }
815 max_pages = MAX_UPL_TRANSFER;
816
817 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
818
819 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
820 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
821
822 if (e_lblkno < vp->v_maxra) {
823 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
824
825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
826 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
827 return;
828 }
829 }
830 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
831 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
832
833 if (f_offset < filesize) {
834 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
835
836 if (size_of_prefetch)
837 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
838 }
839 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
840 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
841 }
842
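/*
 * cluster_pageout - exported pageout entry point (typically reached from
 * a filesystem's VOP_PAGEOUT).  The request is validated and clipped to
 * the current file size, any portion of the upl beyond that is aborted,
 * the caller is throttled once ASYNC_THROTTLE writes are outstanding on
 * the vnode, and the remainder is handed to cluster_io with CL_PAGEOUT
 * (asynchronously unless UPL_IOSYNC was specified).
 */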
843 int
844 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
845 struct vnode *vp;
846 upl_t upl;
847 vm_offset_t upl_offset;
848 off_t f_offset;
849 int size;
850 off_t filesize;
851 int devblocksize;
852 int flags;
853 {
854 int io_size;
855 int pg_size;
856 off_t max_size;
857 int local_flags = CL_PAGEOUT;
858
859 if ((flags & UPL_IOSYNC) == 0)
860 local_flags |= CL_ASYNC;
861 if ((flags & UPL_NOCOMMIT) == 0)
862 local_flags |= CL_COMMIT;
863
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
866 (int)f_offset, size, (int)filesize, local_flags, 0);
867
868 /*
869 * If they didn't specify any I/O, then we are done...
870 * we can't issue an abort because we don't know how
871 * big the upl really is
872 */
873 if (size <= 0)
874 return (EINVAL);
875
876 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
877 if (local_flags & CL_COMMIT)
878 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
879 return (EROFS);
880 }
881 /*
882 * can't page-out to a negative offset
883 * or if we're starting beyond the EOF
884 * or if the file offset isn't page aligned
885 * or the size requested isn't a multiple of PAGE_SIZE
886 */
887 if (f_offset < 0 || f_offset >= filesize ||
888 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
889 if (local_flags & CL_COMMIT)
890 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
891 return (EINVAL);
892 }
893 max_size = filesize - f_offset;
894
895 if (size < max_size)
896 io_size = size;
897 else
898 io_size = max_size;
899
900 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
901
902 if (size > pg_size) {
903 if (local_flags & CL_COMMIT)
904 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
905 UPL_ABORT_FREE_ON_EMPTY);
906 }
907 while (vp->v_numoutput >= ASYNC_THROTTLE) {
908 vp->v_flag |= VTHROTTLED;
909 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
910 }
911
912 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
913 local_flags, (struct buf *)0));
914 }
915
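/*
 * cluster_pagein - exported pagein entry point (typically reached from a
 * filesystem's VOP_PAGEIN).  Alignment and bounds are validated, any part
 * of the upl beyond the EOF-clipped transfer is aborted with an error,
 * and the read is issued through cluster_io with CL_READ | CL_PAGEIN.
 * On success, sequential read-ahead is kicked off for single-page
 * requests (unless UPL_NORDAHEAD or VRAOFF is set) and v_lastr is
 * updated.
 */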
916 int
917 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
918 struct vnode *vp;
919 upl_t upl;
920 vm_offset_t upl_offset;
921 off_t f_offset;
922 int size;
923 off_t filesize;
924 int devblocksize;
925 int flags;
926 {
927 u_int io_size;
928 int rounded_size;
929 off_t max_size;
930 int retval;
931 int local_flags = 0;
932
933 if (upl == NULL || size < 0)
934 panic("cluster_pagein: NULL upl passed in");
935
936 if ((flags & UPL_IOSYNC) == 0)
937 local_flags |= CL_ASYNC;
938 if ((flags & UPL_NOCOMMIT) == 0)
939 local_flags |= CL_COMMIT;
940
941
942 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
943 (int)f_offset, size, (int)filesize, local_flags, 0);
944
945 /*
946 * can't page-in from a negative offset
947 * or if we're starting beyond the EOF
948 * or if the file offset isn't page aligned
949 * or the size requested isn't a multiple of PAGE_SIZE
950 */
951 if (f_offset < 0 || f_offset >= filesize ||
952 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
953 if (local_flags & CL_COMMIT)
954 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
955 return (EINVAL);
956 }
957 max_size = filesize - f_offset;
958
959 if (size < max_size)
960 io_size = size;
961 else
962 io_size = max_size;
963
964 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
965
966 if (size > rounded_size && (local_flags & CL_COMMIT))
967 ubc_upl_abort_range(upl, upl_offset + rounded_size,
968 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
969
970 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
971 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
972
973 if (retval == 0) {
974 int b_lblkno;
975 int e_lblkno;
976
977 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
978 e_lblkno = (int)
979 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
980
981 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
982 /*
983 * we haven't read the last page of the file in yet
984 * so let's try to read ahead if we're in
985 * a sequential access pattern
986 */
987 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
988 }
989 vp->v_lastr = e_lblkno;
990 }
991 return (retval);
992 }
993
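/*
 * cluster_bp - route a conventional struct buf that already carries a
 * upl (b_pagelist) through cluster_io as an asynchronous transfer,
 * using ubc_blktooff to convert its logical block number into a file
 * offset.
 */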
994 int
995 cluster_bp(bp)
996 struct buf *bp;
997 {
998 off_t f_offset;
999 int flags;
1000
1001 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1002 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1003
1004 if (bp->b_pagelist == (upl_t) 0)
1005 panic("cluster_bp: can't handle NULL upl yet\n");
1006 if (bp->b_flags & B_READ)
1007 flags = CL_ASYNC | CL_READ;
1008 else
1009 flags = CL_ASYNC;
1010
1011 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1012
1013 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
1014 }
1015
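/*
 * cluster_write - top level write entry point.  When caching is in
 * effect (VNOCACHE_DATA clear) or the uio is not from user space, the
 * whole request goes through the buffered path, cluster_write_x.
 * Otherwise each iovec is examined: physically contiguous targets are
 * written via cluster_phys_write, small requests and head/tail
 * zero-fill cases fall back to cluster_write_x, and large page-aligned
 * transfers take the direct path, cluster_nocopy_write.  uio_resid is
 * temporarily clipped around each helper call so that it only sees its
 * portion of the request.
 */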
1016 int
1017 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1018 struct vnode *vp;
1019 struct uio *uio;
1020 off_t oldEOF;
1021 off_t newEOF;
1022 off_t headOff;
1023 off_t tailOff;
1024 int devblocksize;
1025 int flags;
1026 {
1027 int prev_resid;
1028 int clip_size;
1029 off_t max_io_size;
1030 struct iovec *iov;
1031 vm_offset_t upl_offset;
1032 int upl_size;
1033 int pages_in_pl;
1034 upl_page_info_t *pl;
1035 int upl_flags;
1036 upl_t upl;
1037 int retval = 0;
1038
1039
1040 if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1041 {
1042 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1043 return(retval);
1044 }
1045
1046 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1047 {
1048 /* we know we have a resid, so this is safe */
1049 iov = uio->uio_iov;
1050 while (iov->iov_len == 0) {
1051 uio->uio_iov++;
1052 uio->uio_iovcnt--;
1053 iov = uio->uio_iov;
1054 }
1055
1056 /*
1057 * We check every vector target and if it is physically
1058 * contiguous space, we skip the sanity checks.
1059 */
1060
1061 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1062 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1063 pages_in_pl = 0;
1064 upl_flags = UPL_QUERY_OBJECT_TYPE;
1065 if ((vm_map_get_upl(current_map(),
1066 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1067 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1068 {
1069 /*
1070 * the user app must have passed in an invalid address
1071 */
1072 return (EFAULT);
1073 }
1074
1075 if (upl_flags & UPL_PHYS_CONTIG)
1076 {
1077 /*
1078 * since the interface to the IOKit below us uses physical block #'s and
1079 * block counts to specify the I/O, we can't handle anything that isn't
1080 * devblocksize aligned
1081 */
1082 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1083 return(EINVAL);
1084
1085 if (flags & IO_HEADZEROFILL)
1086 {
1087 flags &= ~IO_HEADZEROFILL;
1088
1089 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1090 return(retval);
1091 }
1092
1093 retval = cluster_phys_write(vp, uio);
1094
1095 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1096 {
1097 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1098 return(retval);
1099 }
1100 }
1101 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1102 {
1103 /*
1104 * We set a threshold of 4 pages to decide if the nocopy
1105 * write loop is worth the trouble...
1106 * we also come here if we're trying to zero the head and/or tail
1107 * of a partially written page, and the user source is not a physically contiguous region
1108 */
1109 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1110 return(retval);
1111 }
1112 else if (uio->uio_offset & PAGE_MASK_64)
1113 {
1114 /* Bring the file offset write up to a pagesize boundary */
1115 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1116 if (uio->uio_resid < clip_size)
1117 clip_size = uio->uio_resid;
1118 /*
1119 * Fake the resid going into the cluster_write_x call
1120 * and restore it on the way out.
1121 */
1122 prev_resid = uio->uio_resid;
1123 uio->uio_resid = clip_size;
1124 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1125 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1126 }
1127 else if ((int)iov->iov_base & PAGE_MASK_64)
1128 {
1129 clip_size = iov->iov_len;
1130 prev_resid = uio->uio_resid;
1131 uio->uio_resid = clip_size;
1132 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1133 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1134 }
1135 else
1136 {
1137 /*
1138 * If we come in here, we know the offset into
1139 * the file is on a pagesize boundary
1140 */
1141
1142 max_io_size = newEOF - uio->uio_offset;
1143 clip_size = uio->uio_resid;
1144 if (iov->iov_len < clip_size)
1145 clip_size = iov->iov_len;
1146 if (max_io_size < clip_size)
1147 clip_size = max_io_size;
1148
1149 if (clip_size < PAGE_SIZE)
1150 {
1151 /*
1152 * Take care of tail end of write in this vector
1153 */
1154 prev_resid = uio->uio_resid;
1155 uio->uio_resid = clip_size;
1156 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1157 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1158 }
1159 else
1160 {
1161 /* round clip_size down to a multiple of pagesize */
1162 clip_size = clip_size & ~(PAGE_MASK);
1163 prev_resid = uio->uio_resid;
1164 uio->uio_resid = clip_size;
1165 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1166 if ((retval == 0) && uio->uio_resid)
1167 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1168 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1169 }
1170 } /* end else */
1171 } /* end while */
1172 return(retval);
1173 }
1174
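/*
 * cluster_nocopy_write - direct (uncached) write path.  The user buffer
 * is wired down with vm_map_get_upl (retrying with increasing
 * force_data_sync), any pages for the target file range that are already
 * in the cache are dumped via ubc_page_op, and a synchronous cluster_io
 * is issued straight from the user pages.  The upl is then released with
 * an abort rather than a commit so that the dirty state of the source
 * pages is left untouched by the write.
 */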
1175 static int
1176 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1177 struct vnode *vp;
1178 struct uio *uio;
1179 off_t newEOF;
1180 int devblocksize;
1181 int flags;
1182 {
1183 upl_t upl;
1184 upl_page_info_t *pl;
1185 off_t upl_f_offset;
1186 vm_offset_t upl_offset;
1187 off_t max_io_size;
1188 int io_size;
1189 int upl_size;
1190 int upl_needed_size;
1191 int pages_in_pl;
1192 int upl_flags;
1193 kern_return_t kret;
1194 struct iovec *iov;
1195 int i;
1196 int force_data_sync;
1197 int error = 0;
1198
1199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1200 (int)uio->uio_offset, (int)uio->uio_resid,
1201 (int)newEOF, devblocksize, 0);
1202
1203 /*
1204 * When we enter this routine, we know
1205 * -- the offset into the file is on a pagesize boundary
1206 * -- the resid is a page multiple
1207 * -- the resid will not exceed iov_len
1208 */
1209
1210 iov = uio->uio_iov;
1211
1212 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1213 io_size = uio->uio_resid;
1214
1215 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1216 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1217
1218 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1219 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1220
1221 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1222 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1223
1224 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1225 {
1226 pages_in_pl = 0;
1227 upl_size = upl_needed_size;
1228 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1229 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1230
1231 kret = vm_map_get_upl(current_map(),
1232 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1233 &upl_size,
1234 &upl,
1235 NULL,
1236 &pages_in_pl,
1237 &upl_flags,
1238 force_data_sync);
1239
1240 if (kret != KERN_SUCCESS)
1241 {
1242 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1243 0, 0, 0, kret, 0);
1244
1245 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1246 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1247
1248 /* cluster_nocopy_write: failed to get pagelist */
1249 /* do not return kret here */
1250 return(0);
1251 }
1252
1253 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1254 pages_in_pl = upl_size / PAGE_SIZE;
1255
1256 for(i=0; i < pages_in_pl; i++)
1257 {
1258 if (!upl_valid_page(pl, i))
1259 break;
1260 }
1261
1262 if (i == pages_in_pl)
1263 break;
1264
1265 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1266 UPL_ABORT_FREE_ON_EMPTY);
1267 }
1268
1269 if (force_data_sync >= 3)
1270 {
1271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1272 i, pages_in_pl, upl_size, kret, 0);
1273
1274 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1275 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1276 return(0);
1277 }
1278
1279 /*
1280 * Consider the possibility that upl_size wasn't satisfied.
1281 */
1282 if (upl_size != upl_needed_size)
1283 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1284
1285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1286 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1287
1288 if (io_size == 0)
1289 {
1290 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1291 UPL_ABORT_FREE_ON_EMPTY);
1292 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1293 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1294
1295 return(0);
1296 }
1297
1298 /*
1299 * Now look for pages already in the cache
1300 * and throw them away.
1301 */
1302
1303 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1304 max_io_size = io_size;
1305
1306 while (max_io_size) {
1307
1308 /*
1309 * Flag UPL_POP_DUMP says if the page is found
1310 * in the page cache it must be thrown away.
1311 */
1312 ubc_page_op(vp,
1313 upl_f_offset,
1314 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1315 0, 0);
1316 max_io_size -= PAGE_SIZE;
1317 upl_f_offset += PAGE_SIZE;
1318 }
1319
1320 /*
1321 * issue a synchronous write to cluster_io
1322 */
1323
1324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1325 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1326
1327 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1328 io_size, devblocksize, 0, (struct buf *)0);
1329
1330 if (error == 0) {
1331 /*
1332 * The cluster_io write completed successfully,
1333 * update the uio structure.
1334 */
1335 iov->iov_base += io_size;
1336 iov->iov_len -= io_size;
1337 uio->uio_resid -= io_size;
1338 uio->uio_offset += io_size;
1339 }
1340 /*
1341 * always 'commit' the I/O via the abort primitive whether the I/O
1342 * succeeded cleanly or not... this is necessary to ensure that
1343 * we preserve the state of the DIRTY flag on the pages used to
1344 * provide the data for the I/O... the state of this flag SHOULD
1345 * NOT be changed by a write
1346 */
1347 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1348 UPL_ABORT_FREE_ON_EMPTY);
1349
1350
1351 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1352 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1353
1354 } /* end while */
1355
1356
1357 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1358 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1359
1360 return (error);
1361 }
1362
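/*
 * cluster_phys_write - write path for an iovec whose target is
 * physically contiguous memory: a upl is built for the user buffer and
 * a single synchronous CL_DEV_MEMORY transfer is issued through
 * cluster_io, committing the upl on success and aborting it on failure.
 */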
1363 static int
1364 cluster_phys_write(vp, uio)
1365 struct vnode *vp;
1366 struct uio *uio;
1367 {
1368 upl_t upl;
1369 vm_offset_t upl_offset;
1370 int io_size;
1371 int upl_size;
1372 int upl_needed_size;
1373 int pages_in_pl;
1374 int upl_flags;
1375 kern_return_t kret;
1376 struct iovec *iov;
1377 int error = 0;
1378
1379 /*
1380 * When we enter this routine, we know
1381 * -- the resid will not exceed iov_len
1382 * -- the vector target address is physically contiguous
1383 */
1384
1385 iov = uio->uio_iov;
1386 io_size = iov->iov_len;
1387 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1388 upl_needed_size = upl_offset + io_size;
1389
1390 pages_in_pl = 0;
1391 upl_size = upl_needed_size;
1392 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1393 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1394
1395 kret = vm_map_get_upl(current_map(),
1396 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1397 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1398
1399 if (kret != KERN_SUCCESS)
1400 {
1401 /* cluster_phys_write: failed to get pagelist */
1402 /* note: return kret here */
1403 return(EINVAL);
1404 }
1405
1406 /*
1407 * Consider the possibility that upl_size wasn't satisfied.
1408 * This is a failure in the physical memory case.
1409 */
1410 if (upl_size < upl_needed_size)
1411 {
1412 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1413 return(EINVAL);
1414 }
1415
1416 /*
1417 * issue a synchronous write to cluster_io
1418 */
1419
1420 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1421 io_size, 0, CL_DEV_MEMORY, (struct buf *)0);
1422
1423 if (error == 0) {
1424 /*
1425 * The cluster_io write completed successfully,
1426 * update the uio structure and commit.
1427 */
1428
1429 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1430
1431 iov->iov_base += io_size;
1432 iov->iov_len -= io_size;
1433 uio->uio_resid -= io_size;
1434 uio->uio_offset += io_size;
1435 }
1436 else
1437 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1438
1439 return (error);
1440 }
1441
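/*
 * cluster_write_x - buffered write path.  For each chunk a upl is
 * created over the affected file pages, any page that will only be
 * partially overwritten and is not already valid in memory is pre-read,
 * the upl is mapped and the user data is copied in (along with any
 * IO_HEADZEROFILL/IO_TAILZEROFILL zeroing), and then the I/O is either
 * issued immediately (IO_SYNC) or the dirty page range is recorded in
 * the vnode's cluster list (v_clusters) for a later push, falling back
 * to the VHASDIRTY mechanism when no cluster can be extended or pushed.
 */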
1442 static int
1443 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1444 struct vnode *vp;
1445 struct uio *uio;
1446 off_t oldEOF;
1447 off_t newEOF;
1448 off_t headOff;
1449 off_t tailOff;
1450 int devblocksize;
1451 int flags;
1452 {
1453 upl_page_info_t *pl;
1454 upl_t upl;
1455 vm_offset_t upl_offset;
1456 int upl_size;
1457 off_t upl_f_offset;
1458 int pages_in_upl;
1459 int start_offset;
1460 int xfer_resid;
1461 int io_size;
1462 int io_flags;
1463 vm_offset_t io_address;
1464 int io_offset;
1465 int bytes_to_zero;
1466 int bytes_to_move;
1467 kern_return_t kret;
1468 int retval = 0;
1469 int uio_resid;
1470 long long total_size;
1471 long long zero_cnt;
1472 off_t zero_off;
1473 long long zero_cnt1;
1474 off_t zero_off1;
1475 daddr_t start_blkno;
1476 daddr_t last_blkno;
1477
1478 if (uio) {
1479 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1480 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1481
1482 uio_resid = uio->uio_resid;
1483 } else {
1484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1485 0, 0, (int)oldEOF, (int)newEOF, 0);
1486
1487 uio_resid = 0;
1488 }
1489 zero_cnt = 0;
1490 zero_cnt1 = 0;
1491
1492 if (flags & IO_HEADZEROFILL) {
1493 /*
1494 * some filesystems (HFS is one) don't support unallocated holes within a file...
1495 * so we zero fill the intervening space between the old EOF and the offset
1496 * where the next chunk of real data begins.... ftruncate will also use this
1497 * routine to zero fill to the new EOF when growing a file... in this case, the
1498 * uio structure will not be provided
1499 */
1500 if (uio) {
1501 if (headOff < uio->uio_offset) {
1502 zero_cnt = uio->uio_offset - headOff;
1503 zero_off = headOff;
1504 }
1505 } else if (headOff < newEOF) {
1506 zero_cnt = newEOF - headOff;
1507 zero_off = headOff;
1508 }
1509 }
1510 if (flags & IO_TAILZEROFILL) {
1511 if (uio) {
1512 zero_off1 = uio->uio_offset + uio->uio_resid;
1513
1514 if (zero_off1 < tailOff)
1515 zero_cnt1 = tailOff - zero_off1;
1516 }
1517 }
1518 if (zero_cnt == 0 && uio == (struct uio *) 0)
1519 {
1520 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1521 retval, 0, 0, 0, 0);
1522 return (0);
1523 }
1524
1525 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1526 /*
1527 * for this iteration of the loop, figure out where our starting point is
1528 */
1529 if (zero_cnt) {
1530 start_offset = (int)(zero_off & PAGE_MASK_64);
1531 upl_f_offset = zero_off - start_offset;
1532 } else if (uio_resid) {
1533 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1534 upl_f_offset = uio->uio_offset - start_offset;
1535 } else {
1536 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1537 upl_f_offset = zero_off1 - start_offset;
1538 }
1539 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1540 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1541
1542 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1543 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1544
1545 /*
1546 * compute the size of the upl needed to encompass
1547 * the requested write... limit each call to cluster_io
1548 * to the maximum UPL size... cluster_io will clip if
1549 * this exceeds the maximum io_size for the device...
1550 * make sure to account for
1551 * a starting offset that's not page aligned
1552 */
1553 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1554
1555 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1556 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1557
1558 pages_in_upl = upl_size / PAGE_SIZE;
1559 io_size = upl_size - start_offset;
1560
1561 if ((long long)io_size > total_size)
1562 io_size = total_size;
1563
1564 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1565 last_blkno = start_blkno + pages_in_upl;
1566
1567 kret = ubc_create_upl(vp,
1568 upl_f_offset,
1569 upl_size,
1570 &upl,
1571 &pl,
1572 UPL_FLAGS_NONE);
1573 if (kret != KERN_SUCCESS)
1574 panic("cluster_write: failed to get pagelist");
1575
1576 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1577 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1578
1579 if (start_offset && !upl_valid_page(pl, 0)) {
1580 int read_size;
1581
1582 /*
1583 * we're starting in the middle of the first page of the upl
1584 * and the page isn't currently valid, so we're going to have
1585 * to read it in first... this is a synchronous operation
1586 */
1587 read_size = PAGE_SIZE;
1588
1589 if ((upl_f_offset + read_size) > newEOF)
1590 read_size = newEOF - upl_f_offset;
1591
1592 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1593 CL_READ, (struct buf *)0);
1594 if (retval) {
1595 /*
1596 * we had an error during the read which causes us to abort
1597 * the current cluster_write request... before we do, we need
1598 * to release the rest of the pages in the upl without modifying
1599 * their state and mark the failed page in error
1600 */
1601 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1602 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1603
1604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1605 (int)upl, 0, 0, retval, 0);
1606 break;
1607 }
1608 }
1609 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1610 /*
1611 * the last offset we're writing to in this upl does not end on a page
1612 * boundary... if it's not beyond the old EOF, then we'll also need to
1613 * pre-read this page in if it isn't already valid
1614 */
1615 upl_offset = upl_size - PAGE_SIZE;
1616
1617 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1618 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1619 int read_size;
1620
1621 read_size = PAGE_SIZE;
1622
1623 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1624 read_size = newEOF - (upl_f_offset + upl_offset);
1625
1626 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1627 CL_READ, (struct buf *)0);
1628 if (retval) {
1629 /*
1630 * we had an error during the read which causes us to abort
1631 * the current cluster_write request... before we do, we
1632 * need to release the rest of the pages in the upl without
1633 * modifying their state and mark the failed page in error
1634 */
1635 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1636 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1637
1638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1639 (int)upl, 0, 0, retval, 0);
1640 break;
1641 }
1642 }
1643 }
1644 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1645 panic("cluster_write: ubc_upl_map failed\n");
1646 xfer_resid = io_size;
1647 io_offset = start_offset;
1648
1649 while (zero_cnt && xfer_resid) {
1650
1651 if (zero_cnt < (long long)xfer_resid)
1652 bytes_to_zero = zero_cnt;
1653 else
1654 bytes_to_zero = xfer_resid;
1655
1656 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1657 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1658
1659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1660 (int)upl_f_offset + io_offset, bytes_to_zero,
1661 (int)io_offset, xfer_resid, 0);
1662 } else {
1663 int zero_pg_index;
1664
1665 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1666 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1667
1668 if ( !upl_valid_page(pl, zero_pg_index)) {
1669 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1670
1671 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1672 (int)upl_f_offset + io_offset, bytes_to_zero,
1673 (int)io_offset, xfer_resid, 0);
1674
1675 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1676 !upl_dirty_page(pl, zero_pg_index)) {
1677 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1678
1679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1680 (int)upl_f_offset + io_offset, bytes_to_zero,
1681 (int)io_offset, xfer_resid, 0);
1682 }
1683 }
1684 xfer_resid -= bytes_to_zero;
1685 zero_cnt -= bytes_to_zero;
1686 zero_off += bytes_to_zero;
1687 io_offset += bytes_to_zero;
1688 }
1689 if (xfer_resid && uio_resid) {
1690 bytes_to_move = min(uio_resid, xfer_resid);
1691
1692 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1693 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1694
1695 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1696
1697
1698 if (retval) {
1699 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1700 panic("cluster_write: kernel_upl_unmap failed\n");
1701
1702 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1703
1704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1705 (int)upl, 0, 0, retval, 0);
1706 } else {
1707 uio_resid -= bytes_to_move;
1708 xfer_resid -= bytes_to_move;
1709 io_offset += bytes_to_move;
1710 }
1711 }
1712 while (xfer_resid && zero_cnt1 && retval == 0) {
1713
1714 if (zero_cnt1 < (long long)xfer_resid)
1715 bytes_to_zero = zero_cnt1;
1716 else
1717 bytes_to_zero = xfer_resid;
1718
1719 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1720 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1721
1722 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1723 (int)upl_f_offset + io_offset,
1724 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1725 } else {
1726 int zero_pg_index;
1727
1728 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1729 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1730
1731 if ( !upl_valid_page(pl, zero_pg_index)) {
1732 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1733
1734 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1735 (int)upl_f_offset + io_offset,
1736 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1737
1738 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1739 !upl_dirty_page(pl, zero_pg_index)) {
1740 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1741
1742 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1743 (int)upl_f_offset + io_offset,
1744 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1745 }
1746 }
1747 xfer_resid -= bytes_to_zero;
1748 zero_cnt1 -= bytes_to_zero;
1749 zero_off1 += bytes_to_zero;
1750 io_offset += bytes_to_zero;
1751 }
1752
1753 if (retval == 0) {
1754 int cl_index;
1755 int can_delay;
1756
1757 io_size += start_offset;
1758
1759 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1760 /*
1761 * if we're extending the file with this write
1762 * we'll zero fill the rest of the page so that
1763 * if the file gets extended again in such a way as to leave a
1764 * hole starting at this EOF, we'll have zeros in the correct spot
1765 */
1766 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1767
1768 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1769 (int)upl_f_offset + io_size,
1770 upl_size - io_size, 0, 0, 0);
1771 }
1772 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1773 panic("cluster_write: kernel_upl_unmap failed\n");
1774
1775 if (flags & IO_SYNC)
1776 /*
1777 * if the IO_SYNC flag is set then we need to
1778 * bypass any clusters and immediately issue
1779 * the I/O
1780 */
1781 goto issue_io;
1782
1783 if (vp->v_clen == 0)
1784 /*
1785 * no clusters currently present
1786 */
1787 goto start_new_cluster;
1788
1789 /*
1790 * keep track of the overall dirty page
1791 * range we've developed
1792 * in case we have to fall back to the
1793 * VHASDIRTY method of flushing
1794 */
1795 if (vp->v_flag & VHASDIRTY)
1796 goto delay_io;
1797
1798 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1799 /*
1800 * we have an existing cluster... see if this write will extend it nicely
1801 */
1802 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1803 /*
1804 * the current write starts at or after the current cluster
1805 */
1806 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1807 /*
1808 * we have a write that fits entirely
1809 * within the existing cluster limits
1810 */
1811 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1812 /*
1813 * update our idea of where the cluster ends
1814 */
1815 vp->v_clusters[cl_index].last_pg = last_blkno;
1816 break;
1817 }
1818 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1819 /*
1820 * we have a write that starts in the middle of the current cluster
1821 * but extends beyond the cluster's limit
1822 * we'll clip the current cluster if we actually
1823 * overlap with the new write
1824 * and start a new cluster with the current write
1825 */
1826 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1827 vp->v_clusters[cl_index].last_pg = start_blkno;
1828 }
1829 /*
1830 * we also get here for the case where the current write starts
1831 * beyond the limit of the existing cluster
1832 *
1833 * in either case, we'll check the remaining clusters before
1834 * starting a new one
1835 */
1836 } else {
1837 /*
1838 * the current write starts in front of the current cluster
1839 */
1840 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1841 /*
1842 * we can just merge the old cluster
1843 * with the new request and leave it
1844 * in the cache
1845 */
1846 vp->v_clusters[cl_index].start_pg = start_blkno;
1847
1848 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1849 /*
1850 * the current write completely
1851 * envelops the existing cluster
1852 */
1853 vp->v_clusters[cl_index].last_pg = last_blkno;
1854 }
1855 break;
1856 }
1857
1858 /*
1859 * if we were to combine this write with the current cluster
1860 * we would exceed the cluster size limit.... so,
1861 * let's see if there's any overlap of the new I/O with
1862 * the existing cluster...
1863 *
1864 */
1865 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1866 /*
1867 * the current write extends into the existing cluster
1868 * clip the current cluster by moving the start position
1869 * to where the current write ends
1870 */
1871 vp->v_clusters[cl_index].start_pg = last_blkno;
1872 /*
1873 * if we get here, there was no way to merge
1874 * the new I/O with this cluster and
1875 * keep it under our maximum cluster length
1876 * we'll check the remaining clusters before starting a new one
1877 */
1878 }
1879 }
1880 if (cl_index < vp->v_clen)
1881 /*
1882 * we found an existing cluster that we
1883 * could merge this I/O into
1884 */
1885 goto delay_io;
1886
1887 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1888 /*
1889 * we didn't find an existing cluster to
1890 * merge into, but there's room to start
1891 * a new one
1892 */
1893 goto start_new_cluster;
1894
1895 /*
1896 * no existing cluster to merge with and no
1897 * room to start a new one... we'll try
1898 * pushing the existing ones... if none of
1899 * them are able to be pushed, we'll have
1900 * to fall back on the VHASDIRTY mechanism
1901 * cluster_try_push will set v_clen to the
1902 * number of remaining clusters if it is
1903 * unable to push all of them
1904 */
1905 if (vp->v_flag & VNOCACHE_DATA)
1906 can_delay = 0;
1907 else
1908 can_delay = 1;
1909
1910 if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
1911 vp->v_flag |= VHASDIRTY;
1912 goto delay_io;
1913 }
1914 start_new_cluster:
1915 if (vp->v_clen == 0) {
1916 vp->v_ciosiz = devblocksize;
1917 vp->v_cstart = start_blkno;
1918 vp->v_lastw = last_blkno;
1919 }
1920 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
1921 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
1922 vp->v_clen++;
1923 delay_io:
1924 /*
1925 * make sure we keep v_cstart and v_lastw up to
1926 * date in case we have to fall back on the
1927 * VHASDIRTY mechanism (or we've already entered it)
1928 */
1929 if (start_blkno < vp->v_cstart)
1930 vp->v_cstart = start_blkno;
1931 if (last_blkno > vp->v_lastw)
1932 vp->v_lastw = last_blkno;
1933
1934 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1935 continue;
1936 issue_io:
1937 /*
1938 * in order to maintain some semblance of coherency with mapped writes
1939 * we need to write the cluster back out as a multiple of the PAGESIZE
1940 * unless the cluster encompasses the last page of the file... in this
1941 * case we'll round out to the nearest device block boundary
1942 */
1943 io_size = upl_size;
1944
1945 if ((upl_f_offset + io_size) > newEOF) {
1946 io_size = newEOF - upl_f_offset;
1947 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1948 }
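/*
 * e.g. (illustrative numbers only): with a 512 byte devblocksize and
 * newEOF just 3 bytes past upl_f_offset, io_size is rounded up from 3
 * to 512 so the tail of the file is still written out in whole
 * device blocks
 */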
1949
1950 if (flags & IO_SYNC)
1951 io_flags = CL_COMMIT | CL_AGE;
1952 else
1953 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1954
1955 if (vp->v_flag & VNOCACHE_DATA)
1956 io_flags |= CL_DUMP;
1957
1958 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1959 vp->v_flag |= VTHROTTLED;
1960 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1961 }
1962 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
1963 io_flags, (struct buf *)0);
1964 }
1965 }
1966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1967 retval, 0, 0, 0, 0);
1968
1969 return (retval);
1970 }
1971
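/*
 * cluster_read is the top level read entry point in this file.
 * Cached reads (and anything not coming from user space) go straight
 * to cluster_read_x.  Uncached (VNOCACHE_DATA) reads from user space
 * are carved up per iovec: physically contiguous targets go to
 * cluster_phys_read, small or unaligned pieces go to cluster_read_x,
 * and the page-aligned bulk of the transfer goes to
 * cluster_nocopy_read... uio_resid is clipped around each call so the
 * helpers only see the piece they're meant to handle
 */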
1972 int
1973 cluster_read(vp, uio, filesize, devblocksize, flags)
1974 struct vnode *vp;
1975 struct uio *uio;
1976 off_t filesize;
1977 int devblocksize;
1978 int flags;
1979 {
1980 int prev_resid;
1981 int clip_size;
1982 off_t max_io_size;
1983 struct iovec *iov;
1984 vm_offset_t upl_offset;
1985 int upl_size;
1986 int pages_in_pl;
1987 upl_page_info_t *pl;
1988 int upl_flags;
1989 upl_t upl;
1990 int retval = 0;
1991
1992 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1993 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1994
1995 /*
1996 * Only uncached (VNOCACHE_DATA) reads from user space take the
1997 * nocopy read loop below... everything else goes to cluster_read_x
1998 */
1999
2000 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2001 {
2002 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2003 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2004 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2005 return(retval);
2006 }
2007
2008 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2009 {
2010 /* we know we have a resid, so this is safe */
2011 iov = uio->uio_iov;
2012 while (iov->iov_len == 0) {
2013 uio->uio_iov++;
2014 uio->uio_iovcnt--;
2015 iov = uio->uio_iov;
2016 }
2017
2018 /*
2019 * We check every vector target and if it is physically
2020 * contiguous space, we skip the sanity checks.
2021 */
2022
2023 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2024 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2025 pages_in_pl = 0;
2026 upl_flags = UPL_QUERY_OBJECT_TYPE;
2027 if((vm_map_get_upl(current_map(),
2028 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2029 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2030 {
2031 /*
2032 * the user app must have passed in an invalid address
2033 */
2034 return (EFAULT);
2035 }
2036
2037 if (upl_flags & UPL_PHYS_CONTIG)
2038 {
2039 retval = cluster_phys_read(vp, uio, filesize);
2040 }
2041 else if (uio->uio_resid < 4 * PAGE_SIZE)
2042 {
2043 /*
2044 * We set a threshold of 4 pages to decide if the nocopy
2045 * read loop is worth the trouble...
2046 */
2047 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2049 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2050 return(retval);
2051 }
2052 else if (uio->uio_offset & PAGE_MASK_64)
2053 {
2054 /* Bring the file offset read up to a pagesize boundary */
2055 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2056 if (uio->uio_resid < clip_size)
2057 clip_size = uio->uio_resid;
2058 /*
2059 * Fake the resid going into the cluster_read_x call
2060 * and restore it on the way out.
2061 */
2062 prev_resid = uio->uio_resid;
2063 uio->uio_resid = clip_size;
2064 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2065 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2066 }
2067 else if ((int)iov->iov_base & PAGE_MASK_64)
2068 {
2069 clip_size = iov->iov_len;
2070 prev_resid = uio->uio_resid;
2071 uio->uio_resid = clip_size;
2072 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2073 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2074 }
2075 else
2076 {
2077 /*
2078 * If we come in here, we know the offset into
2079 * the file is on a pagesize boundary
2080 */
2081
2082 max_io_size = filesize - uio->uio_offset;
2083 clip_size = uio->uio_resid;
2084 if (iov->iov_len < clip_size)
2085 clip_size = iov->iov_len;
2086 if (max_io_size < clip_size)
2087 clip_size = (int)max_io_size;
2088
2089 if (clip_size < PAGE_SIZE)
2090 {
2091 /*
2092 * Take care of the tail end of the read in this vector.
2093 */
2094 prev_resid = uio->uio_resid;
2095 uio->uio_resid = clip_size;
2096 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2097 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2098 }
2099 else
2100 {
2101 /* round clip_size down to a multiple of pagesize */
2102 clip_size = clip_size & ~(PAGE_MASK);
2103 prev_resid = uio->uio_resid;
2104 uio->uio_resid = clip_size;
2105 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2106 if ((retval==0) && uio->uio_resid)
2107 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2108 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2109 }
2110 } /* end else */
2111 } /* end while */
2112
2113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2114 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2115
2116 return(retval);
2117 }
2118
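/*
 * cluster_read_x is the buffered (page cache) read path.  For cached
 * user space requests it first copies out any pages that are already
 * resident via ubc_page_op()/uiomove(), then builds a UPL over the
 * remainder, issues a synchronous cluster_io for the run of invalid
 * pages, copies the data to the caller, and starts read-ahead
 * (cluster_rd_ahead / cluster_rd_prefetch) when the access pattern
 * warrants it... pages brought in are committed back to the cache,
 * or dumped when VNOCACHE_DATA is set
 */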
2119 static int
2120 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2121 struct vnode *vp;
2122 struct uio *uio;
2123 off_t filesize;
2124 int devblocksize;
2125 int flags;
2126 {
2127 upl_page_info_t *pl;
2128 upl_t upl;
2129 vm_offset_t upl_offset;
2130 int upl_size;
2131 off_t upl_f_offset;
2132 int start_offset;
2133 int start_pg;
2134 int last_pg;
2135 int uio_last;
2136 int pages_in_upl;
2137 off_t max_size;
2138 int io_size;
2139 vm_offset_t io_address;
2140 kern_return_t kret;
2141 int segflg;
2142 int error = 0;
2143 int retval = 0;
2144 int b_lblkno;
2145 int e_lblkno;
2146
2147 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2148
2149 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2150 /*
2151 * compute the size of the upl needed to encompass
2152 * the requested read... limit each call to cluster_io
2153 * to the maximum UPL size... cluster_io will clip if
2154 * this exceeds the maximum io_size for the device...
2155 * also make sure to account for
2156 * a starting offset that's not page aligned
2157 */
2158 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2159 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2160 max_size = filesize - uio->uio_offset;
2161
2162 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2163 io_size = uio->uio_resid;
2164 else
2165 io_size = max_size;
2166
2167 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2168 segflg = uio->uio_segflg;
2169
2170 uio->uio_segflg = UIO_PHYS_USERSPACE;
2171
2172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2173 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2174
2175 while (io_size && retval == 0) {
2176 int xsize;
2177 vm_offset_t paddr;
2178
2179 if (ubc_page_op(vp,
2180 upl_f_offset,
2181 UPL_POP_SET | UPL_POP_BUSY,
2182 &paddr, 0) != KERN_SUCCESS)
2183 break;
2184
2185 xsize = PAGE_SIZE - start_offset;
2186
2187 if (xsize > io_size)
2188 xsize = io_size;
2189
2190 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2191
2192 ubc_page_op(vp, upl_f_offset,
2193 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2194
2195 io_size -= xsize;
2196 start_offset = (int)
2197 (uio->uio_offset & PAGE_MASK_64);
2198 upl_f_offset = uio->uio_offset - start_offset;
2199 }
2200 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2201 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2202
2203 uio->uio_segflg = segflg;
2204
2205 if (retval)
2206 break;
2207
2208 if (io_size == 0) {
2209 /*
2210 * we're already finished with this read request
2211 * let's see if we should do a read-ahead
2212 */
2213 e_lblkno = (int)
2214 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2215
2216 if (!(vp->v_flag & VRAOFF))
2217 /*
2218 * let's try to read ahead if we're in
2219 * a sequential access pattern
2220 */
2221 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2222 vp->v_lastr = e_lblkno;
2223
2224 break;
2225 }
2226 max_size = filesize - uio->uio_offset;
2227 }
2228 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2229 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2230 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2231 pages_in_upl = upl_size / PAGE_SIZE;
2232
2233 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2234 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2235
2236 kret = ubc_create_upl(vp,
2237 upl_f_offset,
2238 upl_size,
2239 &upl,
2240 &pl,
2241 UPL_FLAGS_NONE);
2242 if (kret != KERN_SUCCESS)
2243 panic("cluster_read: failed to get pagelist");
2244
2245 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2246 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2247
2248 /*
2249 * scan from the beginning of the upl looking for the first
2250 * non-valid page.... this will become the first page in
2251 * the request we're going to make to 'cluster_io'... if all
2252 * of the pages are valid, we won't call through to 'cluster_io'
2253 */
2254 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2255 if (!upl_valid_page(pl, start_pg))
2256 break;
2257 }
2258
2259 /*
2260 * scan from the starting invalid page looking for a valid
2261 * page before the end of the upl is reached, if we
2262 * find one, then it will be the last page of the request to
2263 * 'cluster_io'
2264 */
2265 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2266 if (upl_valid_page(pl, last_pg))
2267 break;
2268 }
2269
2270 if (start_pg < last_pg) {
2271 /*
2272 * we found a range of 'invalid' pages that must be filled
2273 * if the last page in this range is the last page of the file
2274 * we may have to clip the size of it to keep from reading past
2275 * the end of the last physical block associated with the file
2276 */
2277 upl_offset = start_pg * PAGE_SIZE;
2278 io_size = (last_pg - start_pg) * PAGE_SIZE;
2279
2280 if ((upl_f_offset + upl_offset + io_size) > filesize)
2281 io_size = filesize - (upl_f_offset + upl_offset);
2282
2283 /*
2284 * issue a synchronous read to cluster_io
2285 */
2286
2287 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2288 io_size, devblocksize, CL_READ, (struct buf *)0);
2289 }
2290 if (error == 0) {
2291 /*
2292 * if the read completed successfully, or there was no I/O request
2293 * issued, then map the upl into kernel address space and
2294 * move the data into user land.... we'll first add on any 'valid'
2295 * pages that were present in the upl when we acquired it.
2296 */
2297 u_int val_size;
2298 u_int size_of_prefetch;
2299
2300 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2301 if (!upl_valid_page(pl, uio_last))
2302 break;
2303 }
2304 /*
2305 * compute size to transfer this round, if uio->uio_resid is
2306 * still non-zero after this uiomove, we'll loop around and
2307 * set up for another I/O.
2308 */
2309 val_size = (uio_last * PAGE_SIZE) - start_offset;
2310
2311 if (max_size < val_size)
2312 val_size = max_size;
2313
2314 if (uio->uio_resid < val_size)
2315 val_size = uio->uio_resid;
2316
2317 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2318
2319 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2320 /*
2321 * if there's still I/O left to do for this request, then issue a
2322 * pre-fetch I/O... the I/O wait time will overlap
2323 * with the copying of the data
2324 */
2325 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2326 } else {
2327 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2328 /*
2329 * let's try to read ahead if we're in
2330 * a sequential access pattern
2331 */
2332 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2333 vp->v_lastr = e_lblkno;
2334 }
2335 if (uio->uio_segflg == UIO_USERSPACE) {
2336 int offset;
2337
2338 segflg = uio->uio_segflg;
2339
2340 uio->uio_segflg = UIO_PHYS_USERSPACE;
2341
2342
2343 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2344 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2345
2346 offset = start_offset;
2347
2348 while (val_size && retval == 0) {
2349 int csize;
2350 int i;
2351 caddr_t paddr;
2352
2353 i = offset / PAGE_SIZE;
2354 csize = min(PAGE_SIZE - start_offset, val_size);
2355
2356 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2357
2358 retval = uiomove(paddr, csize, uio);
2359
2360 val_size -= csize;
2361 offset += csize;
2362 start_offset = offset & PAGE_MASK;
2363 }
2364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2365 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2366
2367 uio->uio_segflg = segflg;
2368 }
2369 else
2370 {
2371 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2372 panic("cluster_read: ubc_upl_map() failed\n");
2373
2374 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2375
2376 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2377 panic("cluster_read: ubc_upl_unmap() failed\n");
2378 }
2379 }
2380 if (start_pg < last_pg) {
2381 /*
2382 * compute the range of pages that we actually issued an I/O for
2383 * and either commit them as valid if the I/O succeeded
2384 * or abort them if the I/O failed
2385 */
2386 io_size = (last_pg - start_pg) * PAGE_SIZE;
2387
2388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2389 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2390
2391 if (error || (vp->v_flag & VNOCACHE_DATA))
2392 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2393 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2394 else
2395 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2396 UPL_COMMIT_CLEAR_DIRTY
2397 | UPL_COMMIT_FREE_ON_EMPTY
2398 | UPL_COMMIT_INACTIVATE);
2399
2400 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2401 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2402 }
2403 if ((last_pg - start_pg) < pages_in_upl) {
2404 int cur_pg;
2405 int commit_flags;
2406
2407 /*
2408 * the set of pages that we issued an I/O for did not encompass
2409 * the entire upl... so just release these without modifying
2410 * their state
2411 */
2412 if (error)
2413 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2414 else {
2415 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2416 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2417
2418 if (start_pg) {
2419 /*
2420 * we found some already valid pages at the beginning of
2421 * the upl... commit these back to the inactive list with
2422 * reference cleared
2423 */
2424 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2425 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2426 | UPL_COMMIT_INACTIVATE;
2427
2428 if (upl_dirty_page(pl, cur_pg))
2429 commit_flags |= UPL_COMMIT_SET_DIRTY;
2430
2431 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2432 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2433 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2434 else
2435 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2436 PAGE_SIZE, commit_flags);
2437 }
2438 }
2439 if (last_pg < uio_last) {
2440 /*
2441 * we found some already valid pages immediately after the
2442 * pages we issued I/O for, commit these back to the
2443 * inactive list with reference cleared
2444 */
2445 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2446 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2447 | UPL_COMMIT_INACTIVATE;
2448
2449 if (upl_dirty_page(pl, cur_pg))
2450 commit_flags |= UPL_COMMIT_SET_DIRTY;
2451
2452 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2453 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2454 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2455 else
2456 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2457 PAGE_SIZE, commit_flags);
2458 }
2459 }
2460 if (uio_last < pages_in_upl) {
2461 /*
2462 * there were some invalid pages beyond the valid pages
2463 * that we didn't issue an I/O for, just release them
2464 * unchanged
2465 */
2466 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2467 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2468 }
2469
2470 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2471 (int)upl, -1, -1, 0, 0);
2472 }
2473 }
2474 if (retval == 0)
2475 retval = error;
2476 }
2477
2478 return (retval);
2479 }
2480
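/*
 * cluster_nocopy_read handles the uncached, page aligned portion of a
 * user space read.  Pages already in the cache are copied out with
 * ubc_page_op()/uiomove()... for the run of pages that aren't cached,
 * the user's buffer itself is wired via vm_map_get_upl() and
 * cluster_io reads the file data directly into it with
 * CL_READ | CL_NOZERO
 */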
2481 static int
2482 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2483 struct vnode *vp;
2484 struct uio *uio;
2485 off_t filesize;
2486 int devblocksize;
2487 int flags;
2488 {
2489 upl_t upl;
2490 upl_page_info_t *pl;
2491 off_t upl_f_offset;
2492 vm_offset_t upl_offset;
2493 off_t start_upl_f_offset;
2494 off_t max_io_size;
2495 int io_size;
2496 int upl_size;
2497 int upl_needed_size;
2498 int pages_in_pl;
2499 vm_offset_t paddr;
2500 int upl_flags;
2501 kern_return_t kret;
2502 int segflg;
2503 struct iovec *iov;
2504 int i;
2505 int force_data_sync;
2506 int error = 0;
2507 int retval = 0;
2508
2509 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2510 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2511
2512 /*
2513 * When we enter this routine, we know
2514 * -- the offset into the file is on a pagesize boundary
2515 * -- the resid is a page multiple
2516 * -- the resid will not exceed iov_len
2517 */
2518
2519 iov = uio->uio_iov;
2520 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2521
2522 max_io_size = filesize - uio->uio_offset;
2523
2524 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2525 io_size = max_io_size;
2526 else
2527 io_size = uio->uio_resid;
2528
2529 /*
2530 * We don't come into this routine unless
2531 * UIO_USERSPACE is set.
2532 */
2533 segflg = uio->uio_segflg;
2534
2535 uio->uio_segflg = UIO_PHYS_USERSPACE;
2536
2537 /*
2538 * First look for pages already in the cache
2539 * and move them to user space.
2540 */
2541 while (io_size && (retval == 0)) {
2542 upl_f_offset = uio->uio_offset;
2543
2544 /*
2545 * If this call fails, it means the page is not
2546 * in the page cache.
2547 */
2548 if (ubc_page_op(vp, upl_f_offset,
2549 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2550 break;
2551
2552 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2553
2554 ubc_page_op(vp, upl_f_offset,
2555 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2556
2557 io_size -= PAGE_SIZE;
2558 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2559 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2560 }
2561
2562 uio->uio_segflg = segflg;
2563
2564 if (retval)
2565 {
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2567 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2568 return(retval);
2569 }
2570
2571 /* If we are already finished with this read, then return */
2572 if (io_size == 0)
2573 {
2574
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2576 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2577 return(0);
2578 }
2579
2580 max_io_size = io_size;
2581 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2582 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2583
2584 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2585 upl_f_offset = start_upl_f_offset;
2586 io_size = 0;
2587
2588 while(io_size < max_io_size)
2589 {
2590
2591 if(ubc_page_op(vp, upl_f_offset,
2592 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2593 {
2594 ubc_page_op(vp, upl_f_offset,
2595 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2596 break;
2597 }
2598
2599 /*
2600 * Build up the io request parameters.
2601 */
2602
2603 io_size += PAGE_SIZE;
2604 upl_f_offset += PAGE_SIZE;
2605 }
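/*
 * at this point io_size covers the run of consecutive pages, starting
 * at start_upl_f_offset, that were not found in the cache above...
 * these are the pages we'll read directly into the user's buffer
 */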
2606
2607 if (io_size == 0)
2608 return(retval);
2609
2610 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2611 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2612
2613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2614 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2615
2616 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2617 {
2618 pages_in_pl = 0;
2619 upl_size = upl_needed_size;
2620 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2621
2622 kret = vm_map_get_upl(current_map(),
2623 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2624 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2625
2626 if (kret != KERN_SUCCESS)
2627 {
2628 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2629 (int)upl_offset, upl_size, io_size, kret, 0);
2630
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2632 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2633
2634 /* cluster_nocopy_read: failed to get pagelist */
2635 /* do not return kret here */
2636 return(retval);
2637 }
2638
2639 pages_in_pl = upl_size / PAGE_SIZE;
2640 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2641
2642 for(i=0; i < pages_in_pl; i++)
2643 {
2644 if (!upl_valid_page(pl, i))
2645 break;
2646 }
2647 if (i == pages_in_pl)
2648 break;
2649
2650 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2651 UPL_ABORT_FREE_ON_EMPTY);
2652 }
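/*
 * vm_map_get_upl() is retried up to three times with an increasing
 * force_data_sync until the page list it returns is entirely valid...
 * if that still fails we bail out of the nocopy path and let the
 * caller fall back to cluster_read_x for the rest of the request
 */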
2653
2654 if (force_data_sync >= 3)
2655 {
2656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2657 (int)upl_offset, upl_size, io_size, kret, 0);
2658
2659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2660 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2661 return(retval);
2662 }
2663 /*
2664 * Consider the possibility that upl_size wasn't satisfied.
2665 */
2666 if (upl_size != upl_needed_size)
2667 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2668
2669 if (io_size == 0)
2670 {
2671 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2672 UPL_ABORT_FREE_ON_EMPTY);
2673 return(retval);
2674 }
2675
2676 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2677 (int)upl_offset, upl_size, io_size, kret, 0);
2678
2679 /*
2680 * issue a synchronous read to cluster_io
2681 */
2682
2683 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2684 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2685
2686 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2687 io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
2688
2689 if (error == 0) {
2690 /*
2691 * The cluster_io read completed successfully,
2692 * update the uio structure and commit.
2693 */
2694
2695 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2696 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2697
2698 iov->iov_base += io_size;
2699 iov->iov_len -= io_size;
2700 uio->uio_resid -= io_size;
2701 uio->uio_offset += io_size;
2702 }
2703 else {
2704 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2705 UPL_ABORT_FREE_ON_EMPTY);
2706 }
2707
2708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2709 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2710
2711 if (retval == 0)
2712 retval = error;
2713
2714 } /* end while */
2715
2716
2717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2718 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2719
2720 return (retval);
2721 }
2722
2723
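/*
 * cluster_phys_read handles a read whose target buffer the caller has
 * already determined to be physically contiguous.  The buffer is wired
 * with vm_map_get_upl(), the transfer is clipped to what remains of
 * both the iovec and the file, and a single synchronous cluster_io is
 * issued with CL_READ | CL_NOZERO | CL_DEV_MEMORY
 */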
2724 static int
2725 cluster_phys_read(vp, uio, filesize)
2726 struct vnode *vp;
2727 struct uio *uio;
2728 off_t filesize;
2729 {
2730 upl_t upl;
2731 vm_offset_t upl_offset;
2732 off_t max_size;
2733 int io_size;
2734 int upl_size;
2735 int upl_needed_size;
2736 int pages_in_pl;
2737 int upl_flags;
2738 kern_return_t kret;
2739 struct iovec *iov;
2740 int error;
2741
2742 /*
2743 * When we enter this routine, we know
2744 * -- the resid will not exceed iov_len
2745 * -- the target address is physically contiguous
2746 */
2747
2748 iov = uio->uio_iov;
2749
2750 max_size = filesize - uio->uio_offset;
2751
2752 if (max_size < (off_t)((unsigned int)iov->iov_len))
2753 io_size = max_size;
2754 else
2755 io_size = iov->iov_len;
2756
2757 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2758 upl_needed_size = upl_offset + io_size;
2759
2760 pages_in_pl = 0;
2761 upl_size = upl_needed_size;
2762 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2763
2764 kret = vm_map_get_upl(current_map(),
2765 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2766 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2767
2768 if (kret != KERN_SUCCESS)
2769 {
2770 /* cluster_phys_read: failed to get pagelist */
2771 return(EINVAL);
2772 }
2773
2774 /*
2775 * Consider the possibility that upl_size wasn't satisfied.
2776 */
2777 if (upl_size < upl_needed_size)
2778 {
2779 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2780 return(EINVAL);
2781 }
2782
2783 /*
2784 * issue a synchronous read to cluster_io
2785 */
2786
2787 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2788 io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2789
2790 if (error == 0)
2791 {
2792 /*
2793 * The cluster_io read completed successfully,
2794 * update the uio structure and commit.
2795 */
2796
2797 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2798
2799 iov->iov_base += io_size;
2800 iov->iov_len -= io_size;
2801 uio->uio_resid -= io_size;
2802 uio->uio_offset += io_size;
2803 }
2804 else
2805 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2806
2807 return (error);
2808 }
2809
2810 /*
2811 * generate advisory I/O's in the largest chunks possible
2812 * the completed pages will be released into the VM cache
2813 */
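/*
 * A caller typically hands in the byte range it expects to need soon,
 * e.g. (hypothetical values, not taken from this file):
 *
 *	error = advisory_read(vp, filesize, f_offset, resid, devblocksize);
 *
 * only pages that are absent from the cache generate I/O (the UPL is
 * created with UPL_RET_ONLY_ABSENT), and those reads are issued
 * asynchronously with CL_COMMIT | CL_AGE so the pages land in the VM
 * cache and are released as each I/O completes
 */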
2814 int
2815 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2816 struct vnode *vp;
2817 off_t filesize;
2818 off_t f_offset;
2819 int resid;
2820 int devblocksize;
2821 {
2822 upl_page_info_t *pl;
2823 upl_t upl;
2824 vm_offset_t upl_offset;
2825 int upl_size;
2826 off_t upl_f_offset;
2827 int start_offset;
2828 int start_pg;
2829 int last_pg;
2830 int pages_in_upl;
2831 off_t max_size;
2832 int io_size;
2833 kern_return_t kret;
2834 int retval = 0;
2835 int issued_io;
2836
2837 if (!UBCINFOEXISTS(vp))
2838 return(EINVAL);
2839
2840 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2841 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2842
2843 while (resid && f_offset < filesize && retval == 0) {
2844 /*
2845 * compute the size of the upl needed to encompass
2846 * the requested read... limit each call to cluster_io
2847 * to the maximum UPL size... cluster_io will clip if
2848 * this exceeds the maximum io_size for the device...
2849 * also make sure to account for
2850 * a starting offset that's not page aligned
2851 */
2852 start_offset = (int)(f_offset & PAGE_MASK_64);
2853 upl_f_offset = f_offset - (off_t)start_offset;
2854 max_size = filesize - f_offset;
2855
2856 if (resid < max_size)
2857 io_size = resid;
2858 else
2859 io_size = max_size;
2860
2861 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2862 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2863 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2864 pages_in_upl = upl_size / PAGE_SIZE;
2865
2866 kret = ubc_create_upl(vp,
2867 upl_f_offset,
2868 upl_size,
2869 &upl,
2870 &pl,
2871 UPL_RET_ONLY_ABSENT);
2872 if (kret != KERN_SUCCESS)
2873 return(retval);
2874 issued_io = 0;
2875
2876 /*
2877 * before we start marching forward, we must make sure we end on
2878 * a present page, otherwise we will be working with a freed
2879 * upl
2880 */
2881 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
2882 if (upl_page_present(pl, last_pg))
2883 break;
2884 }
2885 pages_in_upl = last_pg + 1;
2886
2887
2888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2889 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2890
2891
2892 for (last_pg = 0; last_pg < pages_in_upl; ) {
2893 /*
2894 * scan from the beginning of the upl looking for the first
2895 * page that is present.... this will become the first page in
2896 * the request we're going to make to 'cluster_io'... if all
2897 * of the pages are absent, we won't call through to 'cluster_io'
2898 */
2899 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2900 if (upl_page_present(pl, start_pg))
2901 break;
2902 }
2903
2904 /*
2905 * scan from the starting present page looking for an absent
2906 * page before the end of the upl is reached, if we
2907 * find one, then it will terminate the range of pages being
2908 * presented to 'cluster_io'
2909 */
2910 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2911 if (!upl_page_present(pl, last_pg))
2912 break;
2913 }
2914
2915 if (last_pg > start_pg) {
2916 /*
2917 * we found a range of pages that must be filled
2918 * if the last page in this range is the last page of the file
2919 * we may have to clip the size of it to keep from reading past
2920 * the end of the last physical block associated with the file
2921 */
2922 upl_offset = start_pg * PAGE_SIZE;
2923 io_size = (last_pg - start_pg) * PAGE_SIZE;
2924
2925 if ((upl_f_offset + upl_offset + io_size) > filesize)
2926 io_size = filesize - (upl_f_offset + upl_offset);
2927
2928 /*
2929 * issue an asynchronous read to cluster_io
2930 */
2931 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
2932 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2933
2934 issued_io = 1;
2935 }
2936 }
2937 if (issued_io == 0)
2938 ubc_upl_abort(upl, 0);
2939
2940 io_size = upl_size - start_offset;
2941
2942 if (io_size > resid)
2943 io_size = resid;
2944 f_offset += io_size;
2945 resid -= io_size;
2946 }
2947
2948 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2949 (int)f_offset, resid, retval, 0, 0);
2950
2951 return(retval);
2952 }
2953
2954
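/*
 * cluster_push flushes a vnode's delayed-write state.  If the vnode
 * has fallen back to the VHASDIRTY mechanism, the entire dirty range
 * [v_cstart, v_lastw) is swept in MAX_UPL_TRANSFER sized chunks via
 * cluster_push_x... otherwise cluster_try_push is asked to push all
 * of the recorded clusters
 */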
2955 int
2956 cluster_push(vp)
2957 struct vnode *vp;
2958 {
2959 int retval;
2960
2961 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
2962 vp->v_flag &= ~VHASDIRTY;
2963 return(0);
2964 }
2965
2966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
2967 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
2968
2969 if (vp->v_flag & VHASDIRTY) {
2970 daddr_t start_pg;
2971 daddr_t last_pg;
2972 daddr_t end_pg;
2973
2974 start_pg = vp->v_cstart;
2975 end_pg = vp->v_lastw;
2976
2977 vp->v_flag &= ~VHASDIRTY;
2978 vp->v_clen = 0;
2979
2980 while (start_pg < end_pg) {
2981 last_pg = start_pg + MAX_UPL_TRANSFER;
2982
2983 if (last_pg > end_pg)
2984 last_pg = end_pg;
2985
2986 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
2987
2988 start_pg = last_pg;
2989 }
2990 return (1);
2991 }
2992 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
2993
2994 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
2995 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
2996
2997 return (retval);
2998 }
2999
3000
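/*
 * cluster_try_push takes a sorted snapshot of the vnode's clusters,
 * clears v_clen so new clusters can form, and tries to push each one
 * through cluster_push_x (which may decline when can_delay is set).
 * Clusters that weren't pushed are merged back into vp->v_clusters,
 * or, if they no longer fit, the vnode falls back to the VHASDIRTY
 * mechanism... the return value is the number of free cluster slots
 * (MAX_CLUSTERS - vp->v_clen)
 */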
3001 static int
3002 cluster_try_push(vp, EOF, can_delay, push_all)
3003 struct vnode *vp;
3004 off_t EOF;
3005 int can_delay;
3006 int push_all;
3007 {
3008 int cl_index;
3009 int cl_index1;
3010 int min_index;
3011 int cl_len;
3012 int cl_total;
3013 int cl_pushed;
3014 struct v_cluster l_clusters[MAX_CLUSTERS];
3015
3016 /*
3017 * make a local 'sorted' copy of the clusters
3018 * and clear vp->v_clen so that new clusters can
3019 * be developed
3020 */
3021 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3022 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3023 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3024 continue;
3025 if (min_index == -1)
3026 min_index = cl_index1;
3027 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3028 min_index = cl_index1;
3029 }
3030 if (min_index == -1)
3031 break;
3032 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3033 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3034
3035 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3036 }
3037 cl_len = cl_index;
3038 vp->v_clen = 0;
3039
3040 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3041 /*
3042 * try to push each cluster in turn... cluster_push_x may not
3043 * push the cluster if can_delay is TRUE and the cluster doesn't
3044 * meet the criteria for an immediate push
3045 */
3046 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3047 l_clusters[cl_index].start_pg = 0;
3048 l_clusters[cl_index].last_pg = 0;
3049
3050 cl_pushed++;
3051
3052 if (push_all == 0)
3053 break;
3054 }
3055 }
3056 if (cl_len > cl_pushed) {
3057 /*
3058 * we didn't push all of the clusters, so
3059 * lets try to merge them back in to the vnode
3060 */
3061 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3062 /*
3063 * we picked up some new clusters while we were trying to
3064 * push the old ones (I don't think this can happen because
3065 * I'm holding the lock, but just in case)... the sum of the
3066 * leftovers plus the new cluster count exceeds our ability
3067 * to represent them, so fall back to the VHASDIRTY mechanism
3068 */
3069 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3070 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3071 continue;
3072
3073 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3074 vp->v_cstart = l_clusters[cl_index].start_pg;
3075 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3076 vp->v_lastw = l_clusters[cl_index].last_pg;
3077 }
3078 vp->v_flag |= VHASDIRTY;
3079 } else {
3080 /*
3081 * we've got room to merge the leftovers back in
3082 * just append them starting at the next 'hole'
3083 * represented by vp->v_clen
3084 */
3085 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3086 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3087 continue;
3088
3089 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3090 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3091
3092 if (cl_index1 == 0) {
3093 vp->v_cstart = l_clusters[cl_index].start_pg;
3094 vp->v_lastw = l_clusters[cl_index].last_pg;
3095 } else {
3096 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3097 vp->v_cstart = l_clusters[cl_index].start_pg;
3098 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3099 vp->v_lastw = l_clusters[cl_index].last_pg;
3100 }
3101 cl_index1++;
3102 }
3103 /*
3104 * update the cluster count
3105 */
3106 vp->v_clen = cl_index1;
3107 }
3108 }
3109 return(MAX_CLUSTERS - vp->v_clen);
3110 }
3111
3112
3113
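/*
 * cluster_push_x writes out the dirty pages of a single cluster
 * covering pages [first, last).  When can_delay is set, small clusters
 * and clusters that are less than half dirty are left alone (return 0).
 * Otherwise the UPL is walked... clean runs are simply released and
 * each dirty run is handed to cluster_io as a throttled asynchronous
 * write... a return of 1 means the cluster no longer needs to be
 * tracked
 */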
3114 static int
3115 cluster_push_x(vp, EOF, first, last, can_delay)
3116 struct vnode *vp;
3117 off_t EOF;
3118 daddr_t first;
3119 daddr_t last;
3120 int can_delay;
3121 {
3122 upl_page_info_t *pl;
3123 upl_t upl;
3124 vm_offset_t upl_offset;
3125 int upl_size;
3126 off_t upl_f_offset;
3127 int pages_in_upl;
3128 int start_pg;
3129 int last_pg;
3130 int io_size;
3131 int io_flags;
3132 int size;
3133 kern_return_t kret;
3134
3135
3136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3137 vp->v_clen, first, last, EOF, 0);
3138
3139 if ((pages_in_upl = last - first) == 0) {
3140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3141
3142 return (1);
3143 }
3144 upl_size = pages_in_upl * PAGE_SIZE;
3145 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3146
3147 if (upl_f_offset + upl_size >= EOF) {
3148
3149 if (upl_f_offset >= EOF) {
3150 /*
3151 * must have truncated the file and missed
3152 * clearing a dangling cluster (i.e. it's completely
3153 * beyond the new EOF)
3154 */
3155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3156
3157 return(1);
3158 }
3159 size = EOF - upl_f_offset;
3160
3161 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3162 pages_in_upl = upl_size / PAGE_SIZE;
3163 } else {
3164 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3165 return(0);
3166 size = upl_size;
3167 }
3168 kret = ubc_create_upl(vp,
3169 upl_f_offset,
3170 upl_size,
3171 &upl,
3172 &pl,
3173 UPL_RET_ONLY_DIRTY);
3174 if (kret != KERN_SUCCESS)
3175 panic("cluster_push: failed to get pagelist");
3176
3177 if (can_delay) {
3178 int num_of_dirty;
3179
3180 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3181 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3182 num_of_dirty++;
3183 }
3184 if (num_of_dirty < pages_in_upl / 2) {
3185 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3186
3187 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3188
3189 return(0);
3190 }
3191 }
3192 last_pg = 0;
3193
3194 while (size) {
3195
3196 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3197 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3198 break;
3199 }
3200 if (start_pg > last_pg) {
3201 io_size = (start_pg - last_pg) * PAGE_SIZE;
3202
3203 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3204 UPL_ABORT_FREE_ON_EMPTY);
3205
3206 if (io_size < size)
3207 size -= io_size;
3208 else
3209 break;
3210 }
3211 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3212 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3213 break;
3214 }
3215 upl_offset = start_pg * PAGE_SIZE;
3216
3217 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3218
3219 if (vp->v_flag & VNOCACHE_DATA)
3220 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3221 else
3222 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3223
3224 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3225 vp->v_flag |= VTHROTTLED;
3226 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3227 }
3228 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
3229
3230 size -= io_size;
3231 }
3232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3233
3234 return(1);
3235 }