1
2 /*
3 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * The contents of this file constitute Original Code as defined in and
8 * are subject to the Apple Public Source License Version 1.1 (the
9 * "License"). You may not use this file except in compliance with the
10 * License. Please obtain a copy of the License at
11 * http://www.apple.com/publicsource and read it before using this file.
12 *
13 * This Original Code and all software distributed under the License are
14 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
18 * License for the specific language governing rights and limitations
19 * under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*
25 * Copyright (c) 1993
26 * The Regents of the University of California. All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in the
35 * documentation and/or other materials provided with the distribution.
36 * 3. All advertising materials mentioning features or use of this software
37 * must display the following acknowledgement:
38 * This product includes software developed by the University of
39 * California, Berkeley and its contributors.
40 * 4. Neither the name of the University nor the names of its contributors
41 * may be used to endorse or promote products derived from this software
42 * without specific prior written permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
45 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
46 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
48 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
50 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
51 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
52 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
53 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 *
56 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
57 */
58
59 #include <sys/param.h>
60 #include <sys/proc.h>
61 #include <sys/buf.h>
62 #include <sys/vnode.h>
63 #include <sys/mount.h>
64 #include <sys/trace.h>
65 #include <sys/malloc.h>
66 #include <sys/resourcevar.h>
67 #include <libkern/libkern.h>
68
69 #include <sys/ubc.h>
70 #include <vm/vm_pageout.h>
71
72 #include <sys/kdebug.h>
73
74 #define CL_READ 0x01
75 #define CL_ASYNC 0x02
76 #define CL_COMMIT 0x04
77 #define CL_PAGEOUT 0x10
78 #define CL_AGE 0x20
79 #define CL_DUMP 0x40
80 #define CL_NOZERO 0x80
81 #define CL_PAGEIN 0x100
82 #define CL_DEV_MEMORY 0x200
83
84 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
85 int size, struct buf *bp);
86 static int cluster_read_x(struct vnode *vp, struct uio *uio,
87 off_t filesize, int devblocksize, int flags);
88 static int cluster_write_x(struct vnode *vp, struct uio *uio,
89 off_t oldEOF, off_t newEOF, off_t headOff,
90 off_t tailOff, int devblocksize, int flags);
91 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
92 off_t filesize, int devblocksize, int flags);
93 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
94 off_t newEOF, int devblocksize, int flags);
95 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
96 off_t filesize);
97 static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF);
98 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
99 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
100
101
102 /*
103 * throttle the number of async writes that
104 * can be outstanding on a single vnode
105 * before we issue a synchronous write
106 */
107 #define ASYNC_THROTTLE 9
108
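/*
 * cluster_iodone:
 * completion handler for the chain of component buffers issued by
 * cluster_io.  Each component points at the transaction head through
 * b_trans_head; until every buffer in the chain has B_DONE set we just
 * return.  When the last one completes we accumulate the error and
 * residual counts, free the io bufs and any malloc'd vector lists,
 * wake up writers throttled on v_numoutput, zero-fill the tail of the
 * last page if b_validend was set, pass completion on to the original
 * buffer when B_NEED_IODONE is set, and commit or abort the UPL range
 * based on the outcome and the B_COMMIT_UPL/B_NOCACHE/B_PAGEOUT flags.
 */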
109 static int
110 cluster_iodone(bp)
111 struct buf *bp;
112 {
113 int b_flags;
114 int error;
115 int total_size;
116 int total_resid;
117 int upl_offset;
118 int zero_offset;
119 upl_t upl;
120 struct buf *cbp;
121 struct buf *cbp_head;
122 struct buf *cbp_next;
123 struct buf *real_bp;
124 struct vnode *vp;
125 int commit_size;
126 int pg_offset;
127
128
129 cbp_head = (struct buf *)(bp->b_trans_head);
130
131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
132 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
133
134 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
135 /*
136 * all I/O requests that are part of this transaction
137 * have to complete before we can process it
138 */
139 if ( !(cbp->b_flags & B_DONE)) {
140
141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
142 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
143
144 return 0;
145 }
146 }
147 error = 0;
148 total_size = 0;
149 total_resid = 0;
150
151 cbp = cbp_head;
152 upl_offset = cbp->b_uploffset;
153 upl = cbp->b_pagelist;
154 b_flags = cbp->b_flags;
155 real_bp = cbp->b_real_bp;
156 vp = cbp->b_vp;
157 zero_offset= cbp->b_validend;
158
159 while (cbp) {
160 if (cbp->b_vectorcount > 1)
161 _FREE(cbp->b_vectorlist, M_SEGMENT);
162
163 if ((cbp->b_flags & B_ERROR) && error == 0)
164 error = cbp->b_error;
165
166 total_resid += cbp->b_resid;
167 total_size += cbp->b_bcount;
168
169 cbp_next = cbp->b_trans_next;
170
171 free_io_buf(cbp);
172
173 cbp = cbp_next;
174 }
175 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
176 vp->v_flag &= ~VTHROTTLED;
177 wakeup((caddr_t)&vp->v_numoutput);
178 }
179 if (zero_offset)
180 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
181
182 if ((b_flags & B_NEED_IODONE) && real_bp) {
183 if (error) {
184 real_bp->b_flags |= B_ERROR;
185 real_bp->b_error = error;
186 }
187 real_bp->b_resid = total_resid;
188
189 biodone(real_bp);
190 }
191 if (error == 0 && total_resid)
192 error = EIO;
193
194 if (b_flags & B_COMMIT_UPL) {
195 pg_offset = upl_offset & PAGE_MASK;
196 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
197
198 if (error || (b_flags & B_NOCACHE)) {
199 int upl_abort_code;
200
201 if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
202 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
203 else if (b_flags & B_PGIN)
204 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
205 else
206 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
207
208 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
209 upl_abort_code);
210
211 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
212 (int)upl, upl_offset - pg_offset, commit_size,
213 0x80000000|upl_abort_code, 0);
214
215 } else {
216 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
217
218 if ( !(b_flags & B_PAGEOUT))
219 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
220 if (b_flags & B_AGE)
221 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
222
223 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
224 upl_commit_flags);
225
226 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
227 (int)upl, upl_offset - pg_offset, commit_size,
228 upl_commit_flags, 0);
229 }
230 } else
231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
232 (int)upl, upl_offset, 0, error, 0);
233
234 return (error);
235 }
236
237
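/*
 * cluster_zero:
 * zero 'size' bytes of the upl starting at 'upl_offset'... if the
 * caller's buffer doesn't already provide a kernel mapping (bp is NULL
 * or b_data is NULL), temporarily map the upl with ubc_upl_map, bzero
 * the range, and unmap it again.
 */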
238 static void
239 cluster_zero(upl, upl_offset, size, bp)
240 upl_t upl;
241 vm_offset_t upl_offset;
242 int size;
243 struct buf *bp;
244 {
245 vm_offset_t io_addr = 0;
246 int must_unmap = 0;
247 kern_return_t kret;
248
249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
250 upl_offset, size, (int)bp, 0, 0);
251
252 if (bp == NULL || bp->b_data == NULL) {
253 kret = ubc_upl_map(upl, &io_addr);
254
255 if (kret != KERN_SUCCESS)
256 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
257 if (io_addr == 0)
258 panic("cluster_zero: ubc_upl_map() mapped 0");
259
260 must_unmap = 1;
261 } else
262 io_addr = (vm_offset_t)bp->b_data;
263 bzero((caddr_t)(io_addr + upl_offset), size);
264
265 if (must_unmap) {
266 kret = ubc_upl_unmap(upl);
267
268 if (kret != KERN_SUCCESS)
269 panic("cluster_zero: kernel_upl_unmap failed");
270 }
271 }
272
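/*
 * cluster_io:
 * core routine that turns a (upl, upl_offset, f_offset, size) request
 * into device I/O.  The size is rounded up to devblocksize, VOP_CMAP is
 * used to translate file offsets into runs of device blocks, and each
 * run is described by a chain of io bufs whose per-page iovec lists are
 * bounded by the max_iosize/max_vectors reported by vfs_io_attributes.
 * Holes are zero-filled on reads and pushed individually on writes,
 * CL_DEV_MEMORY requests are treated as a single physically contiguous
 * "page", and the buffers are linked into a transaction headed by
 * cbp_head before being handed to VOP_STRATEGY.  Synchronous callers
 * biowait for the chain and collect status through cluster_iodone; on
 * error the unissued portion of the upl is committed or aborted
 * according to the CL_* flags passed in.
 */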
273 static int
274 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
275 struct vnode *vp;
276 upl_t upl;
277 vm_offset_t upl_offset;
278 off_t f_offset;
279 int non_rounded_size;
280 int devblocksize;
281 int flags;
282 struct buf *real_bp;
283 {
284 struct buf *cbp;
285 struct iovec *iovp;
286 u_int size;
287 int io_flags;
288 int error = 0;
289 int retval = 0;
290 struct buf *cbp_head = 0;
291 struct buf *cbp_tail = 0;
292 upl_page_info_t *pl;
293 int buf_count = 0;
294 int pg_count;
295 int pg_offset;
296 u_int max_iosize;
297 u_int max_vectors;
298 int priv;
299 int zero_offset = 0;
300
301 if (flags & CL_READ) {
302 io_flags = (B_VECTORLIST | B_READ);
303
304 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
305 } else {
306 io_flags = (B_VECTORLIST | B_WRITEINPROG);
307
308 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
309 }
310 pl = ubc_upl_pageinfo(upl);
311
312 if (flags & CL_ASYNC)
313 io_flags |= (B_CALL | B_ASYNC);
314 if (flags & CL_AGE)
315 io_flags |= B_AGE;
316 if (flags & CL_DUMP)
317 io_flags |= B_NOCACHE;
318 if (flags & CL_PAGEIN)
319 io_flags |= B_PGIN;
320
321 if (devblocksize)
322 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
323 else
324 size = non_rounded_size;
325
326
327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
328 (int)f_offset, size, upl_offset, flags, 0);
329
330 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 331 /*
 332 * we're reading up to the end of a file whose size isn't a
 333 * multiple of PAGE_SIZE, so we're going to end up with a page
 334 * that we can't completely fill from the file...
 335 * go ahead and zero out the portion of the page we can't
 336 * read in from the file
 337 */
338 zero_offset = upl_offset + non_rounded_size;
339 }
340 while (size) {
341 size_t io_size;
342 int vsize;
343 int i;
344 int pl_index;
345 int pg_resid;
346 int num_contig;
347 daddr_t lblkno;
348 daddr_t blkno;
349
350 if (size > max_iosize)
351 io_size = max_iosize;
352 else
353 io_size = size;
354
355 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
356 if (error == EOPNOTSUPP)
357 panic("VOP_CMAP Unimplemented");
358 break;
359 }
360
361 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
362 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
363
364 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
365 if (flags & CL_PAGEOUT) {
366 error = EINVAL;
367 break;
368 };
369
370 /* Try paging out the page individually before
371 giving up entirely and dumping it (it could
372 be mapped in a "hole" and require allocation
 373 before the I/O):
374 */
375 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
376 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
377 error = EINVAL;
378 break;
379 };
380
381 upl_offset += PAGE_SIZE_64;
382 f_offset += PAGE_SIZE_64;
383 size -= PAGE_SIZE_64;
384 continue;
385 }
386 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
387 /*
388 * we have now figured out how much I/O we can do - this is in 'io_size'
389 * pl_index represents the first page in the 'upl' that the I/O will occur for
390 * pg_offset is the starting point in the first page for the I/O
391 * pg_count is the number of full and partial pages that 'io_size' encompasses
392 */
393 pl_index = upl_offset / PAGE_SIZE;
394 pg_offset = upl_offset & PAGE_MASK;
395 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
396
397 if (flags & CL_DEV_MEMORY) {
398 /*
399 * currently, can't deal with reading 'holes' in file
400 */
401 if ((long)blkno == -1) {
402 error = EINVAL;
403 break;
404 }
405 /*
406 * treat physical requests as one 'giant' page
407 */
408 pg_count = 1;
409 }
410 if ((flags & CL_READ) && (long)blkno == -1) {
411 int bytes_to_zero;
412
413 /*
414 * if we're reading and blkno == -1, then we've got a
415 * 'hole' in the file that we need to deal with by zeroing
416 * out the affected area in the upl
417 */
418 if (zero_offset && io_size == size) {
419 /*
420 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 421 * then 'zero_offset' will be non-zero
 422 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
 423 * (indicated by the io_size finishing off the I/O request for this UPL)
 424 * then we're not going to issue an I/O for the
425 * last page in this upl... we need to zero both the hole and the tail
426 * of the page beyond the EOF, since the delayed zero-fill won't kick in
427 */
428 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
429
430 zero_offset = 0;
431 } else
432 bytes_to_zero = io_size;
433
434 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
435
436 if (cbp_head)
437 /*
438 * if there is a current I/O chain pending
439 * then the first page of the group we just zero'd
440 * will be handled by the I/O completion if the zero
441 * fill started in the middle of the page
442 */
443 pg_count = (io_size - pg_offset) / PAGE_SIZE;
444 else {
445 /*
446 * no pending I/O to pick up that first page
447 * so, we have to make sure it gets committed
448 * here.
449 * set the pg_offset to 0 so that the upl_commit_range
450 * starts with this page
451 */
452 pg_count = (io_size + pg_offset) / PAGE_SIZE;
453 pg_offset = 0;
454 }
455 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
456 /*
457 * if we're done with the request for this UPL
458 * then we have to make sure to commit the last page
459 * even if we only partially zero-filled it
460 */
461 pg_count++;
462
463 if (pg_count) {
464 if (pg_offset)
465 pg_resid = PAGE_SIZE - pg_offset;
466 else
467 pg_resid = 0;
468
469 if (flags & CL_COMMIT)
470 ubc_upl_commit_range(upl,
471 (upl_offset + pg_resid) & ~PAGE_MASK,
472 pg_count * PAGE_SIZE,
473 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
474 }
475 upl_offset += io_size;
476 f_offset += io_size;
477 size -= io_size;
478
479 if (cbp_head && pg_count)
480 goto start_io;
481 continue;
482
483 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
484 real_bp->b_blkno = blkno;
485 }
486
487 if (pg_count > 1) {
488 if (pg_count > max_vectors) {
489 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
490
491 if (io_size < 0) {
492 io_size = PAGE_SIZE - pg_offset;
493 pg_count = 1;
494 } else
495 pg_count = max_vectors;
496 }
497 /*
498 * we need to allocate space for the vector list
499 */
500 if (pg_count > 1) {
501 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
502 M_SEGMENT, M_NOWAIT);
503
504 if (iovp == (struct iovec *) 0) {
505 /*
506 * if the allocation fails, then throttle down to a single page
507 */
508 io_size = PAGE_SIZE - pg_offset;
509 pg_count = 1;
510 }
511 }
512 }
513
514 /* Throttle the speculative IO */
515 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
516 priv = 0;
517 else
518 priv = 1;
519
520 cbp = alloc_io_buf(vp, priv);
521
522 if (pg_count == 1)
523 /*
524 * we use the io vector that's reserved in the buffer header
 525 * this ensures we can always issue an I/O even in a low memory
526 * condition that prevents the _MALLOC from succeeding... this
527 * is necessary to prevent deadlocks with the pager
528 */
529 iovp = (struct iovec *)(&cbp->b_vects[0]);
530
531 cbp->b_vectorlist = (void *)iovp;
532 cbp->b_vectorcount = pg_count;
533
534 if (flags & CL_DEV_MEMORY) {
535
536 iovp->iov_len = io_size;
537 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
538
539 if (iovp->iov_base == (caddr_t) 0) {
540 free_io_buf(cbp);
541 error = EINVAL;
542 } else
543 iovp->iov_base += upl_offset;
544 } else {
545
546 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
547 int psize;
548
549 psize = PAGE_SIZE - pg_offset;
550
551 if (psize > vsize)
552 psize = vsize;
553
554 iovp->iov_len = psize;
555 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
556
557 if (iovp->iov_base == (caddr_t) 0) {
558 if (pg_count > 1)
559 _FREE(cbp->b_vectorlist, M_SEGMENT);
560 free_io_buf(cbp);
561
562 error = EINVAL;
563 break;
564 }
565 iovp->iov_base += pg_offset;
566 pg_offset = 0;
567
568 if (flags & CL_PAGEOUT) {
569 int s;
570 struct buf *bp;
571
572 s = splbio();
573 if (bp = incore(vp, lblkno + i)) {
574 if (!ISSET(bp->b_flags, B_BUSY)) {
575 bremfree(bp);
576 SET(bp->b_flags, (B_BUSY | B_INVAL));
577 splx(s);
578 brelse(bp);
579 } else
580 panic("BUSY bp found in cluster_io");
581 }
582 splx(s);
583 }
584 vsize -= psize;
585 }
586 }
587 if (error)
588 break;
589
590 if (flags & CL_ASYNC)
591 cbp->b_iodone = (void *)cluster_iodone;
592 cbp->b_flags |= io_flags;
593
594 cbp->b_lblkno = lblkno;
595 cbp->b_blkno = blkno;
596 cbp->b_bcount = io_size;
597 cbp->b_pagelist = upl;
598 cbp->b_uploffset = upl_offset;
599 cbp->b_trans_next = (struct buf *)0;
600
601 if (flags & CL_READ)
602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
603 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
604 else
605 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
606 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
607
608 if (cbp_head) {
609 cbp_tail->b_trans_next = cbp;
610 cbp_tail = cbp;
611 } else {
612 cbp_head = cbp;
613 cbp_tail = cbp;
614 }
615 (struct buf *)(cbp->b_trans_head) = cbp_head;
616 buf_count++;
617
618 upl_offset += io_size;
619 f_offset += io_size;
620 size -= io_size;
621
622 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
623 /*
624 * if we have no more I/O to issue or
625 * the current I/O we've prepared fully
626 * completes the last page in this request
627 * and it's either an ASYNC request or
628 * we've already accumulated more than 8 I/O's into
629 * this transaction and it's not an I/O directed to
630 * special DEVICE memory
631 * then go ahead and issue the I/O
632 */
633 start_io:
634 if (flags & CL_COMMIT)
635 cbp_head->b_flags |= B_COMMIT_UPL;
636 if (flags & CL_PAGEOUT)
637 cbp_head->b_flags |= B_PAGEOUT;
638 if (flags & CL_PAGEIN)
639 cbp_head->b_flags |= B_PGIN;
640
641 if (real_bp) {
642 cbp_head->b_flags |= B_NEED_IODONE;
643 cbp_head->b_real_bp = real_bp;
644 } else
645 cbp_head->b_real_bp = (struct buf *)NULL;
646
647 if (size == 0) {
648 /*
649 * we're about to issue the last I/O for this upl
650 * if this was a read to the eof and the eof doesn't
 651 * finish on a page boundary, then we need to zero-fill
652 * the rest of the page....
653 */
654 cbp_head->b_validend = zero_offset;
655 } else
656 cbp_head->b_validend = 0;
657
658 for (cbp = cbp_head; cbp;) {
659 struct buf * cbp_next;
660
661 if (io_flags & B_WRITEINPROG)
662 cbp->b_vp->v_numoutput++;
663
664 cbp_next = cbp->b_trans_next;
665
666 (void) VOP_STRATEGY(cbp);
667 cbp = cbp_next;
668 }
669 if ( !(flags & CL_ASYNC)) {
670 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
671 biowait(cbp);
672
673 if (error = cluster_iodone(cbp_head)) {
674 if ((flags & CL_PAGEOUT) && (error == ENXIO))
675 retval = 0; /* drop the error */
676 else
677 retval = error;
678 error = 0;
679 }
680 }
681 cbp_head = (struct buf *)0;
682 cbp_tail = (struct buf *)0;
683
684 buf_count = 0;
685 }
686 }
687 if (error) {
688 int abort_size;
689
690 for (cbp = cbp_head; cbp;) {
691 struct buf * cbp_next;
692
693 if (cbp->b_vectorcount > 1)
694 _FREE(cbp->b_vectorlist, M_SEGMENT);
695 upl_offset -= cbp->b_bcount;
696 size += cbp->b_bcount;
697
698 cbp_next = cbp->b_trans_next;
699 free_io_buf(cbp);
700 cbp = cbp_next;
701 }
702 pg_offset = upl_offset & PAGE_MASK;
703 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
704
705 if (flags & CL_COMMIT) {
706 int upl_abort_code;
707
708 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
709 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
710 else if (flags & CL_PAGEIN)
711 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
712 else
713 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
714
715 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
716 upl_abort_code);
717
718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
719 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
720 }
721 if (real_bp) {
722 real_bp->b_flags |= B_ERROR;
723 real_bp->b_error = error;
724
725 biodone(real_bp);
726 }
727 if (retval == 0)
728 retval = error;
729 }
730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
731 (int)f_offset, size, upl_offset, retval, 0);
732
733 return (retval);
734 }
735
736
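/*
 * cluster_rd_prefetch:
 * issue an advisory read of up to MAX_UPL_TRANSFER pages starting at
 * f_offset, clipped to the end of the file.  Pages at the front of the
 * range that are already resident (ubc_page_op succeeds) are skipped.
 * Returns the number of pages the prefetch request spanned so the
 * caller can advance its read-ahead window.
 */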
737 static int
738 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
739 struct vnode *vp;
740 off_t f_offset;
741 u_int size;
742 off_t filesize;
743 int devblocksize;
744 {
745 int pages_to_fetch;
746 int skipped_pages;
747
748 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
749 (int)f_offset, size, (int)filesize, 0, 0);
750
751 if (f_offset >= filesize) {
752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
753 (int)f_offset, 0, 0, 0, 0);
754 return(0);
755 }
756 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
757 size = MAX_UPL_TRANSFER * PAGE_SIZE;
758 else
759 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
760
761 if ((off_t)size > (filesize - f_offset))
762 size = filesize - f_offset;
763
764 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
765
766 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
767 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
768 break;
769 f_offset += PAGE_SIZE;
770 size -= PAGE_SIZE;
771 }
772 if (skipped_pages < pages_to_fetch)
773 advisory_read(vp, filesize, f_offset, size, devblocksize);
774
775 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
776 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
777
778 return (pages_to_fetch);
779 }
780
781
782
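/*
 * cluster_rd_ahead:
 * sequential read-ahead heuristic driven by v_lastr, v_maxra and
 * v_ralen on the vnode.  If the current read doesn't follow the last
 * one the window is reset; otherwise v_ralen is doubled (capped at
 * MAX_UPL_TRANSFER) and, unless enough read-ahead is already pending
 * beyond e_lblkno, a prefetch is started just past the larger of
 * e_lblkno and v_maxra via cluster_rd_prefetch.
 */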
783 static void
784 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
785 struct vnode *vp;
786 daddr_t b_lblkno;
787 daddr_t e_lblkno;
788 off_t filesize;
789 int devblocksize;
790 {
791 daddr_t r_lblkno;
792 off_t f_offset;
793 int size_of_prefetch;
794 int max_pages;
795
796 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
797 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
798
799 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
801 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
802 return;
803 }
804
805 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
806 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
807 vp->v_ralen = 0;
808 vp->v_maxra = 0;
809
810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
811 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
812
813 return;
814 }
815 max_pages = MAX_UPL_TRANSFER;
816
817 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
818
819 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
820 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
821
822 if (e_lblkno < vp->v_maxra) {
823 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
824
825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
826 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
827 return;
828 }
829 }
830 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
831 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
832
833 if (f_offset < filesize) {
834 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
835
836 if (size_of_prefetch)
837 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
838 }
839 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
840 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
841 }
842
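/*
 * cluster_pageout:
 * helper typically called from a filesystem's VOP_PAGEOUT.  Validates
 * the request (page aligned, non-negative, within the file, writable
 * mount), clips it to the EOF, aborts any portion of the upl beyond
 * that, throttles against ASYNC_THROTTLE outstanding writes on the
 * vnode, and issues the rest through cluster_io with CL_PAGEOUT.
 */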
843 int
844 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
845 struct vnode *vp;
846 upl_t upl;
847 vm_offset_t upl_offset;
848 off_t f_offset;
849 int size;
850 off_t filesize;
851 int devblocksize;
852 int flags;
853 {
854 int io_size;
855 int pg_size;
856 off_t max_size;
857 int local_flags = CL_PAGEOUT;
858
859 if ((flags & UPL_IOSYNC) == 0)
860 local_flags |= CL_ASYNC;
861 if ((flags & UPL_NOCOMMIT) == 0)
862 local_flags |= CL_COMMIT;
863
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
866 (int)f_offset, size, (int)filesize, local_flags, 0);
867
868 /*
869 * If they didn't specify any I/O, then we are done...
870 * we can't issue an abort because we don't know how
871 * big the upl really is
872 */
873 if (size <= 0)
874 return (EINVAL);
875
876 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
877 if (local_flags & CL_COMMIT)
878 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
879 return (EROFS);
880 }
881 /*
 882 * can't page-out from a negative offset
883 * or if we're starting beyond the EOF
884 * or if the file offset isn't page aligned
885 * or the size requested isn't a multiple of PAGE_SIZE
886 */
887 if (f_offset < 0 || f_offset >= filesize ||
888 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
889 if (local_flags & CL_COMMIT)
890 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
891 return (EINVAL);
892 }
893 max_size = filesize - f_offset;
894
895 if (size < max_size)
896 io_size = size;
897 else
898 io_size = max_size;
899
900 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
901
902 if (size > pg_size) {
903 if (local_flags & CL_COMMIT)
904 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
905 UPL_ABORT_FREE_ON_EMPTY);
906 }
907 while (vp->v_numoutput >= ASYNC_THROTTLE) {
908 vp->v_flag |= VTHROTTLED;
909 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
910 }
911
912 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
913 local_flags, (struct buf *)0));
914 }
915
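/*
 * cluster_pagein:
 * helper typically called from a filesystem's VOP_PAGEIN.  Performs the
 * same sanity checks on the read side, clips the transfer to the EOF,
 * aborts the part of the upl past the rounded transfer size, issues the
 * read through cluster_io with CL_READ | CL_PAGEIN, and on success
 * feeds the sequential read-ahead logic in cluster_rd_ahead.
 */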
916 int
917 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
918 struct vnode *vp;
919 upl_t upl;
920 vm_offset_t upl_offset;
921 off_t f_offset;
922 int size;
923 off_t filesize;
924 int devblocksize;
925 int flags;
926 {
927 u_int io_size;
928 int rounded_size;
929 off_t max_size;
930 int retval;
931 int local_flags = 0;
932
933 if (upl == NULL || size < 0)
934 panic("cluster_pagein: NULL upl passed in");
935
936 if ((flags & UPL_IOSYNC) == 0)
937 local_flags |= CL_ASYNC;
938 if ((flags & UPL_NOCOMMIT) == 0)
939 local_flags |= CL_COMMIT;
940
941
942 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
943 (int)f_offset, size, (int)filesize, local_flags, 0);
944
945 /*
946 * can't page-in from a negative offset
947 * or if we're starting beyond the EOF
948 * or if the file offset isn't page aligned
949 * or the size requested isn't a multiple of PAGE_SIZE
950 */
951 if (f_offset < 0 || f_offset >= filesize ||
952 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
953 if (local_flags & CL_COMMIT)
954 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
955 return (EINVAL);
956 }
957 max_size = filesize - f_offset;
958
959 if (size < max_size)
960 io_size = size;
961 else
962 io_size = max_size;
963
964 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
965
966 if (size > rounded_size && (local_flags & CL_COMMIT))
967 ubc_upl_abort_range(upl, upl_offset + rounded_size,
968 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
969
970 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
971 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
972
973 if (retval == 0) {
974 int b_lblkno;
975 int e_lblkno;
976
977 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
978 e_lblkno = (int)
979 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
980
981 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
982 /*
 983 * we haven't read the last page of the file yet
984 * so let's try to read ahead if we're in
985 * a sequential access pattern
986 */
987 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
988 }
989 vp->v_lastr = e_lblkno;
990 }
991 return (retval);
992 }
993
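/*
 * cluster_bp:
 * adapt a conventional struct buf that already carries a upl in
 * b_pagelist onto cluster_io, converting its logical block number to a
 * file offset with ubc_blktooff and issuing it asynchronously.
 */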
994 int
995 cluster_bp(bp)
996 struct buf *bp;
997 {
998 off_t f_offset;
999 int flags;
1000
1001 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1002 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1003
1004 if (bp->b_pagelist == (upl_t) 0)
1005 panic("cluster_bp: can't handle NULL upl yet\n");
1006 if (bp->b_flags & B_READ)
1007 flags = CL_ASYNC | CL_READ;
1008 else
1009 flags = CL_ASYNC;
1010
1011 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1012
1013 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
1014 }
1015
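/*
 * cluster_write:
 * top level write entry point.  Unless the vnode is marked
 * VNOCACHE_DATA and the uio comes from user space, everything goes
 * through the buffered path in cluster_write_x.  Otherwise each iovec
 * is examined: physically contiguous user buffers go to
 * cluster_phys_write, small or misaligned pieces (and head/tail
 * zero-fill work) go to cluster_write_x, and page-aligned bulk data is
 * written directly from the user's pages by cluster_nocopy_write.
 *
 * Illustrative call from a filesystem's VOP_WRITE (the 'filesize' and
 * 'devBlockSize' names are placeholders, not taken from this file):
 *
 *	error = cluster_write(vp, uio, oldsize, filesize,
 *	                      (off_t)0, (off_t)0, devBlockSize, ioflag);
 */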
1016 int
1017 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1018 struct vnode *vp;
1019 struct uio *uio;
1020 off_t oldEOF;
1021 off_t newEOF;
1022 off_t headOff;
1023 off_t tailOff;
1024 int devblocksize;
1025 int flags;
1026 {
1027 int prev_resid;
1028 int clip_size;
1029 off_t max_io_size;
1030 struct iovec *iov;
1031 vm_offset_t upl_offset;
1032 int upl_size;
1033 int pages_in_pl;
1034 upl_page_info_t *pl;
1035 int upl_flags;
1036 upl_t upl;
1037 int retval = 0;
1038
1039
1040 if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1041 {
1042 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1043 return(retval);
1044 }
1045
1046 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1047 {
1048 /* we know we have a resid, so this is safe */
1049 iov = uio->uio_iov;
1050 while (iov->iov_len == 0) {
1051 uio->uio_iov++;
1052 uio->uio_iovcnt--;
1053 iov = uio->uio_iov;
1054 }
1055
1056 /*
1057 * We check every vector target and if it is physically
1058 * contiguous space, we skip the sanity checks.
1059 */
1060
1061 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1062 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1063 pages_in_pl = 0;
1064 upl_flags = UPL_QUERY_OBJECT_TYPE;
1065 if ((vm_map_get_upl(current_map(),
1066 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1067 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1068 {
1069 /*
1070 * the user app must have passed in an invalid address
1071 */
1072 return (EFAULT);
1073 }
1074
1075 if (upl_flags & UPL_PHYS_CONTIG)
1076 {
1077 /*
1078 * since the interface to the IOKit below us uses physical block #'s and
1079 * block counts to specify the I/O, we can't handle anything that isn't
1080 * devblocksize aligned
1081 */
1082 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1083 return(EINVAL);
1084
1085 if (flags & IO_HEADZEROFILL)
1086 {
1087 flags &= ~IO_HEADZEROFILL;
1088
1089 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1090 return(retval);
1091 }
1092
1093 retval = cluster_phys_write(vp, uio, newEOF);
1094
1095 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1096 {
1097 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1098 return(retval);
1099 }
1100 }
1101 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1102 {
1103 /*
 1104 * We set a threshold of 4 pages to decide if the nocopy
1105 * write loop is worth the trouble...
1106 * we also come here if we're trying to zero the head and/or tail
1107 * of a partially written page, and the user source is not a physically contiguous region
1108 */
1109 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1110 return(retval);
1111 }
1112 else if (uio->uio_offset & PAGE_MASK_64)
1113 {
1114 /* Bring the file offset write up to a pagesize boundary */
1115 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1116 if (uio->uio_resid < clip_size)
1117 clip_size = uio->uio_resid;
1118 /*
1119 * Fake the resid going into the cluster_write_x call
1120 * and restore it on the way out.
1121 */
1122 prev_resid = uio->uio_resid;
1123 uio->uio_resid = clip_size;
1124 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1125 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1126 }
1127 else if ((int)iov->iov_base & PAGE_MASK_64)
1128 {
1129 clip_size = iov->iov_len;
1130 prev_resid = uio->uio_resid;
1131 uio->uio_resid = clip_size;
1132 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1133 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1134 }
1135 else
1136 {
1137 /*
1138 * If we come in here, we know the offset into
1139 * the file is on a pagesize boundary
1140 */
1141
1142 max_io_size = newEOF - uio->uio_offset;
1143 clip_size = uio->uio_resid;
1144 if (iov->iov_len < clip_size)
1145 clip_size = iov->iov_len;
1146 if (max_io_size < clip_size)
1147 clip_size = max_io_size;
1148
1149 if (clip_size < PAGE_SIZE)
1150 {
1151 /*
1152 * Take care of tail end of write in this vector
1153 */
1154 prev_resid = uio->uio_resid;
1155 uio->uio_resid = clip_size;
1156 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1157 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1158 }
1159 else
1160 {
1161 /* round clip_size down to a multiple of pagesize */
1162 clip_size = clip_size & ~(PAGE_MASK);
1163 prev_resid = uio->uio_resid;
1164 uio->uio_resid = clip_size;
1165 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1166 if ((retval == 0) && uio->uio_resid)
1167 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1168 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1169 }
1170 } /* end else */
1171 } /* end while */
1172 return(retval);
1173 }
1174
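/*
 * cluster_nocopy_write:
 * direct (uncached) write path.  Any delayed-write clusters are pushed
 * first, then for each chunk of up to MAX_UPL_TRANSFER pages the user's
 * pages are wired with vm_map_get_upl (retrying with increasing
 * force_data_sync), any matching pages already in the cache are thrown
 * out with ubc_page_op(... UPL_POP_DUMP ...), and a synchronous
 * cluster_io is issued straight from the user's buffer.  The upl is
 * released with an abort rather than a commit so the dirty state of the
 * source pages is left untouched by the write.
 */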
1175 static int
1176 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1177 struct vnode *vp;
1178 struct uio *uio;
1179 off_t newEOF;
1180 int devblocksize;
1181 int flags;
1182 {
1183 upl_t upl;
1184 upl_page_info_t *pl;
1185 off_t upl_f_offset;
1186 vm_offset_t upl_offset;
1187 off_t max_io_size;
1188 int io_size;
1189 int upl_size;
1190 int upl_needed_size;
1191 int pages_in_pl;
1192 int upl_flags;
1193 kern_return_t kret;
1194 struct iovec *iov;
1195 int i;
1196 int force_data_sync;
1197 int error = 0;
1198
1199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1200 (int)uio->uio_offset, (int)uio->uio_resid,
1201 (int)newEOF, devblocksize, 0);
1202
1203 /*
1204 * When we enter this routine, we know
1205 * -- the offset into the file is on a pagesize boundary
1206 * -- the resid is a page multiple
1207 * -- the resid will not exceed iov_len
1208 */
1209 cluster_try_push(vp, newEOF, 0, 1);
1210
1211 iov = uio->uio_iov;
1212
1213 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1214 io_size = uio->uio_resid;
1215
1216 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1217 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1218
1219 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1220 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1221
1222 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1223 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1224
1225 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1226 {
1227 pages_in_pl = 0;
1228 upl_size = upl_needed_size;
1229 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1230 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1231
1232 kret = vm_map_get_upl(current_map(),
1233 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1234 &upl_size,
1235 &upl,
1236 NULL,
1237 &pages_in_pl,
1238 &upl_flags,
1239 force_data_sync);
1240
1241 if (kret != KERN_SUCCESS)
1242 {
1243 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1244 0, 0, 0, kret, 0);
1245
1246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1247 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1248
1249 /* cluster_nocopy_write: failed to get pagelist */
1250 /* do not return kret here */
1251 return(0);
1252 }
1253
1254 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1255 pages_in_pl = upl_size / PAGE_SIZE;
1256
1257 for(i=0; i < pages_in_pl; i++)
1258 {
1259 if (!upl_valid_page(pl, i))
1260 break;
1261 }
1262
1263 if (i == pages_in_pl)
1264 break;
1265
1266 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1267 UPL_ABORT_FREE_ON_EMPTY);
1268 }
1269
1270 if (force_data_sync >= 3)
1271 {
1272 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1273 i, pages_in_pl, upl_size, kret, 0);
1274
1275 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1276 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1277 return(0);
1278 }
1279
1280 /*
1281 * Consider the possibility that upl_size wasn't satisfied.
1282 */
1283 if (upl_size != upl_needed_size)
1284 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1285
1286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1287 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1288
1289 if (io_size == 0)
1290 {
1291 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1292 UPL_ABORT_FREE_ON_EMPTY);
1293 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1294 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1295
1296 return(0);
1297 }
1298
1299 /*
1300 * Now look for pages already in the cache
1301 * and throw them away.
1302 */
1303
1304 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1305 max_io_size = io_size;
1306
1307 while (max_io_size) {
1308
1309 /*
1310 * Flag UPL_POP_DUMP says if the page is found
1311 * in the page cache it must be thrown away.
1312 */
1313 ubc_page_op(vp,
1314 upl_f_offset,
1315 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1316 0, 0);
1317 max_io_size -= PAGE_SIZE;
1318 upl_f_offset += PAGE_SIZE;
1319 }
1320
1321 /*
1322 * issue a synchronous write to cluster_io
1323 */
1324
1325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1326 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1327
1328 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1329 io_size, devblocksize, 0, (struct buf *)0);
1330
1331 if (error == 0) {
1332 /*
1333 * The cluster_io write completed successfully,
1334 * update the uio structure.
1335 */
1336 iov->iov_base += io_size;
1337 iov->iov_len -= io_size;
1338 uio->uio_resid -= io_size;
1339 uio->uio_offset += io_size;
1340 }
1341 /*
1342 * always 'commit' the I/O via the abort primitive whether the I/O
 1343 * succeeded cleanly or not... this is necessary to ensure that
1344 * we preserve the state of the DIRTY flag on the pages used to
1345 * provide the data for the I/O... the state of this flag SHOULD
1346 * NOT be changed by a write
1347 */
1348 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1349 UPL_ABORT_FREE_ON_EMPTY);
1350
1351
1352 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1353 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1354
1355 } /* end while */
1356
1357
1358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1359 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1360
1361 return (error);
1362 }
1363
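/*
 * cluster_phys_write:
 * write path for a vector whose target address is physically
 * contiguous.  The user buffer is wired with vm_map_get_upl and handed
 * to cluster_io as a single CL_DEV_MEMORY request, i.e. one 'giant'
 * page, then committed or aborted depending on the outcome.
 */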
1364 static int
1365 cluster_phys_write(vp, uio, newEOF)
1366 struct vnode *vp;
1367 struct uio *uio;
1368 off_t newEOF;
1369 {
1370 upl_t upl;
1371 vm_offset_t upl_offset;
1372 int io_size;
1373 int upl_size;
1374 int upl_needed_size;
1375 int pages_in_pl;
1376 int upl_flags;
1377 kern_return_t kret;
1378 struct iovec *iov;
1379 int error = 0;
1380
1381 /*
1382 * When we enter this routine, we know
1383 * -- the resid will not exceed iov_len
 1384 * -- the vector target address is physically contiguous
1385 */
1386 cluster_try_push(vp, newEOF, 0, 1);
1387
1388 iov = uio->uio_iov;
1389 io_size = iov->iov_len;
1390 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1391 upl_needed_size = upl_offset + io_size;
1392
1393 pages_in_pl = 0;
1394 upl_size = upl_needed_size;
1395 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1396 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1397
1398 kret = vm_map_get_upl(current_map(),
1399 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1400 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1401
1402 if (kret != KERN_SUCCESS)
1403 {
1404 /* cluster_phys_write: failed to get pagelist */
1405 /* note: return kret here */
1406 return(EINVAL);
1407 }
1408
1409 /*
1410 * Consider the possibility that upl_size wasn't satisfied.
1411 * This is a failure in the physical memory case.
1412 */
1413 if (upl_size < upl_needed_size)
1414 {
1415 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1416 return(EINVAL);
1417 }
1418
1419 /*
1420 * issue a synchronous write to cluster_io
1421 */
1422
1423 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1424 io_size, 0, CL_DEV_MEMORY, (struct buf *)0);
1425
1426 if (error == 0) {
1427 /*
1428 * The cluster_io write completed successfully,
1429 * update the uio structure and commit.
1430 */
1431
1432 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1433
1434 iov->iov_base += io_size;
1435 iov->iov_len -= io_size;
1436 uio->uio_resid -= io_size;
1437 uio->uio_offset += io_size;
1438 }
1439 else
1440 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1441
1442 return (error);
1443 }
1444
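/*
 * cluster_write_x:
 * buffered write path.  For each iteration a upl of up to
 * MAX_UPL_TRANSFER pages is created over the affected range of the
 * file, edge pages that are partially valid are pre-read, any requested
 * head/tail ranges are zero-filled, and the user data is copied in with
 * uiomove.  The dirty pages are then either gathered into the vnode's
 * delayed-write clusters (pushing existing clusters or falling back to
 * the VHASDIRTY mechanism when the cluster table is full) or, for
 * IO_SYNC and VNOCACHE_DATA callers, written out immediately through
 * cluster_io.
 */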
1445 static int
1446 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1447 struct vnode *vp;
1448 struct uio *uio;
1449 off_t oldEOF;
1450 off_t newEOF;
1451 off_t headOff;
1452 off_t tailOff;
1453 int devblocksize;
1454 int flags;
1455 {
1456 upl_page_info_t *pl;
1457 upl_t upl;
1458 vm_offset_t upl_offset;
1459 int upl_size;
1460 off_t upl_f_offset;
1461 int pages_in_upl;
1462 int start_offset;
1463 int xfer_resid;
1464 int io_size;
1465 int io_flags;
1466 vm_offset_t io_address;
1467 int io_offset;
1468 int bytes_to_zero;
1469 int bytes_to_move;
1470 kern_return_t kret;
1471 int retval = 0;
1472 int uio_resid;
1473 long long total_size;
1474 long long zero_cnt;
1475 off_t zero_off;
1476 long long zero_cnt1;
1477 off_t zero_off1;
1478 daddr_t start_blkno;
1479 daddr_t last_blkno;
1480
1481 if (uio) {
1482 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1483 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1484
1485 uio_resid = uio->uio_resid;
1486 } else {
1487 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1488 0, 0, (int)oldEOF, (int)newEOF, 0);
1489
1490 uio_resid = 0;
1491 }
1492 zero_cnt = 0;
1493 zero_cnt1 = 0;
1494
1495 if (flags & IO_HEADZEROFILL) {
1496 /*
1497 * some filesystems (HFS is one) don't support unallocated holes within a file...
1498 * so we zero fill the intervening space between the old EOF and the offset
1499 * where the next chunk of real data begins.... ftruncate will also use this
1500 * routine to zero fill to the new EOF when growing a file... in this case, the
1501 * uio structure will not be provided
1502 */
1503 if (uio) {
1504 if (headOff < uio->uio_offset) {
1505 zero_cnt = uio->uio_offset - headOff;
1506 zero_off = headOff;
1507 }
1508 } else if (headOff < newEOF) {
1509 zero_cnt = newEOF - headOff;
1510 zero_off = headOff;
1511 }
1512 }
1513 if (flags & IO_TAILZEROFILL) {
1514 if (uio) {
1515 zero_off1 = uio->uio_offset + uio->uio_resid;
1516
1517 if (zero_off1 < tailOff)
1518 zero_cnt1 = tailOff - zero_off1;
1519 }
1520 }
1521 if (zero_cnt == 0 && uio == (struct uio *) 0)
1522 {
1523 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1524 retval, 0, 0, 0, 0);
1525 return (0);
1526 }
1527
1528 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1529 /*
1530 * for this iteration of the loop, figure out where our starting point is
1531 */
1532 if (zero_cnt) {
1533 start_offset = (int)(zero_off & PAGE_MASK_64);
1534 upl_f_offset = zero_off - start_offset;
1535 } else if (uio_resid) {
1536 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1537 upl_f_offset = uio->uio_offset - start_offset;
1538 } else {
1539 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1540 upl_f_offset = zero_off1 - start_offset;
1541 }
1542 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1543 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1544
1545 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1546 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1547
1548 /*
1549 * compute the size of the upl needed to encompass
1550 * the requested write... limit each call to cluster_io
1551 * to the maximum UPL size... cluster_io will clip if
1552 * this exceeds the maximum io_size for the device,
1553 * make sure to account for
1554 * a starting offset that's not page aligned
1555 */
1556 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1557
1558 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1559 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1560
1561 pages_in_upl = upl_size / PAGE_SIZE;
1562 io_size = upl_size - start_offset;
1563
1564 if ((long long)io_size > total_size)
1565 io_size = total_size;
1566
1567 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1568 last_blkno = start_blkno + pages_in_upl;
1569
1570 kret = ubc_create_upl(vp,
1571 upl_f_offset,
1572 upl_size,
1573 &upl,
1574 &pl,
1575 UPL_FLAGS_NONE);
1576 if (kret != KERN_SUCCESS)
1577 panic("cluster_write: failed to get pagelist");
1578
1579 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1580 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1581
1582 if (start_offset && !upl_valid_page(pl, 0)) {
1583 int read_size;
1584
1585 /*
1586 * we're starting in the middle of the first page of the upl
1587 * and the page isn't currently valid, so we're going to have
1588 * to read it in first... this is a synchronous operation
1589 */
1590 read_size = PAGE_SIZE;
1591
1592 if ((upl_f_offset + read_size) > newEOF)
1593 read_size = newEOF - upl_f_offset;
1594
1595 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1596 CL_READ, (struct buf *)0);
1597 if (retval) {
1598 /*
1599 * we had an error during the read which causes us to abort
1600 * the current cluster_write request... before we do, we need
1601 * to release the rest of the pages in the upl without modifying
 1602 * their state and mark the failed page in error
1603 */
1604 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1605 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1606
1607 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1608 (int)upl, 0, 0, retval, 0);
1609 break;
1610 }
1611 }
1612 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1613 /*
1614 * the last offset we're writing to in this upl does not end on a page
1615 * boundary... if it's not beyond the old EOF, then we'll also need to
1616 * pre-read this page in if it isn't already valid
1617 */
1618 upl_offset = upl_size - PAGE_SIZE;
1619
1620 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1621 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1622 int read_size;
1623
1624 read_size = PAGE_SIZE;
1625
1626 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1627 read_size = newEOF - (upl_f_offset + upl_offset);
1628
1629 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1630 CL_READ, (struct buf *)0);
1631 if (retval) {
1632 /*
1633 * we had an error during the read which causes us to abort
1634 * the current cluster_write request... before we do, we
1635 * need to release the rest of the pages in the upl without
 1636 * modifying their state and mark the failed page in error
1637 */
1638 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1639 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1640
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1642 (int)upl, 0, 0, retval, 0);
1643 break;
1644 }
1645 }
1646 }
1647 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1648 panic("cluster_write: ubc_upl_map failed\n");
1649 xfer_resid = io_size;
1650 io_offset = start_offset;
1651
1652 while (zero_cnt && xfer_resid) {
1653
1654 if (zero_cnt < (long long)xfer_resid)
1655 bytes_to_zero = zero_cnt;
1656 else
1657 bytes_to_zero = xfer_resid;
1658
1659 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1660 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1661
1662 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1663 (int)upl_f_offset + io_offset, bytes_to_zero,
1664 (int)io_offset, xfer_resid, 0);
1665 } else {
1666 int zero_pg_index;
1667
1668 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1669 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1670
1671 if ( !upl_valid_page(pl, zero_pg_index)) {
1672 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1673
1674 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1675 (int)upl_f_offset + io_offset, bytes_to_zero,
1676 (int)io_offset, xfer_resid, 0);
1677
1678 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1679 !upl_dirty_page(pl, zero_pg_index)) {
1680 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1681
1682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1683 (int)upl_f_offset + io_offset, bytes_to_zero,
1684 (int)io_offset, xfer_resid, 0);
1685 }
1686 }
1687 xfer_resid -= bytes_to_zero;
1688 zero_cnt -= bytes_to_zero;
1689 zero_off += bytes_to_zero;
1690 io_offset += bytes_to_zero;
1691 }
1692 if (xfer_resid && uio_resid) {
1693 bytes_to_move = min(uio_resid, xfer_resid);
1694
1695 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1696 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1697
1698 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1699
1700
1701 if (retval) {
1702 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1703 panic("cluster_write: kernel_upl_unmap failed\n");
1704
1705 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1706
1707 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1708 (int)upl, 0, 0, retval, 0);
1709 } else {
1710 uio_resid -= bytes_to_move;
1711 xfer_resid -= bytes_to_move;
1712 io_offset += bytes_to_move;
1713 }
1714 }
1715 while (xfer_resid && zero_cnt1 && retval == 0) {
1716
1717 if (zero_cnt1 < (long long)xfer_resid)
1718 bytes_to_zero = zero_cnt1;
1719 else
1720 bytes_to_zero = xfer_resid;
1721
1722 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1723 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1724
1725 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1726 (int)upl_f_offset + io_offset,
1727 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1728 } else {
1729 int zero_pg_index;
1730
1731 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1732 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1733
1734 if ( !upl_valid_page(pl, zero_pg_index)) {
1735 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1738 (int)upl_f_offset + io_offset,
1739 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1740
1741 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1742 !upl_dirty_page(pl, zero_pg_index)) {
1743 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1744
1745 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1746 (int)upl_f_offset + io_offset,
1747 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1748 }
1749 }
1750 xfer_resid -= bytes_to_zero;
1751 zero_cnt1 -= bytes_to_zero;
1752 zero_off1 += bytes_to_zero;
1753 io_offset += bytes_to_zero;
1754 }
1755
1756 if (retval == 0) {
1757 int cl_index;
1758 int can_delay;
1759
1760 io_size += start_offset;
1761
1762 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1763 /*
1764 * if we're extending the file with this write
1765 * we'll zero fill the rest of the page so that
1766 * if the file gets extended again in such a way as to leave a
1767 * hole starting at this EOF, we'll have zero's in the correct spot
1768 */
1769 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1770
1771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1772 (int)upl_f_offset + io_size,
1773 upl_size - io_size, 0, 0, 0);
1774 }
1775 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1776 panic("cluster_write: kernel_upl_unmap failed\n");
1777
1778 if (flags & IO_SYNC)
1779 /*
 1780 * if the IO_SYNC flag is set then we need to
1781 * bypass any clusters and immediately issue
1782 * the I/O
1783 */
1784 goto issue_io;
1785
1786 if (vp->v_clen == 0)
1787 /*
1788 * no clusters currently present
1789 */
1790 goto start_new_cluster;
1791
1792 /*
1793 * keep track of the overall dirty page
1794 * range we've developed
1795 * in case we have to fall back to the
1796 * VHASDIRTY method of flushing
1797 */
1798 if (vp->v_flag & VHASDIRTY)
1799 goto delay_io;
1800
1801 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1802 /*
1803 * we have an existing cluster... see if this write will extend it nicely
1804 */
1805 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1806 /*
1807 * the current write starts at or after the current cluster
1808 */
1809 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1810 /*
1811 * we have a write that fits entirely
1812 * within the existing cluster limits
1813 */
1814 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1815 /*
1816 * update our idea of where the cluster ends
1817 */
1818 vp->v_clusters[cl_index].last_pg = last_blkno;
1819 break;
1820 }
1821 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1822 /*
1823 * we have a write that starts in the middle of the current cluster
1824 * but extends beyond the cluster's limit
1825 * we'll clip the current cluster if we actually
1826 * overlap with the new write
1827 * and start a new cluster with the current write
1828 */
1829 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1830 vp->v_clusters[cl_index].last_pg = start_blkno;
1831 }
1832 /*
1833 * we also get here for the case where the current write starts
1834 * beyond the limit of the existing cluster
1835 *
1836 * in either case, we'll check the remaining clusters before
1837 * starting a new one
1838 */
1839 } else {
1840 /*
1841 * the current write starts in front of the current cluster
1842 */
1843 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1844 /*
1845 * we can just merge the old cluster
1846 * with the new request and leave it
1847 * in the cache
1848 */
1849 vp->v_clusters[cl_index].start_pg = start_blkno;
1850
1851 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1852 /*
1853 * the current write completely
1854 * envelops the existing cluster
1855 */
1856 vp->v_clusters[cl_index].last_pg = last_blkno;
1857 }
1858 break;
1859 }
1860
1861 /*
1862 * if we were to combine this write with the current cluster
1863 * we would exceed the cluster size limit.... so,
1864 * let's see if there's any overlap of the new I/O with
1865 * the existing cluster...
1866 *
1867 */
1868 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1869 /*
1870 * the current write extends into the existing cluster
1871 * clip the current cluster by moving the start position
1872 * to where the current write ends
1873 */
1874 vp->v_clusters[cl_index].start_pg = last_blkno;
1875 /*
1876 * if we get here, there was no way to merge
1877 * the new I/O with this cluster and
1878 * keep it under our maximum cluster length
1879 * we'll check the remaining clusters before starting a new one
1880 */
1881 }
1882 }
1883 if (cl_index < vp->v_clen)
1884 /*
1885 * we found an existing cluster that we
 1886 * could merge this I/O into
1887 */
1888 goto delay_io;
1889
1890 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1891 /*
1892 * we didn't find an existing cluster to
1893 * merge into, but there's room to start
1894 * a new one
1895 */
1896 goto start_new_cluster;
1897
1898 /*
1899 * no existing cluster to merge with and no
1900 * room to start a new one... we'll try
1901 * pushing the existing ones... if none of
1902 * them are able to be pushed, we'll have
1903 * to fall back on the VHASDIRTY mechanism
1904 * cluster_try_push will set v_clen to the
1905 * number of remaining clusters if it is
1906 * unable to push all of them
1907 */
1908 if (vp->v_flag & VNOCACHE_DATA)
1909 can_delay = 0;
1910 else
1911 can_delay = 1;
1912
1913 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
1914 vp->v_flag |= VHASDIRTY;
1915 goto delay_io;
1916 }
1917 start_new_cluster:
1918 if (vp->v_clen == 0) {
1919 vp->v_ciosiz = devblocksize;
1920 vp->v_cstart = start_blkno;
1921 vp->v_lastw = last_blkno;
1922 }
1923 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
1924 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
1925 vp->v_clen++;
1926 delay_io:
1927 /*
1928 * make sure we keep v_cstart and v_lastw up to
1929 * date in case we have to fall back on the
1930 * VHASDIRTY mechanism (or we've already entered it)
1931 */
1932 if (start_blkno < vp->v_cstart)
1933 vp->v_cstart = start_blkno;
1934 if (last_blkno > vp->v_lastw)
1935 vp->v_lastw = last_blkno;
1936
1937 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1938 continue;
1939 issue_io:
1940 /*
1941 * in order to maintain some semblance of coherency with mapped writes
1942 * we need to write the cluster back out as a multiple of the PAGESIZE
1943 * unless the cluster encompasses the last page of the file... in this
1944 * case we'll round out to the nearest device block boundary
1945 */
1946 io_size = upl_size;
1947
1948 if ((upl_f_offset + io_size) > newEOF) {
1949 io_size = newEOF - upl_f_offset;
1950 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1951 }
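/*
 * the mask arithmetic above rounds io_size up to a devblocksize
 * multiple... e.g. (purely illustrative numbers) with a devblocksize
 * of 512, an io_size of 1000 becomes (1000 + 511) & ~511 == 1024
 */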
1952
1953 if (flags & IO_SYNC)
1954 io_flags = CL_COMMIT | CL_AGE;
1955 else
1956 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1957
1958 if (vp->v_flag & VNOCACHE_DATA)
1959 io_flags |= CL_DUMP;
1960
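/*
 * simple write throttle... if this vnode already has ASYNC_THROTTLE
 * or more async writes in flight, sleep on &vp->v_numoutput until an
 * outstanding write completes and issues the corresponding wakeup
 * (the completion side is not shown in this excerpt)
 */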
1961 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1962 vp->v_flag |= VTHROTTLED;
1963 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1964 }
1965 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
1966 io_flags, (struct buf *)0);
1967 }
1968 }
1969 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1970 retval, 0, 0, 0, 0);
1971
1972 return (retval);
1973 }
1974
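/*
 * cluster_read is the top level entry point for a clustered read...
 * it routes each iovec to one of three paths:
 *	- cluster_phys_read    if the target buffer is physically contiguous
 *	- cluster_nocopy_read  for page aligned, uncached (VNOCACHE_DATA)
 *	                       user space reads of at least 4 pages
 *	- cluster_read_x       for everything else (the normal cached path)
 *
 * a minimal usage sketch, assuming a filesystem's read vnop (the names
 * fp and devBlockSize are illustrative, not taken from this file):
 *
 *	error = cluster_read(vp, uio, (off_t)fp->ff_size, devBlockSize, 0);
 */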
1975 int
1976 cluster_read(vp, uio, filesize, devblocksize, flags)
1977 struct vnode *vp;
1978 struct uio *uio;
1979 off_t filesize;
1980 int devblocksize;
1981 int flags;
1982 {
1983 int prev_resid;
1984 int clip_size;
1985 off_t max_io_size;
1986 struct iovec *iov;
1987 vm_offset_t upl_offset;
1988 int upl_size;
1989 int pages_in_pl;
1990 upl_page_info_t *pl;
1991 int upl_flags;
1992 upl_t upl;
1993 int retval = 0;
1994
1995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1996 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1997
1998 /*
1999 * the nocopy read path is only considered for uncached
2000 * (VNOCACHE_DATA) reads issued from user space buffers
2001 */
2002
2003 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2004 {
2005 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2006 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2007 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2008 return(retval);
2009 }
2010
2011 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2012 {
2013 /* we know we have a resid, so this is safe */
2014 iov = uio->uio_iov;
2015 while (iov->iov_len == 0) {
2016 uio->uio_iov++;
2017 uio->uio_iovcnt--;
2018 iov = uio->uio_iov;
2019 }
2020
2021 /*
2022 * We check every vector target and if it is physically
2023 * contiguous space, we skip the sanity checks.
2024 */
2025
2026 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2027 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2028 pages_in_pl = 0;
2029 upl_flags = UPL_QUERY_OBJECT_TYPE;
2030 if((vm_map_get_upl(current_map(),
2031 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2032 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2033 {
2034 /*
2035 * the user app must have passed in an invalid address
2036 */
2037 return (EFAULT);
2038 }
2039
2040 if (upl_flags & UPL_PHYS_CONTIG)
2041 {
2042 retval = cluster_phys_read(vp, uio, filesize);
2043 }
2044 else if (uio->uio_resid < 4 * PAGE_SIZE)
2045 {
2046 /*
2047 * We set a threshold of 4 pages to decide if the nocopy
2048 * read loop is worth the trouble...
2049 */
2050 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2051 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2052 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2053 return(retval);
2054 }
2055 else if (uio->uio_offset & PAGE_MASK_64)
2056 {
2057 /* Bring the file offset read up to a pagesize boundary */
2058 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2059 if (uio->uio_resid < clip_size)
2060 clip_size = uio->uio_resid;
2061 /*
2062 * Fake the resid going into the cluster_read_x call
2063 * and restore it on the way out.
2064 */
2065 prev_resid = uio->uio_resid;
2066 uio->uio_resid = clip_size;
2067 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2068 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2069 }
2070 else if ((int)iov->iov_base & PAGE_MASK_64)
2071 {
2072 clip_size = iov->iov_len;
2073 prev_resid = uio->uio_resid;
2074 uio->uio_resid = clip_size;
2075 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2076 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2077 }
2078 else
2079 {
2080 /*
2081 * If we come in here, we know the offset into
2082 * the file is on a pagesize boundary
2083 */
2084
2085 max_io_size = filesize - uio->uio_offset;
2086 clip_size = uio->uio_resid;
2087 if (iov->iov_len < clip_size)
2088 clip_size = iov->iov_len;
2089 if (max_io_size < clip_size)
2090 clip_size = (int)max_io_size;
2091
2092 if (clip_size < PAGE_SIZE)
2093 {
2094 /*
2095 * Take care of the tail end of the read in this vector.
2096 */
2097 prev_resid = uio->uio_resid;
2098 uio->uio_resid = clip_size;
2099 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2100 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2101 }
2102 else
2103 {
2104 /* round clip_size down to a multiple of pagesize */
2105 clip_size = clip_size & ~(PAGE_MASK);
2106 prev_resid = uio->uio_resid;
2107 uio->uio_resid = clip_size;
2108 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2109 if ((retval==0) && uio->uio_resid)
2110 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2111 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2112 }
2113 } /* end else */
2114 } /* end while */
2115
2116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2117 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2118
2119 return(retval);
2120 }
2121
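/*
 * cluster_read_x handles the normal, cached read path... in outline
 * (descriptive summary only, the code below is authoritative):
 *
 *	1. if the data is already resident, copy it straight to user
 *	   space with ubc_page_op/uiomove and consider a read-ahead
 *	2. otherwise create a upl covering the request, find the run
 *	   of non-valid pages and fill it with a synchronous cluster_io
 *	3. copy the upl contents out to the user buffer, issue any
 *	   prefetch/read-ahead, then commit or abort the upl pages
 */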
2122 static int
2123 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2124 struct vnode *vp;
2125 struct uio *uio;
2126 off_t filesize;
2127 int devblocksize;
2128 int flags;
2129 {
2130 upl_page_info_t *pl;
2131 upl_t upl;
2132 vm_offset_t upl_offset;
2133 int upl_size;
2134 off_t upl_f_offset;
2135 int start_offset;
2136 int start_pg;
2137 int last_pg;
2138 int uio_last;
2139 int pages_in_upl;
2140 off_t max_size;
2141 int io_size;
2142 vm_offset_t io_address;
2143 kern_return_t kret;
2144 int segflg;
2145 int error = 0;
2146 int retval = 0;
2147 int b_lblkno;
2148 int e_lblkno;
2149
2150 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2151
2152 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2153 /*
2154 * compute the size of the upl needed to encompass
2155 * the requested read... limit each call to cluster_io
2156 * to the maximum UPL size... cluster_io will clip if
2157 * this exceeds the maximum io_size for the device...
2158 * also make sure to account for a starting offset
2159 * that's not page aligned
2160 */
2161 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2162 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2163 max_size = filesize - uio->uio_offset;
2164
2165 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2166 io_size = uio->uio_resid;
2167 else
2168 io_size = max_size;
2169
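/*
 * fast path for cached user space reads... as long as the pages are
 * already resident we grab each one with ubc_page_op (marking it
 * busy), uiomove the data directly from its physical address, and
 * then release it, without ever building a upl
 */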
2170 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2171 segflg = uio->uio_segflg;
2172
2173 uio->uio_segflg = UIO_PHYS_USERSPACE;
2174
2175 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2176 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2177
2178 while (io_size && retval == 0) {
2179 int xsize;
2180 vm_offset_t paddr;
2181
2182 if (ubc_page_op(vp,
2183 upl_f_offset,
2184 UPL_POP_SET | UPL_POP_BUSY,
2185 &paddr, 0) != KERN_SUCCESS)
2186 break;
2187
2188 xsize = PAGE_SIZE - start_offset;
2189
2190 if (xsize > io_size)
2191 xsize = io_size;
2192
2193 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2194
2195 ubc_page_op(vp, upl_f_offset,
2196 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2197
2198 io_size -= xsize;
2199 start_offset = (int)
2200 (uio->uio_offset & PAGE_MASK_64);
2201 upl_f_offset = uio->uio_offset - start_offset;
2202 }
2203 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2204 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2205
2206 uio->uio_segflg = segflg;
2207
2208 if (retval)
2209 break;
2210
2211 if (io_size == 0) {
2212 /*
2213 * we're already finished with this read request
2214 * let's see if we should do a read-ahead
2215 */
2216 e_lblkno = (int)
2217 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2218
2219 if (!(vp->v_flag & VRAOFF))
2220 /*
2221 * let's try to read ahead if we're in
2222 * a sequential access pattern
2223 */
2224 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2225 vp->v_lastr = e_lblkno;
2226
2227 break;
2228 }
2229 max_size = filesize - uio->uio_offset;
2230 }
2231 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2232 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2233 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2234 pages_in_upl = upl_size / PAGE_SIZE;
2235
2236 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2237 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2238
2239 kret = ubc_create_upl(vp,
2240 upl_f_offset,
2241 upl_size,
2242 &upl,
2243 &pl,
2244 UPL_FLAGS_NONE);
2245 if (kret != KERN_SUCCESS)
2246 panic("cluster_read: failed to get pagelist");
2247
2248 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2249 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2250
2251 /*
2252 * scan from the beginning of the upl looking for the first
2253 * non-valid page.... this will become the first page in
2254 * the request we're going to make to 'cluster_io'... if all
2255 * of the pages are valid, we won't call through to 'cluster_io'
2256 */
2257 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2258 if (!upl_valid_page(pl, start_pg))
2259 break;
2260 }
2261
2262 /*
2263 * scan from the starting invalid page looking for a valid
2264 * page before the end of the upl is reached, if we
2265 * find one, then it will be the last page of the request to
2266 * 'cluster_io'
2267 */
2268 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2269 if (upl_valid_page(pl, last_pg))
2270 break;
2271 }
2272
2273 if (start_pg < last_pg) {
2274 /*
2275 * we found a range of 'invalid' pages that must be filled
2276 * if the last page in this range is the last page of the file
2277 * we may have to clip the size of it to keep from reading past
2278 * the end of the last physical block associated with the file
2279 */
2280 upl_offset = start_pg * PAGE_SIZE;
2281 io_size = (last_pg - start_pg) * PAGE_SIZE;
2282
2283 if ((upl_f_offset + upl_offset + io_size) > filesize)
2284 io_size = filesize - (upl_f_offset + upl_offset);
2285
2286 /*
2287 * issue a synchronous read to cluster_io
2288 */
2289
2290 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2291 io_size, devblocksize, CL_READ, (struct buf *)0);
2292 }
2293 if (error == 0) {
2294 /*
2295 * if the read completed successfully, or there was no I/O request
2296 * issued, then map the upl into kernel address space and
2297 * move the data into user land.... we'll first add on any 'valid'
2298 * pages that were present in the upl when we acquired it.
2299 */
2300 u_int val_size;
2301 u_int size_of_prefetch;
2302
2303 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2304 if (!upl_valid_page(pl, uio_last))
2305 break;
2306 }
2307 /*
2308 * compute size to transfer this round, if uio->uio_resid is
2309 * still non-zero after this uiomove, we'll loop around and
2310 * set up for another I/O.
2311 */
2312 val_size = (uio_last * PAGE_SIZE) - start_offset;
2313
2314 if (max_size < val_size)
2315 val_size = max_size;
2316
2317 if (uio->uio_resid < val_size)
2318 val_size = uio->uio_resid;
2319
2320 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2321
2322 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2323 /*
2324 * if there's still I/O left to do for this request, then issue a
2325 * pre-fetch I/O... the I/O wait time will overlap
2326 * with the copying of the data
2327 */
2328 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2329 } else {
2330 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2331 /*
2332 * let's try to read ahead if we're in
2333 * a sequential access pattern
2334 */
2335 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2336 vp->v_lastr = e_lblkno;
2337 }
2338 if (uio->uio_segflg == UIO_USERSPACE) {
2339 int offset;
2340
2341 segflg = uio->uio_segflg;
2342
2343 uio->uio_segflg = UIO_PHYS_USERSPACE;
2344
2345
2346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2347 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2348
2349 offset = start_offset;
2350
2351 while (val_size && retval == 0) {
2352 int csize;
2353 int i;
2354 caddr_t paddr;
2355
2356 i = offset / PAGE_SIZE;
2357 csize = min(PAGE_SIZE - start_offset, val_size);
2358
2359 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2360
2361 retval = uiomove(paddr, csize, uio);
2362
2363 val_size -= csize;
2364 offset += csize;
2365 start_offset = offset & PAGE_MASK;
2366 }
2367 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2368 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2369
2370 uio->uio_segflg = segflg;
2371 }
2372 else
2373 {
2374 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2375 panic("cluster_read: ubc_upl_map() failed\n");
2376
2377 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2378
2379 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2380 panic("cluster_read: ubc_upl_unmap() failed\n");
2381 }
2382 }
2383 if (start_pg < last_pg) {
2384 /*
2385 * compute the range of pages that we actually issued an I/O for
2386 * and either commit them as valid if the I/O succeeded
2387 * or abort them if the I/O failed
2388 */
2389 io_size = (last_pg - start_pg) * PAGE_SIZE;
2390
2391 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2392 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2393
2394 if (error || (vp->v_flag & VNOCACHE_DATA))
2395 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2396 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2397 else
2398 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2399 UPL_COMMIT_CLEAR_DIRTY
2400 | UPL_COMMIT_FREE_ON_EMPTY
2401 | UPL_COMMIT_INACTIVATE);
2402
2403 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2404 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2405 }
2406 if ((last_pg - start_pg) < pages_in_upl) {
2407 int cur_pg;
2408 int commit_flags;
2409
2410 /*
2411 * the set of pages that we issued an I/O for did not encompass
2412 * the entire upl... so just release these without modifying
2413 * their state
2414 */
2415 if (error)
2416 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2417 else {
2418 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2419 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2420
2421 if (start_pg) {
2422 /*
2423 * we found some already valid pages at the beginning of
2424 * the upl... commit these back to the inactive list with
2425 * reference cleared
2426 */
2427 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2428 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2429 | UPL_COMMIT_INACTIVATE;
2430
2431 if (upl_dirty_page(pl, cur_pg))
2432 commit_flags |= UPL_COMMIT_SET_DIRTY;
2433
2434 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2435 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2436 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2437 else
2438 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2439 PAGE_SIZE, commit_flags);
2440 }
2441 }
2442 if (last_pg < uio_last) {
2443 /*
2444 * we found some already valid pages immediately after the
2445 * pages we issued I/O for, commit these back to the
2446 * inactive list with reference cleared
2447 */
2448 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2449 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2450 | UPL_COMMIT_INACTIVATE;
2451
2452 if (upl_dirty_page(pl, cur_pg))
2453 commit_flags |= UPL_COMMIT_SET_DIRTY;
2454
2455 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2456 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2457 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2458 else
2459 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2460 PAGE_SIZE, commit_flags);
2461 }
2462 }
2463 if (uio_last < pages_in_upl) {
2464 /*
2465 * there were some invalid pages beyond the valid pages
2466 * that we didn't issue an I/O for, just release them
2467 * unchanged
2468 */
2469 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2470 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2471 }
2472
2473 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2474 (int)upl, -1, -1, 0, 0);
2475 }
2476 }
2477 if (retval == 0)
2478 retval = error;
2479 }
2480
2481 return (retval);
2482 }
2483
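/*
 * cluster_nocopy_read reads directly into the user's buffer, avoiding
 * the copy through the page cache... rough outline (descriptive only):
 *
 *	1. copy out any pages that are already in the cache
 *	2. size an I/O over the following run of absent pages
 *	3. wire the user buffer with vm_map_get_upl (retrying with a
 *	   stronger force_data_sync up to 3 times)
 *	4. issue a synchronous cluster_io with CL_READ | CL_NOZERO and
 *	   commit or abort the upl based on the result
 */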
2484 static int
2485 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2486 struct vnode *vp;
2487 struct uio *uio;
2488 off_t filesize;
2489 int devblocksize;
2490 int flags;
2491 {
2492 upl_t upl;
2493 upl_page_info_t *pl;
2494 off_t upl_f_offset;
2495 vm_offset_t upl_offset;
2496 off_t start_upl_f_offset;
2497 off_t max_io_size;
2498 int io_size;
2499 int upl_size;
2500 int upl_needed_size;
2501 int pages_in_pl;
2502 vm_offset_t paddr;
2503 int upl_flags;
2504 kern_return_t kret;
2505 int segflg;
2506 struct iovec *iov;
2507 int i;
2508 int force_data_sync;
2509 int error = 0;
2510 int retval = 0;
2511
2512 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2513 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2514
2515 /*
2516 * When we enter this routine, we know
2517 * -- the offset into the file is on a pagesize boundary
2518 * -- the resid is a page multiple
2519 * -- the resid will not exceed iov_len
2520 */
2521
2522 iov = uio->uio_iov;
2523 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2524
2525 max_io_size = filesize - uio->uio_offset;
2526
2527 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2528 io_size = max_io_size;
2529 else
2530 io_size = uio->uio_resid;
2531
2532 /*
2533 * We don't come into this routine unless
2534 * UIO_USERSPACE is set.
2535 */
2536 segflg = uio->uio_segflg;
2537
2538 uio->uio_segflg = UIO_PHYS_USERSPACE;
2539
2540 /*
2541 * First look for pages already in the cache
2542 * and move them to user space.
2543 */
2544 while (io_size && (retval == 0)) {
2545 upl_f_offset = uio->uio_offset;
2546
2547 /*
2548 * If this call fails, it means the page is not
2549 * in the page cache.
2550 */
2551 if (ubc_page_op(vp, upl_f_offset,
2552 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2553 break;
2554
2555 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2556
2557 ubc_page_op(vp, upl_f_offset,
2558 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2559
2560 io_size -= PAGE_SIZE;
2561 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2562 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2563 }
2564
2565 uio->uio_segflg = segflg;
2566
2567 if (retval)
2568 {
2569 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2570 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2571 return(retval);
2572 }
2573
2574 /* If we are already finished with this read, then return */
2575 if (io_size == 0)
2576 {
2577
2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2579 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2580 return(0);
2581 }
2582
2583 max_io_size = io_size;
2584 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2585 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2586
2587 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2588 upl_f_offset = start_upl_f_offset;
2589 io_size = 0;
2590
2591 while(io_size < max_io_size)
2592 {
2593
2594 if(ubc_page_op(vp, upl_f_offset,
2595 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2596 {
2597 ubc_page_op(vp, upl_f_offset,
2598 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2599 break;
2600 }
2601
2602 /*
2603 * Build up the io request parameters.
2604 */
2605
2606 io_size += PAGE_SIZE;
2607 upl_f_offset += PAGE_SIZE;
2608 }
2609
2610 if (io_size == 0)
2611 return(retval);
2612
2613 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2614 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2615
2616 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2617 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2618
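/*
 * wire down the user buffer pages for the upcoming I/O... if any of
 * the returned pages are not valid, retry with a progressively larger
 * force_data_sync value (up to 3 attempts) before giving up
 */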
2619 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2620 {
2621 pages_in_pl = 0;
2622 upl_size = upl_needed_size;
2623 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2624
2625 kret = vm_map_get_upl(current_map(),
2626 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2627 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2628
2629 if (kret != KERN_SUCCESS)
2630 {
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2632 (int)upl_offset, upl_size, io_size, kret, 0);
2633
2634 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2635 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2636
2637 /* cluster_nocopy_read: failed to get pagelist */
2638 /* do not return kret here */
2639 return(retval);
2640 }
2641
2642 pages_in_pl = upl_size / PAGE_SIZE;
2643 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2644
2645 for(i=0; i < pages_in_pl; i++)
2646 {
2647 if (!upl_valid_page(pl, i))
2648 break;
2649 }
2650 if (i == pages_in_pl)
2651 break;
2652
2653 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2654 UPL_ABORT_FREE_ON_EMPTY);
2655 }
2656
2657 if (force_data_sync >= 3)
2658 {
2659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2660 (int)upl_offset, upl_size, io_size, kret, 0);
2661
2662 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2663 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2664 return(retval);
2665 }
2666 /*
2667 * Consider the possibility that upl_size wasn't satisfied.
2668 */
2669 if (upl_size != upl_needed_size)
2670 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2671
2672 if (io_size == 0)
2673 {
2674 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2675 UPL_ABORT_FREE_ON_EMPTY);
2676 return(retval);
2677 }
2678
2679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2680 (int)upl_offset, upl_size, io_size, kret, 0);
2681
2682 /*
2683 * issue a synchronous read to cluster_io
2684 */
2685
2686 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2687 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2688
2689 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2690 io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
2691
2692 if (error == 0) {
2693 /*
2694 * The cluster_io read completed successfully,
2695 * update the uio structure and commit.
2696 */
2697
2698 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2699 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2700
2701 iov->iov_base += io_size;
2702 iov->iov_len -= io_size;
2703 uio->uio_resid -= io_size;
2704 uio->uio_offset += io_size;
2705 }
2706 else {
2707 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2708 UPL_ABORT_FREE_ON_EMPTY);
2709 }
2710
2711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2712 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2713
2714 if (retval == 0)
2715 retval = error;
2716
2717 } /* end while */
2718
2719
2720 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2721 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2722
2723 return (retval);
2724 }
2725
2726
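/*
 * cluster_phys_read handles iovec targets that cluster_read found to
 * be physically contiguous (UPL_PHYS_CONTIG)... the whole request is
 * wired with a single vm_map_get_upl call and handed to cluster_io
 * as one CL_READ | CL_NOZERO | CL_DEV_MEMORY transfer
 */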
2727 static int
2728 cluster_phys_read(vp, uio, filesize)
2729 struct vnode *vp;
2730 struct uio *uio;
2731 off_t filesize;
2732 {
2733 upl_t upl;
2734 vm_offset_t upl_offset;
2735 off_t max_size;
2736 int io_size;
2737 int upl_size;
2738 int upl_needed_size;
2739 int pages_in_pl;
2740 int upl_flags;
2741 kern_return_t kret;
2742 struct iovec *iov;
2743 int error;
2744
2745 /*
2746 * When we enter this routine, we know
2747 * -- the resid will not exceed iov_len
2748 * -- the target address is physically contiguous
2749 */
2750
2751 iov = uio->uio_iov;
2752
2753 max_size = filesize - uio->uio_offset;
2754
2755 if (max_size < (off_t)((unsigned int)iov->iov_len))
2756 io_size = max_size;
2757 else
2758 io_size = iov->iov_len;
2759
2760 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2761 upl_needed_size = upl_offset + io_size;
2762
2763 pages_in_pl = 0;
2764 upl_size = upl_needed_size;
2765 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2766
2767 kret = vm_map_get_upl(current_map(),
2768 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2769 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2770
2771 if (kret != KERN_SUCCESS)
2772 {
2773 /* cluster_phys_read: failed to get pagelist */
2774 return(EINVAL);
2775 }
2776
2777 /*
2778 * Consider the possibility that upl_size wasn't satisfied.
2779 */
2780 if (upl_size < upl_needed_size)
2781 {
2782 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2783 return(EINVAL);
2784 }
2785
2786 /*
2787 * issue a synchronous read to cluster_io
2788 */
2789
2790 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2791 io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2792
2793 if (error == 0)
2794 {
2795 /*
2796 * The cluster_io read completed successfully,
2797 * update the uio structure and commit.
2798 */
2799
2800 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2801
2802 iov->iov_base += io_size;
2803 iov->iov_len -= io_size;
2804 uio->uio_resid -= io_size;
2805 uio->uio_offset += io_size;
2806 }
2807 else
2808 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2809
2810 return (error);
2811 }
2812
2813 /*
2814 * generate advisory I/O's in the largest chunks possible
2815 * the completed pages will be released into the VM cache
2816 */
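/*
 * a minimal usage sketch (illustrative only... the caller and its
 * variables are assumptions, not taken from this file): a filesystem
 * that wants to warm the cache ahead of an expected sequential read
 * might issue
 *
 *	(void) advisory_read(vp, filesize, f_offset, resid, devBlockSize);
 */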
2817 int
2818 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2819 struct vnode *vp;
2820 off_t filesize;
2821 off_t f_offset;
2822 int resid;
2823 int devblocksize;
2824 {
2825 upl_page_info_t *pl;
2826 upl_t upl;
2827 vm_offset_t upl_offset;
2828 int upl_size;
2829 off_t upl_f_offset;
2830 int start_offset;
2831 int start_pg;
2832 int last_pg;
2833 int pages_in_upl;
2834 off_t max_size;
2835 int io_size;
2836 kern_return_t kret;
2837 int retval = 0;
2838 int issued_io;
2839
2840 if (!UBCINFOEXISTS(vp))
2841 return(EINVAL);
2842
2843 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2844 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2845
2846 while (resid && f_offset < filesize && retval == 0) {
2847 /*
2848 * compute the size of the upl needed to encompass
2849 * the requested read... limit each call to cluster_io
2850 * to the maximum UPL size... cluster_io will clip if
2851 * this exceeds the maximum io_size for the device...
2852 * also make sure to account for a starting offset
2853 * that's not page aligned
2854 */
2855 start_offset = (int)(f_offset & PAGE_MASK_64);
2856 upl_f_offset = f_offset - (off_t)start_offset;
2857 max_size = filesize - f_offset;
2858
2859 if (resid < max_size)
2860 io_size = resid;
2861 else
2862 io_size = max_size;
2863
2864 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2865 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2866 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2867 pages_in_upl = upl_size / PAGE_SIZE;
2868
2869 kret = ubc_create_upl(vp,
2870 upl_f_offset,
2871 upl_size,
2872 &upl,
2873 &pl,
2874 UPL_RET_ONLY_ABSENT);
2875 if (kret != KERN_SUCCESS)
2876 return(retval);
2877 issued_io = 0;
2878
2879 /*
2880 * before we start marching forward, we must make sure we end on
2881 * a present page, otherwise we will be working with a freed
2882 * upl
2883 */
2884 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
2885 if (upl_page_present(pl, last_pg))
2886 break;
2887 }
2888 pages_in_upl = last_pg + 1;
2889
2890
2891 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2892 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2893
2894
2895 for (last_pg = 0; last_pg < pages_in_upl; ) {
2896 /*
2897 * scan from the beginning of the upl looking for the first
2898 * page that is present.... this will become the first page in
2899 * the request we're going to make to 'cluster_io'... if all
2900 * of the pages are absent, we won't call through to 'cluster_io'
2901 */
2902 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2903 if (upl_page_present(pl, start_pg))
2904 break;
2905 }
2906
2907 /*
2908 * scan from the starting present page looking for an absent
2909 * page before the end of the upl is reached, if we
2910 * find one, then it will terminate the range of pages being
2911 * presented to 'cluster_io'
2912 */
2913 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2914 if (!upl_page_present(pl, last_pg))
2915 break;
2916 }
2917
2918 if (last_pg > start_pg) {
2919 /*
2920 * we found a range of pages that must be filled
2921 * if the last page in this range is the last page of the file
2922 * we may have to clip the size of it to keep from reading past
2923 * the end of the last physical block associated with the file
2924 */
2925 upl_offset = start_pg * PAGE_SIZE;
2926 io_size = (last_pg - start_pg) * PAGE_SIZE;
2927
2928 if ((upl_f_offset + upl_offset + io_size) > filesize)
2929 io_size = filesize - (upl_f_offset + upl_offset);
2930
2931 /*
2932 * issue an asynchronous read to cluster_io
2933 */
2934 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
2935 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2936
2937 issued_io = 1;
2938 }
2939 }
2940 if (issued_io == 0)
2941 ubc_upl_abort(upl, 0);
2942
2943 io_size = upl_size - start_offset;
2944
2945 if (io_size > resid)
2946 io_size = resid;
2947 f_offset += io_size;
2948 resid -= io_size;
2949 }
2950
2951 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2952 (int)f_offset, resid, retval, 0, 0);
2953
2954 return(retval);
2955 }
2956
2957
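/*
 * cluster_push flushes the delayed-write state held on the vnode...
 * if the vnode has fallen back to the VHASDIRTY mechanism the whole
 * dirty range (v_cstart .. v_lastw) is swept out in MAX_UPL_TRANSFER
 * sized chunks... otherwise cluster_try_push is asked to push every
 * recorded cluster... returns 0 when there is nothing to push
 */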
2958 int
2959 cluster_push(vp)
2960 struct vnode *vp;
2961 {
2962 int retval;
2963
2964 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
2965 vp->v_flag &= ~VHASDIRTY;
2966 return(0);
2967 }
2968
2969 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
2970 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
2971
2972 if (vp->v_flag & VHASDIRTY) {
2973 daddr_t start_pg;
2974 daddr_t last_pg;
2975 daddr_t end_pg;
2976
2977 start_pg = vp->v_cstart;
2978 end_pg = vp->v_lastw;
2979
2980 vp->v_flag &= ~VHASDIRTY;
2981 vp->v_clen = 0;
2982
2983 while (start_pg < end_pg) {
2984 last_pg = start_pg + MAX_UPL_TRANSFER;
2985
2986 if (last_pg > end_pg)
2987 last_pg = end_pg;
2988
2989 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
2990
2991 start_pg = last_pg;
2992 }
2993 return (1);
2994 }
2995 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
2996
2997 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
2998 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
2999
3000 return (retval);
3001 }
3002
3003
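/*
 * cluster_try_push makes a local copy of the vnode's clusters, sorted
 * by starting page, clears v_clen so new clusters can be built while
 * we work, and then pushes each cluster via cluster_push_x... any
 * clusters that could not be pushed are merged back into the vnode
 * (falling back to VHASDIRTY if they no longer fit)... the return
 * value is the number of free cluster slots, i.e. MAX_CLUSTERS - v_clen
 */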
3004 static int
3005 cluster_try_push(vp, EOF, can_delay, push_all)
3006 struct vnode *vp;
3007 off_t EOF;
3008 int can_delay;
3009 int push_all;
3010 {
3011 int cl_index;
3012 int cl_index1;
3013 int min_index;
3014 int cl_len;
3015 int cl_total;
3016 int cl_pushed;
3017 struct v_cluster l_clusters[MAX_CLUSTERS];
3018
3019 /*
3020 * make a local 'sorted' copy of the clusters
3021 * and clear vp->v_clen so that new clusters can
3022 * be developed
3023 */
3024 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3025 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3026 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3027 continue;
3028 if (min_index == -1)
3029 min_index = cl_index1;
3030 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3031 min_index = cl_index1;
3032 }
3033 if (min_index == -1)
3034 break;
3035 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3036 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3037
3038 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3039 }
3040 cl_len = cl_index;
3041 vp->v_clen = 0;
3042
3043 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3044 /*
3045 * try to push each cluster in turn... cluster_push_x may not
3046 * push the cluster if can_delay is TRUE and the cluster doesn't
3047 * meet the criteria for an immediate push
3048 */
3049 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3050 l_clusters[cl_index].start_pg = 0;
3051 l_clusters[cl_index].last_pg = 0;
3052
3053 cl_pushed++;
3054
3055 if (push_all == 0)
3056 break;
3057 }
3058 }
3059 if (cl_len > cl_pushed) {
3060 /*
3061 * we didn't push all of the clusters, so
3062 * lets try to merge them back in to the vnode
3063 */
3064 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3065 /*
3066 * we picked up some new clusters while we were trying to
3067 * push the old ones (I don't think this can happen because
3068 * I'm holding the lock, but just in case)... the sum of the
3069 * leftovers plus the new cluster count exceeds our ability
3070 * to represent them, so fall back to the VHASDIRTY mechanism
3071 */
3072 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3073 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3074 continue;
3075
3076 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3077 vp->v_cstart = l_clusters[cl_index].start_pg;
3078 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3079 vp->v_lastw = l_clusters[cl_index].last_pg;
3080 }
3081 vp->v_flag |= VHASDIRTY;
3082 } else {
3083 /*
3084 * we've got room to merge the leftovers back in
3085 * just append them starting at the next 'hole'
3086 * represented by vp->v_clen
3087 */
3088 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3089 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3090 continue;
3091
3092 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3093 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3094
3095 if (cl_index1 == 0) {
3096 vp->v_cstart = l_clusters[cl_index].start_pg;
3097 vp->v_lastw = l_clusters[cl_index].last_pg;
3098 } else {
3099 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3100 vp->v_cstart = l_clusters[cl_index].start_pg;
3101 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3102 vp->v_lastw = l_clusters[cl_index].last_pg;
3103 }
3104 cl_index1++;
3105 }
3106 /*
3107 * update the cluster count
3108 */
3109 vp->v_clen = cl_index1;
3110 }
3111 }
3112 return(MAX_CLUSTERS - vp->v_clen);
3113 }
3114
3115
3116
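/*
 * cluster_push_x pushes a single cluster of dirty pages, described by
 * the page range [first, last), out through cluster_io... if can_delay
 * is set and the cluster is still small (or not dirty enough) the push
 * is declined and 0 is returned... otherwise the dirty runs in the upl
 * are written asynchronously and 1 is returned
 */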
3117 static int
3118 cluster_push_x(vp, EOF, first, last, can_delay)
3119 struct vnode *vp;
3120 off_t EOF;
3121 daddr_t first;
3122 daddr_t last;
3123 int can_delay;
3124 {
3125 upl_page_info_t *pl;
3126 upl_t upl;
3127 vm_offset_t upl_offset;
3128 int upl_size;
3129 off_t upl_f_offset;
3130 int pages_in_upl;
3131 int start_pg;
3132 int last_pg;
3133 int io_size;
3134 int io_flags;
3135 int size;
3136 kern_return_t kret;
3137
3138
3139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3140 vp->v_clen, first, last, EOF, 0);
3141
3142 if ((pages_in_upl = last - first) == 0) {
3143 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3144
3145 return (1);
3146 }
3147 upl_size = pages_in_upl * PAGE_SIZE;
3148 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3149
3150 if (upl_f_offset + upl_size >= EOF) {
3151
3152 if (upl_f_offset >= EOF) {
3153 /*
3154 * must have truncated the file and missed
3155 * clearing a dangling cluster (i.e. it's completely
3156 * beyond the new EOF)
3157 */
3158 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3159
3160 return(1);
3161 }
3162 size = EOF - upl_f_offset;
3163
3164 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3165 pages_in_upl = upl_size / PAGE_SIZE;
3166 } else {
3167 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3168 return(0);
3169 size = upl_size;
3170 }
3171 kret = ubc_create_upl(vp,
3172 upl_f_offset,
3173 upl_size,
3174 &upl,
3175 &pl,
3176 UPL_RET_ONLY_DIRTY);
3177 if (kret != KERN_SUCCESS)
3178 panic("cluster_push: failed to get pagelist");
3179
3180 if (can_delay) {
3181 int num_of_dirty;
3182
3183 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3184 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3185 num_of_dirty++;
3186 }
3187 if (num_of_dirty < pages_in_upl / 2) {
3188 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3189
3190 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3191
3192 return(0);
3193 }
3194 }
3195 last_pg = 0;
3196
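/*
 * walk the upl, releasing runs of clean pages with ubc_upl_abort_range
 * and handing each run of valid, dirty pages to cluster_io as an
 * asynchronous write (throttled on the number of writes in flight)
 */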
3197 while (size) {
3198
3199 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3200 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3201 break;
3202 }
3203 if (start_pg > last_pg) {
3204 io_size = (start_pg - last_pg) * PAGE_SIZE;
3205
3206 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3207 UPL_ABORT_FREE_ON_EMPTY);
3208
3209 if (io_size < size)
3210 size -= io_size;
3211 else
3212 break;
3213 }
3214 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3215 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3216 break;
3217 }
3218 upl_offset = start_pg * PAGE_SIZE;
3219
3220 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3221
3222 if (vp->v_flag & VNOCACHE_DATA)
3223 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3224 else
3225 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3226
3227 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3228 vp->v_flag |= VTHROTTLED;
3229 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3230 }
3231 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
3232
3233 size -= io_size;
3234 }
3235 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3236
3237 return(1);
3238 }