1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_NOMAP 0x08
77 #define CL_PAGEOUT 0x10
78 #define CL_AGE 0x20
79 #define CL_DUMP 0x40
80 #define CL_NOZERO 0x80
81 #define CL_PAGEIN 0x100
82 #define CL_DEV_MEMORY 0x200
83
84 /*
85 * throttle the number of async writes that
86 * can be outstanding on a single vnode
87 * before we issue a synchronous write
88 */
89 #define ASYNC_THROTTLE 9
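/*
 * note: writers that reach this limit sleep on &vp->v_numoutput
 * (see cluster_pageout and cluster_write_x below) and are woken
 * by cluster_iodone once the count drains to ASYNC_THROTTLE / 3
 */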
90
91 static int
92 cluster_iodone(bp)
93 struct buf *bp;
94 {
95 int b_flags;
96 int error;
97 int total_size;
98 int total_resid;
99 int upl_offset;
100 upl_t upl;
101 struct buf *cbp;
102 struct buf *cbp_head;
103 struct buf *cbp_next;
104 struct buf *real_bp;
105 struct vnode *vp;
106 int commit_size;
107 int pg_offset;
108
109
110 cbp_head = (struct buf *)(bp->b_trans_head);
111
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
113 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
114
115 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
116 /*
117 * all I/O requests that are part of this transaction
118 * have to complete before we can process it
119 */
120 if ( !(cbp->b_flags & B_DONE)) {
121
122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
123 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
124
125 return 0;
126 }
127 }
128 error = 0;
129 total_size = 0;
130 total_resid = 0;
131
132 cbp = cbp_head;
133 upl_offset = cbp->b_uploffset;
134 upl = cbp->b_pagelist;
135 b_flags = cbp->b_flags;
136 real_bp = cbp->b_real_bp;
137 vp = cbp->b_vp;
138
139 while (cbp) {
140 if (cbp->b_vectorcount > 1)
141 _FREE(cbp->b_vectorlist, M_SEGMENT);
142
143 if ((cbp->b_flags & B_ERROR) && error == 0)
144 error = cbp->b_error;
145
146 total_resid += cbp->b_resid;
147 total_size += cbp->b_bcount;
148
149 cbp_next = cbp->b_trans_next;
150
151 free_io_buf(cbp);
152
153 cbp = cbp_next;
154 }
155 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
156 vp->v_flag &= ~VTHROTTLED;
157 wakeup((caddr_t)&vp->v_numoutput);
158 }
159 if ((b_flags & B_NEED_IODONE) && real_bp) {
160 if (error) {
161 real_bp->b_flags |= B_ERROR;
162 real_bp->b_error = error;
163 }
164 real_bp->b_resid = total_resid;
165
166 biodone(real_bp);
167 }
168 if (error == 0 && total_resid)
169 error = EIO;
170
171 if (b_flags & B_COMMIT_UPL) {
172 pg_offset = upl_offset & PAGE_MASK;
173 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
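/*
 * commit_size is (pg_offset + total_size) rounded up to whole
 * pages so the commit/abort below covers every page touched by
 * this transaction... e.g. with 4K pages, 0x1400 bytes starting
 * 0x200 bytes into a page rounds up to 0x2000 (two pages)
 */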
174
175 if (error || (b_flags & B_NOCACHE)) {
176 int upl_abort_code;
177
178 if (b_flags & B_PAGEOUT)
179 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
180 else if (b_flags & B_PGIN)
181 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
182 else
183 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
184
185 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
186 upl_abort_code);
187
188 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
189 upl, upl_offset - pg_offset, commit_size,
190 0x80000000|upl_abort_code, 0);
191
192 } else {
193 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
194
195 if ( !(b_flags & B_PAGEOUT))
196 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
197 if (b_flags & B_AGE)
198 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
199
200 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
201 upl_commit_flags);
202
203 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
204 upl, upl_offset - pg_offset, commit_size,
205 upl_commit_flags, 0);
206 }
207 } else
208 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
209 upl, upl_offset, 0, error, 0);
210
211 return (error);
212 }
213
214
215 static void
216 cluster_zero(upl, upl_offset, size, flags, bp)
217 upl_t upl;
218 vm_offset_t upl_offset;
219 int size;
220 int flags;
221 struct buf *bp;
222 {
223 vm_offset_t io_addr = 0;
224 kern_return_t kret;
225
226 if ( !(flags & CL_NOMAP)) {
227 kret = ubc_upl_map(upl, &io_addr);
228
229 if (kret != KERN_SUCCESS)
230 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
231 if (io_addr == 0)
232 panic("cluster_zero: ubc_upl_map() mapped 0");
233 } else
234 io_addr = (vm_offset_t)bp->b_data;
235 bzero((caddr_t)(io_addr + upl_offset), size);
236
237 if ( !(flags & CL_NOMAP)) {
238 kret = ubc_upl_unmap(upl);
239
240 if (kret != KERN_SUCCESS)
241 panic("cluster_zero: kernel_upl_unmap failed");
242 }
243 }
244
245 static int
246 cluster_io(vp, upl, upl_offset, f_offset, size, flags, real_bp)
247 struct vnode *vp;
248 upl_t upl;
249 vm_offset_t upl_offset;
250 off_t f_offset;
251 int size;
252 int flags;
253 struct buf *real_bp;
254 {
255 struct buf *cbp;
256 struct iovec *iovp;
257 int io_flags;
258 int error = 0;
259 int retval = 0;
260 struct buf *cbp_head = 0;
261 struct buf *cbp_tail = 0;
262 upl_page_info_t *pl;
263 int pg_count;
264 int pg_offset;
265 int max_iosize;
266 int max_vectors;
267 int priv;
268
269 if (flags & CL_READ) {
270 io_flags = (B_VECTORLIST | B_READ);
271
272 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
273 } else {
274 io_flags = (B_VECTORLIST | B_WRITEINPROG);
275
276 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
277 }
278 pl = ubc_upl_pageinfo(upl);
279
280 if (flags & CL_ASYNC)
281 io_flags |= (B_CALL | B_ASYNC);
282 if (flags & CL_AGE)
283 io_flags |= B_AGE;
284 if (flags & CL_DUMP)
285 io_flags |= B_NOCACHE;
286 if (flags & CL_PAGEIN)
287 io_flags |= B_PGIN;
288
289
290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
291 (int)f_offset, size, upl_offset, flags, 0);
292
293 if ((flags & CL_READ) && ((upl_offset + size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
294 /*
295 * the I/O ends partway through a page, so we're going to end up
296 * with a page that we can't complete (the file size wasn't a
297 * multiple of PAGE_SIZE and we're trying to read to the end of
298 * the file)... go ahead and zero out the portion of the page
299 * we can't read in from the file
300 */
301 cluster_zero(upl, upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK), flags, real_bp);
302
303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
304 upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK),
305 flags, real_bp, 0);
306 }
307 while (size) {
308 size_t io_size;
309 int vsize;
310 int i;
311 int pl_index;
312 int pg_resid;
313 int num_contig;
314 daddr_t lblkno;
315 daddr_t blkno;
316
317 if (size > max_iosize)
318 io_size = max_iosize;
319 else
320 io_size = size;
321
322 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
323 if (error == EOPNOTSUPP)
324 panic("VOP_CMAP Unimplemented");
325 break;
326 }
327
328 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
329 (int)f_offset, (int)blkno, io_size, 0, 0);
330
331 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
332 if (flags & CL_PAGEOUT) {
333 error = EINVAL;
334 break;
335 };
336
337 /* Try paging out the page individually before
338 giving up entirely and dumping it (it could
339 be mapped in a "hole" and require allocation
340 before the I/O).
341 */
342 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
343 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
344 error = EINVAL;
345 break;
346 };
347
348 upl_offset += PAGE_SIZE_64;
349 f_offset += PAGE_SIZE_64;
350 size -= PAGE_SIZE_64;
351 continue;
352 }
353 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
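/*
 * lblkno is f_offset expressed in page-sized logical blocks;
 * the CL_PAGEOUT path below uses lblkno + i with incore() to
 * find (and invalidate) any buffer cache blocks overlapping the
 * pages being pushed, which assumes page-sized buffers here
 */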
354 /*
355 * we have now figured out how much I/O we can do - this is in 'io_size'
356 * pl_index represents the first page in the 'upl' that the I/O will occur for
357 * pg_offset is the starting point in the first page for the I/O
358 * pg_count is the number of full and partial pages that 'io_size' encompasses
359 */
360 pl_index = upl_offset / PAGE_SIZE;
361 pg_offset = upl_offset & PAGE_MASK;
362 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
363
364 if (flags & CL_DEV_MEMORY) {
365 /*
366 * currently, can't deal with reading 'holes' in file
367 */
368 if ((long)blkno == -1) {
369 error = EINVAL;
370 break;
371 }
372 /*
373 * treat physical requests as one 'giant' page
374 */
375 pg_count = 1;
376 }
377 if ((flags & CL_READ) && (long)blkno == -1) {
378 /*
379 * if we're reading and blkno == -1, then we've got a
380 * 'hole' in the file that we need to deal with by zeroing
381 * out the affected area in the upl
382 */
383 cluster_zero(upl, upl_offset, io_size, flags, real_bp);
384
385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
386 upl_offset, io_size, flags, real_bp, 0);
387
388 pg_count = (io_size - pg_offset) / PAGE_SIZE;
389
390 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
391 pg_count++;
392
393 if (pg_count) {
394 if (pg_offset)
395 pg_resid = PAGE_SIZE - pg_offset;
396 else
397 pg_resid = 0;
398 if (flags & CL_COMMIT)
399 ubc_upl_commit_range(upl,
400 upl_offset + pg_resid,
401 pg_count * PAGE_SIZE,
402 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
403 }
404 upl_offset += io_size;
405 f_offset += io_size;
406 size -= io_size;
407
408 if (cbp_head && pg_count)
409 goto start_io;
410 continue;
411 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
412 real_bp->b_blkno = blkno;
413 }
414
415 if (pg_count > 1) {
416 if (pg_count > max_vectors) {
417 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
418
419 if (io_size < 0) {
420 io_size = PAGE_SIZE - pg_offset;
421 pg_count = 1;
422 } else
423 pg_count = max_vectors;
424 }
425 /*
426 * we need to allocate space for the vector list
427 */
428 if (pg_count > 1) {
429 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
430 M_SEGMENT, M_NOWAIT);
431
432 if (iovp == (struct iovec *) 0) {
433 /*
434 * if the allocation fails, then throttle down to a single page
435 */
436 io_size = PAGE_SIZE - pg_offset;
437 pg_count = 1;
438 }
439 }
440 }
441
442 /* Throttle the speculative IO */
443 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
444 priv = 0;
445 else
446 priv = 1;
447
448 cbp = alloc_io_buf(vp, priv);
449
450 if (pg_count == 1)
451 /*
452 * we use the io vector that's reserved in the buffer header
453 * this ensures we can always issue an I/O even in a low memory
454 * condition that prevents the _MALLOC from succeeding... this
455 * is necessary to prevent deadlocks with the pager
456 */
457 iovp = (struct iovec *)(&cbp->b_vects[0]);
458
459 cbp->b_vectorlist = (void *)iovp;
460 cbp->b_vectorcount = pg_count;
461
462 if (flags & CL_DEV_MEMORY) {
463
464 iovp->iov_len = io_size;
465 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
466
467 if (iovp->iov_base == (caddr_t) 0) {
468 free_io_buf(cbp);
469 error = EINVAL;
470 } else
471 iovp->iov_base += upl_offset;
472 } else {
473
474 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
475 int psize;
476
477 psize = PAGE_SIZE - pg_offset;
478
479 if (psize > vsize)
480 psize = vsize;
481
482 iovp->iov_len = psize;
483 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
484
485 if (iovp->iov_base == (caddr_t) 0) {
486 if (pg_count > 1)
487 _FREE(cbp->b_vectorlist, M_SEGMENT);
488 free_io_buf(cbp);
489
490 error = EINVAL;
491 break;
492 }
493 iovp->iov_base += pg_offset;
494 pg_offset = 0;
495
496 if (flags & CL_PAGEOUT) {
497 int s;
498 struct buf *bp;
499
500 s = splbio();
501 if (bp = incore(vp, lblkno + i)) {
502 if (!ISSET(bp->b_flags, B_BUSY)) {
503 bremfree(bp);
504 SET(bp->b_flags, (B_BUSY | B_INVAL));
505 splx(s);
506 brelse(bp);
507 } else
508 panic("BUSY bp found in cluster_io");
509 }
510 splx(s);
511 }
512 vsize -= psize;
513 }
514 }
515 if (error)
516 break;
517
518 if (flags & CL_ASYNC)
519 cbp->b_iodone = (void *)cluster_iodone;
520 cbp->b_flags |= io_flags;
521
522 cbp->b_lblkno = lblkno;
523 cbp->b_blkno = blkno;
524 cbp->b_bcount = io_size;
525 cbp->b_pagelist = upl;
526 cbp->b_uploffset = upl_offset;
527 cbp->b_trans_next = (struct buf *)0;
528
529 if (flags & CL_READ)
530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
531 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
532 else
533 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
534 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
535
536 if (cbp_head) {
537 cbp_tail->b_trans_next = cbp;
538 cbp_tail = cbp;
539 } else {
540 cbp_head = cbp;
541 cbp_tail = cbp;
542 }
543 (struct buf *)(cbp->b_trans_head) = cbp_head;
544
545 upl_offset += io_size;
546 f_offset += io_size;
547 size -= io_size;
548
549 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY)) || size == 0) {
550 /*
551 * if we have no more I/O to issue or
552 * the current I/O we've prepared fully
553 * completes the last page in this request
554 * or it's been completed via a zero-fill
555 * due to a 'hole' in the file
556 * then go ahead and issue the I/O
557 */
558 start_io:
559 if (flags & CL_COMMIT)
560 cbp_head->b_flags |= B_COMMIT_UPL;
561 if (flags & CL_PAGEOUT)
562 cbp_head->b_flags |= B_PAGEOUT;
563 if (flags & CL_PAGEIN)
564 cbp_head->b_flags |= B_PGIN;
565
566 if (real_bp) {
567 cbp_head->b_flags |= B_NEED_IODONE;
568 cbp_head->b_real_bp = real_bp;
569 }
570
571 for (cbp = cbp_head; cbp;) {
572 struct buf * cbp_next;
573
574 if (io_flags & B_WRITEINPROG)
575 cbp->b_vp->v_numoutput++;
576
577 cbp_next = cbp->b_trans_next;
578
579 (void) VOP_STRATEGY(cbp);
580 cbp = cbp_next;
581 }
582 if ( !(flags & CL_ASYNC)) {
583 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
584 biowait(cbp);
585
586 if (error = cluster_iodone(cbp_head)) {
587 retval = error;
588 error = 0;
589 }
590 }
591 cbp_head = (struct buf *)0;
592 cbp_tail = (struct buf *)0;
593 }
594 }
595 if (error) {
596 int abort_size;
597
598 for (cbp = cbp_head; cbp;) {
599 struct buf * cbp_next;
600
601 if (cbp->b_vectorcount > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 upl_offset -= cbp->b_bcount;
604 size += cbp->b_bcount;
605
606 cbp_next = cbp->b_trans_next;
607 free_io_buf(cbp);
608 cbp = cbp_next;
609 }
610 pg_offset = upl_offset & PAGE_MASK;
611 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
612
613 if (flags & CL_COMMIT) {
614 int upl_abort_code;
615
616 if (flags & CL_PAGEOUT)
617 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
618 else if (flags & CL_PAGEIN)
619 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
620 else
621 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
622
623 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
624 upl_abort_code);
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
627 upl, upl_offset - pg_offset, abort_size, error, 0);
628 }
629 if (real_bp) {
630 real_bp->b_flags |= B_ERROR;
631 real_bp->b_error = error;
632
633 biodone(real_bp);
634 }
635 if (retval == 0)
636 retval = error;
637 }
638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
639 (int)f_offset, size, upl_offset, retval, 0);
640
641 return (retval);
642 }
643
644
645 static int
646 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
647 struct vnode *vp;
648 off_t f_offset;
649 u_int size;
650 off_t filesize;
651 int devblocksize;
652 {
653 upl_t upl;
654 upl_page_info_t *pl;
655 int pages_in_upl;
656 int start_pg;
657 int last_pg;
658 int last_valid;
659 int io_size;
660
661
662 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
663 (int)f_offset, size, (int)filesize, 0, 0);
664
665 if (f_offset >= filesize) {
666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
667 (int)f_offset, 0, 0, 0, 0);
668 return(0);
669 }
670 if (ubc_page_op(vp, f_offset, 0, 0, 0) == KERN_SUCCESS) {
671 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
672 (int)f_offset, 0, 0, 0, 0);
673 return(1);
674 }
675 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
676 size = MAX_UPL_TRANSFER * PAGE_SIZE;
677 else
678 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
679
680 if ((off_t)size > (filesize - f_offset))
681 size = ((filesize - f_offset) + (devblocksize - 1)) & ~(devblocksize - 1);
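/*
 * a prefetch past EOF is clipped to the bytes remaining in the
 * file, rounded up to a device block boundary so the driver is
 * never asked to transfer a partial device block
 */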
682
683 pages_in_upl = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
684
685 ubc_create_upl(vp,
686 f_offset,
687 pages_in_upl * PAGE_SIZE,
688 &upl,
689 &pl,
690 UPL_FLAGS_NONE);
691
692 if (upl == (upl_t) 0)
693 return(0);
694
695 /*
696 * scan from the beginning of the upl looking for the first
697 * non-valid page.... this will become the first page in
698 * the request we're going to make to 'cluster_io'... if all
699 * of the pages are valid, we won't call through to 'cluster_io'
700 */
701 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
702 if (!upl_valid_page(pl, start_pg))
703 break;
704 }
705
706 /*
707 * scan from the starting invalid page looking for a valid
708 * page before the end of the upl is reached, if we
709 * find one, then it will be the last page of the request to
710 * 'cluster_io'
711 */
712 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
713 if (upl_valid_page(pl, last_pg))
714 break;
715 }
716
717 /*
718 * if we find any more valid pages at the tail of the upl
719 * then update maxra accordingly....
720 */
721 for (last_valid = last_pg; last_valid < pages_in_upl; last_valid++) {
722 if (!upl_valid_page(pl, last_valid))
723 break;
724 }
725 if (start_pg < last_pg) {
726 vm_offset_t upl_offset;
727
728 /*
729 * we found a range of 'invalid' pages that must be filled
730 * 'size' has already been clipped to the LEOF
731 * make sure it's at least a multiple of the device block size
732 */
733 upl_offset = start_pg * PAGE_SIZE;
734 io_size = (last_pg - start_pg) * PAGE_SIZE;
735
736 if ((upl_offset + io_size) > size) {
737 io_size = size - upl_offset;
738
739 KERNEL_DEBUG(0xd001000, upl_offset, size, io_size, 0, 0);
740 }
741 cluster_io(vp, upl, upl_offset, f_offset + upl_offset, io_size,
742 CL_READ | CL_COMMIT | CL_ASYNC | CL_AGE, (struct buf *)0);
743 }
744 if (start_pg) {
745 /*
746 * start_pg of non-zero indicates we found some already valid pages
747 * at the beginning of the upl.... we need to release these without
748 * modifying their state
749 */
750 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
751
752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
753 upl, 0, start_pg * PAGE_SIZE, 0, 0);
754 }
755 if (last_pg < pages_in_upl) {
756 /*
757 * the set of pages that we issued an I/O for did not extend all the
758 * way to the end of the upl... so just release them without modifying
759 * their state
760 */
761 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
762 UPL_ABORT_FREE_ON_EMPTY);
763
764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
765 upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
766 }
767
768 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
769 (int)f_offset + (last_valid * PAGE_SIZE), 0, 0, 0, 0);
770
771 return(last_valid);
772 }
773
774
775
776 static void
777 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
778 struct vnode *vp;
779 daddr_t b_lblkno;
780 daddr_t e_lblkno;
781 off_t filesize;
782 int devblocksize;
783 {
784 daddr_t r_lblkno;
785 off_t f_offset;
786 int size_of_prefetch;
787 int max_pages;
788
789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
790 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
791
792 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
794 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
795 return;
796 }
797
798 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && b_lblkno != (vp->v_maxra + 1))) {
799 vp->v_ralen = 0;
800 vp->v_maxra = 0;
801
802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
803 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
804
805 return;
806 }
807 max_pages = MAX_UPL_TRANSFER;
808
809 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
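/*
 * the read-ahead window starts at a single page and doubles on
 * each sequential access, capped at MAX_UPL_TRANSFER pages...
 * v_maxra remembers the last page already prefetched so new
 * read-ahead I/O is only issued beyond it (see r_lblkno below)
 */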
810
811 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
812 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
813
814 if (e_lblkno < vp->v_maxra) {
815 if ((vp->v_maxra - e_lblkno) > (max_pages / 4)) {
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
818 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
819 return;
820 }
821 }
822 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
823 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
824
825 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
826
827 if (size_of_prefetch)
828 vp->v_maxra = r_lblkno + (size_of_prefetch - 1);
829
830 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
831 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
832 }
833
834
835 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
836 struct vnode *vp;
837 upl_t upl;
838 vm_offset_t upl_offset;
839 off_t f_offset;
840 int size;
841 off_t filesize;
842 int devblocksize;
843 int flags;
844 {
845 int io_size;
846 int pg_size;
847 off_t max_size;
848 int local_flags = CL_PAGEOUT;
849
850 if ((flags & UPL_IOSYNC) == 0)
851 local_flags |= CL_ASYNC;
852 if ((flags & UPL_NOCOMMIT) == 0)
853 local_flags |= CL_COMMIT;
854
855 if (upl == (upl_t) 0)
856 panic("cluster_pageout: can't handle NULL upl yet\n");
857
858
859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
860 (int)f_offset, size, (int)filesize, local_flags, 0);
861
862 /*
863 * If they didn't specify any I/O, then we are done...
864 * we can't issue an abort because we don't know how
865 * big the upl really is
866 */
867 if (size <= 0)
868 return (EINVAL);
869
870 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
871 if (local_flags & CL_COMMIT)
872 ubc_upl_abort_range(upl, upl_offset, size,
873 UPL_ABORT_FREE_ON_EMPTY);
874 return (EROFS);
875 }
876 /*
877 * can't page-out from a negative offset
878 * or if we're starting beyond the EOF
879 * or if the file offset isn't page aligned
880 * or the size requested isn't a multiple of PAGE_SIZE
881 */
882 if (f_offset < 0 || f_offset >= filesize ||
883 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
884 if (local_flags & CL_COMMIT)
885 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
886 return (EINVAL);
887 }
888 max_size = filesize - f_offset;
889
890 if (size < max_size)
891 io_size = size;
892 else
893 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
894
895 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
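/*
 * pg_size is io_size rounded up to a page boundary; any portion
 * of the upl beyond pg_size lies beyond EOF and is simply
 * released below instead of being written
 */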
896
897 if (size > pg_size) {
898 if (local_flags & CL_COMMIT)
899 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
900 UPL_ABORT_FREE_ON_EMPTY);
901 }
902 while (vp->v_numoutput >= ASYNC_THROTTLE) {
903 vp->v_flag |= VTHROTTLED;
904 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
905 }
906
907 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
908 local_flags, (struct buf *)0));
909 }
910
911
912 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
913 struct vnode *vp;
914 upl_t upl;
915 vm_offset_t upl_offset;
916 off_t f_offset;
917 int size;
918 off_t filesize;
919 int devblocksize;
920 int flags;
921 {
922 u_int io_size;
923 int pg_size;
924 off_t max_size;
925 int retval;
926 int local_flags = 0;
927
928
929 /*
930 * If they didn't ask for any data, then we are done...
931 * we can't issue an abort because we don't know how
932 * big the upl really is
933 */
934 if (size <= 0)
935 return (EINVAL);
936
937 if ((flags & UPL_NOCOMMIT) == 0)
938 local_flags = CL_COMMIT;
939
940 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
941 (int)f_offset, size, (int)filesize, local_flags, 0);
942
943 /*
944 * can't page-in from a negative offset
945 * or if we're starting beyond the EOF
946 * or if the file offset isn't page aligned
947 * or the size requested isn't a multiple of PAGE_SIZE
948 */
949 if (f_offset < 0 || f_offset >= filesize ||
950 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
951 if (local_flags & CL_COMMIT)
952 ubc_upl_abort_range(upl, upl_offset, size,
953 UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
954 return (EINVAL);
955 }
956 max_size = filesize - f_offset;
957
958 if (size < max_size)
959 io_size = size;
960 else
961 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
962
963 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
964
965 if (upl == (upl_t) 0) {
966 ubc_create_upl( vp,
967 f_offset,
968 pg_size,
969 &upl,
970 NULL,
971 UPL_FLAGS_NONE);
972
973 if (upl == (upl_t) 0)
974 return (EINVAL);
975
976 upl_offset = (vm_offset_t)0;
977 size = pg_size;
978 }
979 if (size > pg_size) {
980 if (local_flags & CL_COMMIT)
981 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
982 UPL_ABORT_FREE_ON_EMPTY);
983 }
984
985 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
986 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
987
988 if (retval == 0) {
989 int b_lblkno;
990 int e_lblkno;
991
992 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
993 e_lblkno = (int)
994 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
995
996 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
997 /*
998 * we haven't read in the last page of the file yet
999 * so let's try to read ahead if we're in
1000 * a sequential access pattern
1001 */
1002 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1003 }
1004 vp->v_lastr = e_lblkno;
1005 }
1006 return (retval);
1007 }
1008
1009
1010 cluster_bp(bp)
1011 struct buf *bp;
1012 {
1013 off_t f_offset;
1014 int flags;
1015
1016 if (bp->b_pagelist == (upl_t) 0)
1017 panic("cluster_bp: can't handle NULL upl yet\n");
1018 if (bp->b_flags & B_READ)
1019 flags = CL_ASYNC | CL_NOMAP | CL_READ;
1020 else
1021 flags = CL_ASYNC | CL_NOMAP;
1022
1023 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1024
1025 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, flags, bp));
1026 }
1027
1028
1029 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1030 struct vnode *vp;
1031 struct uio *uio;
1032 off_t oldEOF;
1033 off_t newEOF;
1034 off_t headOff;
1035 off_t tailOff;
1036 int devblocksize;
1037 int flags;
1038 {
1039 int prev_resid;
1040 int clip_size;
1041 off_t max_io_size;
1042 struct iovec *iov;
1043 vm_offset_t upl_offset;
1044 int upl_size;
1045 int pages_in_pl;
1046 upl_page_info_t *pl;
1047 int upl_flags;
1048 upl_t upl;
1049 int retval = 0;
1050
1051
1052 if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1053 {
1054 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1055 return(retval);
1056 }
1057
1058 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1059 {
1060 /* we know we have a resid, so this is safe */
1061 iov = uio->uio_iov;
1062 while (iov->iov_len == 0) {
1063 uio->uio_iov++;
1064 uio->uio_iovcnt--;
1065 iov = uio->uio_iov;
1066 }
1067
1068 /*
1069 * We check every vector target and if it is physically
1070 * contiguous space, we skip the sanity checks.
1071 */
1072
1073 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1074 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1075 pages_in_pl = 0;
1076 upl_flags = UPL_QUERY_OBJECT_TYPE;
1077 if ((vm_map_get_upl(current_map(),
1078 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1079 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1080 {
1081 /*
1082 * the user app must have passed in an invalid address
1083 */
1084 return (EFAULT);
1085 }
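/*
 * the vm_map_get_upl() call above, issued with
 * UPL_QUERY_OBJECT_TYPE, is used here only as a query: it
 * validates the user address and reports via UPL_PHYS_CONTIG
 * whether the vector target is backed by physically contiguous
 * memory, which selects the cluster_phys_write path below
 */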
1086
1087 if (upl_flags & UPL_PHYS_CONTIG)
1088 {
1089 /*
1090 * since the interface to the IOKit below us uses physical block #'s and
1091 * block counts to specify the I/O, we can't handle anything that isn't
1092 * devblocksize aligned
1093 */
1094 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1095 return(EINVAL);
1096
1097 if (flags & IO_HEADZEROFILL)
1098 {
1099 flags &= ~IO_HEADZEROFILL;
1100
1101 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1102 return(retval);
1103 }
1104
1105 retval = cluster_phys_write(vp, uio);
1106
1107 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1108 {
1109 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1110 return(retval);
1111 }
1112 }
1113 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1114 {
1115 /*
1116 * We set a threshold of 4 pages to decide if the nocopy
1117 * write loop is worth the trouble...
1118 * we also come here if we're trying to zero the head and/or tail
1119 * of a partially written page, and the user source is not a physically contiguous region
1120 */
1121 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1122 return(retval);
1123 }
1124 else if (uio->uio_offset & PAGE_MASK_64)
1125 {
1126 /* Bring the file offset write up to a pagesize boundary */
1127 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1128 if (uio->uio_resid < clip_size)
1129 clip_size = uio->uio_resid;
1130 /*
1131 * Fake the resid going into the cluster_write_x call
1132 * and restore it on the way out.
1133 */
1134 prev_resid = uio->uio_resid;
1135 uio->uio_resid = clip_size;
1136 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1137 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1138 }
1139 else if ((int)iov->iov_base & PAGE_MASK_64)
1140 {
1141 clip_size = iov->iov_len;
1142 prev_resid = uio->uio_resid;
1143 uio->uio_resid = clip_size;
1144 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1145 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1146 }
1147 else
1148 {
1149 /*
1150 * If we come in here, we know the offset into
1151 * the file is on a pagesize boundary
1152 */
1153
1154 max_io_size = newEOF - uio->uio_offset;
1155 clip_size = uio->uio_resid;
1156 if (iov->iov_len < clip_size)
1157 clip_size = iov->iov_len;
1158 if (max_io_size < clip_size)
1159 clip_size = max_io_size;
1160
1161 if (clip_size < PAGE_SIZE)
1162 {
1163 /*
1164 * Take care of tail end of write in this vector
1165 */
1166 prev_resid = uio->uio_resid;
1167 uio->uio_resid = clip_size;
1168 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1169 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1170 }
1171 else
1172 {
1173 /* round clip_size down to a multiple of pagesize */
1174 clip_size = clip_size & ~(PAGE_MASK);
1175 prev_resid = uio->uio_resid;
1176 uio->uio_resid = clip_size;
1177 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1178 if ((retval == 0) && uio->uio_resid)
1179 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1180 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1181 }
1182 } /* end else */
1183 } /* end while */
1184 return(retval);
1185 }
1186
1187 static
1188 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1189 struct vnode *vp;
1190 struct uio *uio;
1191 off_t newEOF;
1192 int devblocksize;
1193 int flags;
1194 {
1195 upl_t upl;
1196 upl_page_info_t *pl;
1197 off_t upl_f_offset;
1198 vm_offset_t upl_offset;
1199 off_t max_io_size;
1200 int io_size;
1201 int upl_size;
1202 int upl_needed_size;
1203 int pages_in_pl;
1204 int upl_flags;
1205 kern_return_t kret;
1206 struct iovec *iov;
1207 int i;
1208 int force_data_sync;
1209 int error = 0;
1210
1211 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1212 (int)uio->uio_offset, (int)uio->uio_resid,
1213 (int)newEOF, devblocksize, 0);
1214
1215 /*
1216 * When we enter this routine, we know
1217 * -- the offset into the file is on a pagesize boundary
1218 * -- the resid is a page multiple
1219 * -- the resid will not exceed iov_len
1220 */
1221
1222 iov = uio->uio_iov;
1223
1224 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1225 io_size = uio->uio_resid;
1226
1227 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1228 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1229
1230 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1231 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
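/*
 * upl_offset is where the user buffer begins within its first
 * page, and upl_needed_size is that offset plus the transfer
 * length rounded up to whole pages... the span of user memory
 * that vm_map_get_upl() is asked to cover for this pass
 */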
1232
1233 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1234 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
1235
1236 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1237 {
1238 pages_in_pl = 0;
1239 upl_size = upl_needed_size;
1240 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1241
1242 kret = vm_map_get_upl(current_map(),
1243 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1244 &upl_size,
1245 &upl,
1246 NULL,
1247 &pages_in_pl,
1248 &upl_flags,
1249 force_data_sync);
1250
1251 if (kret != KERN_SUCCESS)
1252 {
1253 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1254 0, 0, 0, kret, 0);
1255
1256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1257 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1258
1259 /* cluster_nocopy_write: failed to get pagelist */
1260 /* do not return kret here */
1261 return(0);
1262 }
1263
1264 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1265 pages_in_pl = upl_size / PAGE_SIZE;
1266
1267 for(i=0; i < pages_in_pl; i++)
1268 {
1269 if (!upl_valid_page(pl, i))
1270 break;
1271 }
1272
1273 if (i == pages_in_pl)
1274 break;
1275
1276 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1277 UPL_ABORT_FREE_ON_EMPTY);
1278 }
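/*
 * the loop above retries vm_map_get_upl() up to three times,
 * raising force_data_sync on each pass, until every page in the
 * returned pagelist is valid; a pass that comes back with any
 * invalid page is aborted and retried
 */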
1279
1280 if (force_data_sync >= 3)
1281 {
1282 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1283 i, pages_in_pl, upl_size, kret, 0);
1284
1285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1286 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1287 return(0);
1288 }
1289
1290 /*
1291 * Consider the possibility that upl_size wasn't satisfied.
1292 */
1293 if (upl_size != upl_needed_size)
1294 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1295
1296 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1297 (int)upl_offset, upl_size, iov->iov_base, io_size, 0);
1298
1299 if (io_size == 0)
1300 {
1301 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1302 UPL_ABORT_FREE_ON_EMPTY);
1303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1304 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1305
1306 return(0);
1307 }
1308
1309 /*
1310 * Now look for pages already in the cache
1311 * and throw them away.
1312 */
1313
1314 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1315 max_io_size = io_size;
1316
1317 while (max_io_size) {
1318
1319 /*
1320 * Flag UPL_POP_DUMP says if the page is found
1321 * in the page cache it must be thrown away.
1322 */
1323 ubc_page_op(vp,
1324 upl_f_offset,
1325 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1326 0, 0);
1327 max_io_size -= PAGE_SIZE;
1328 upl_f_offset += PAGE_SIZE;
1329 }
1330
1331 /*
1332 * issue a synchronous write to cluster_io
1333 */
1334
1335 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1336 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1337
1338 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1339 io_size, 0, (struct buf *)0);
1340
1341 if (error == 0) {
1342 /*
1343 * The cluster_io write completed successfully,
1344 * update the uio structure.
1345 */
1346 iov->iov_base += io_size;
1347 iov->iov_len -= io_size;
1348 uio->uio_resid -= io_size;
1349 uio->uio_offset += io_size;
1350 }
1351 /*
1352 * always 'commit' the I/O via the abort primitive whether the I/O
1353 * succeeded cleanly or not... this is necessary to ensure that
1354 * we preserve the state of the DIRTY flag on the pages used to
1355 * provide the data for the I/O... the state of this flag SHOULD
1356 * NOT be changed by a write
1357 */
1358 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1359 UPL_ABORT_FREE_ON_EMPTY);
1360
1361
1362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1363 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1364
1365 } /* end while */
1366
1367
1368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1369 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1370
1371 return (error);
1372 }
1373
1374 static
1375 cluster_phys_write(vp, uio)
1376 struct vnode *vp;
1377 struct uio *uio;
1378 {
1379 upl_t upl;
1380 vm_offset_t upl_offset;
1381 int io_size;
1382 int upl_size;
1383 int upl_needed_size;
1384 int pages_in_pl;
1385 int upl_flags;
1386 kern_return_t kret;
1387 struct iovec *iov;
1388 int error = 0;
1389
1390 /*
1391 * When we enter this routine, we know
1392 * -- the resid will not exceed iov_len
1393 * -- the vector target address is physically contiguous
1394 */
1395
1396 iov = uio->uio_iov;
1397 io_size = iov->iov_len;
1398 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1399 upl_needed_size = upl_offset + io_size;
1400
1401 pages_in_pl = 0;
1402 upl_size = upl_needed_size;
1403 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1404
1405 kret = vm_map_get_upl(current_map(),
1406 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1407 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1408
1409 if (kret != KERN_SUCCESS)
1410 {
1411 /* cluster_phys_write: failed to get pagelist */
1412 /* note: return kret here */
1413 return(EINVAL);
1414 }
1415
1416 /*
1417 * Consider the possibility that upl_size wasn't satisfied.
1418 * This is a failure in the physical memory case.
1419 */
1420 if (upl_size < upl_needed_size)
1421 {
1422 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1423 return(EINVAL);
1424 }
1425
1426 /*
1427 * issue a synchronous write to cluster_io
1428 */
1429
1430 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1431 io_size, CL_DEV_MEMORY, (struct buf *)0);
1432
1433 if (error == 0) {
1434 /*
1435 * The cluster_io write completed successfully,
1436 * update the uio structure and commit.
1437 */
1438
1439 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1440
1441 iov->iov_base += io_size;
1442 iov->iov_len -= io_size;
1443 uio->uio_resid -= io_size;
1444 uio->uio_offset += io_size;
1445 }
1446 else
1447 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1448
1449 return (error);
1450 }
1451
1452 static
1453 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1454 struct vnode *vp;
1455 struct uio *uio;
1456 off_t oldEOF;
1457 off_t newEOF;
1458 off_t headOff;
1459 off_t tailOff;
1460 int devblocksize;
1461 int flags;
1462 {
1463 upl_page_info_t *pl;
1464 upl_t upl;
1465 vm_offset_t upl_offset;
1466 int upl_size;
1467 off_t upl_f_offset;
1468 int pages_in_upl;
1469 int start_offset;
1470 int xfer_resid;
1471 int io_size;
1472 int io_size_before_rounding;
1473 int io_flags;
1474 vm_offset_t io_address;
1475 int io_offset;
1476 int bytes_to_zero;
1477 int bytes_to_move;
1478 kern_return_t kret;
1479 int retval = 0;
1480 int uio_resid;
1481 long long total_size;
1482 long long zero_cnt;
1483 off_t zero_off;
1484 long long zero_cnt1;
1485 off_t zero_off1;
1486 daddr_t start_blkno;
1487 daddr_t last_blkno;
1488
1489 if (uio) {
1490 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1491 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1492
1493 uio_resid = uio->uio_resid;
1494 } else {
1495 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1496 0, 0, (int)oldEOF, (int)newEOF, 0);
1497
1498 uio_resid = 0;
1499 }
1500 zero_cnt = 0;
1501 zero_cnt1 = 0;
1502
1503 if (flags & IO_HEADZEROFILL) {
1504 /*
1505 * some filesystems (HFS is one) don't support unallocated holes within a file...
1506 * so we zero fill the intervening space between the old EOF and the offset
1507 * where the next chunk of real data begins.... ftruncate will also use this
1508 * routine to zero fill to the new EOF when growing a file... in this case, the
1509 * uio structure will not be provided
1510 */
1511 if (uio) {
1512 if (headOff < uio->uio_offset) {
1513 zero_cnt = uio->uio_offset - headOff;
1514 zero_off = headOff;
1515 }
1516 } else if (headOff < newEOF) {
1517 zero_cnt = newEOF - headOff;
1518 zero_off = headOff;
1519 }
1520 }
1521 if (flags & IO_TAILZEROFILL) {
1522 if (uio) {
1523 zero_off1 = uio->uio_offset + uio->uio_resid;
1524
1525 if (zero_off1 < tailOff)
1526 zero_cnt1 = tailOff - zero_off1;
1527 }
1528 }
1529 if (zero_cnt == 0 && uio == (struct uio *) 0)
1530 {
1531 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1532 retval, 0, 0, 0, 0);
1533 return (0);
1534 }
1535
1536 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1537 /*
1538 * for this iteration of the loop, figure out where our starting point is
1539 */
1540 if (zero_cnt) {
1541 start_offset = (int)(zero_off & PAGE_MASK_64);
1542 upl_f_offset = zero_off - start_offset;
1543 } else if (uio_resid) {
1544 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1545 upl_f_offset = uio->uio_offset - start_offset;
1546 } else {
1547 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1548 upl_f_offset = zero_off1 - start_offset;
1549 }
1550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1551 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1552
1553 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1554 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1555
1556 /*
1557 * compute the size of the upl needed to encompass
1558 * the requested write... limit each call to cluster_io
1559 * to the maximum UPL size... cluster_io will clip if
1560 * this exceeds the maximum io_size for the device,
1561 * make sure to account for
1562 * a starting offset that's not page aligned
1563 */
1564 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1565
1566 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1567 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1568
1569 pages_in_upl = upl_size / PAGE_SIZE;
1570 io_size = upl_size - start_offset;
1571
1572 if ((long long)io_size > total_size)
1573 io_size = total_size;
1574
1575 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1576 last_blkno = start_blkno + pages_in_upl;
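/*
 * start_blkno and last_blkno are the page indices spanned by
 * this upl; the delayed-write logic further down compares them
 * against v_cstart, v_lastw and v_clen to decide whether this
 * write can be absorbed into the vnode's current dirty cluster
 * or whether that cluster must be pushed first
 */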
1577
1578 kret = ubc_create_upl(vp,
1579 upl_f_offset,
1580 upl_size,
1581 &upl,
1582 &pl,
1583 UPL_FLAGS_NONE);
1584 if (kret != KERN_SUCCESS)
1585 panic("cluster_write: failed to get pagelist");
1586
1587 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1588 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1589
1590 if (start_offset && !upl_valid_page(pl, 0)) {
1591 int read_size;
1592
1593 /*
1594 * we're starting in the middle of the first page of the upl
1595 * and the page isn't currently valid, so we're going to have
1596 * to read it in first... this is a synchronous operation
1597 */
1598 read_size = PAGE_SIZE;
1599
1600 if ((upl_f_offset + read_size) > newEOF) {
1601 read_size = newEOF - upl_f_offset;
1602 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1603 }
1604 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
1605 CL_READ, (struct buf *)0);
1606 if (retval) {
1607 /*
1608 * we had an error during the read which causes us to abort
1609 * the current cluster_write request... before we do, we need
1610 * to release the rest of the pages in the upl without modifying
1611 * their state and mark the failed page in error
1612 */
1613 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1614 ubc_upl_abort(upl, 0);
1615
1616 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1617 upl, 0, 0, retval, 0);
1618 break;
1619 }
1620 }
1621 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1622 /*
1623 * the last offset we're writing to in this upl does not end on a page
1624 * boundary... if it's not beyond the old EOF, then we'll also need to
1625 * pre-read this page in if it isn't already valid
1626 */
1627 upl_offset = upl_size - PAGE_SIZE;
1628
1629 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1630 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1631 int read_size;
1632
1633 read_size = PAGE_SIZE;
1634
1635 if ((upl_f_offset + upl_offset + read_size) > newEOF) {
1636 read_size = newEOF - (upl_f_offset + upl_offset);
1637 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1638 }
1639 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
1640 CL_READ, (struct buf *)0);
1641 if (retval) {
1642 /*
1643 * we had an error during the read which causes us to abort
1644 * the current cluster_write request... before we do, we
1645 * need to release the rest of the pages in the upl without
1646 * modifying their state and mark the failed page in error
1647 */
1648 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE,
1649 UPL_ABORT_DUMP_PAGES);
1650 ubc_upl_abort(upl, 0);
1651
1652 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1653 upl, 0, 0, retval, 0);
1654 break;
1655 }
1656 }
1657 }
1658 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1659 panic("cluster_write: ubc_upl_map failed\n");
1660 xfer_resid = io_size;
1661 io_offset = start_offset;
1662
1663 while (zero_cnt && xfer_resid) {
1664
1665 if (zero_cnt < (long long)xfer_resid)
1666 bytes_to_zero = zero_cnt;
1667 else
1668 bytes_to_zero = xfer_resid;
1669
1670 if ( !(flags & IO_NOZEROVALID)) {
1671 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1672
1673 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1674 (int)upl_f_offset + io_offset, bytes_to_zero,
1675 (int)zero_cnt, xfer_resid, 0);
1676 } else {
1677 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1678
1679 if ( !upl_valid_page(pl, (int)(zero_off / PAGE_SIZE_64))) {
1680 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1681
1682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1683 (int)upl_f_offset + io_offset, bytes_to_zero,
1684 (int)zero_cnt, xfer_resid, 0);
1685 }
1686 }
1687 xfer_resid -= bytes_to_zero;
1688 zero_cnt -= bytes_to_zero;
1689 zero_off += bytes_to_zero;
1690 io_offset += bytes_to_zero;
1691 }
1692 if (xfer_resid && uio_resid) {
1693 bytes_to_move = min(uio_resid, xfer_resid);
1694
1695 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1696 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1697
1698 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1699
1700 if (retval) {
1701 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1702 panic("cluster_write: kernel_upl_unmap failed\n");
1703 ubc_upl_abort(upl, UPL_ABORT_DUMP_PAGES);
1704
1705 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1706 upl, 0, 0, retval, 0);
1707 } else {
1708 uio_resid -= bytes_to_move;
1709 xfer_resid -= bytes_to_move;
1710 io_offset += bytes_to_move;
1711 }
1712 }
1713 while (xfer_resid && zero_cnt1 && retval == 0) {
1714
1715 if (zero_cnt1 < (long long)xfer_resid)
1716 bytes_to_zero = zero_cnt1;
1717 else
1718 bytes_to_zero = xfer_resid;
1719
1720 if ( !(flags & IO_NOZEROVALID)) {
1721 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1722
1723 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1724 (int)upl_f_offset + io_offset,
1725 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1726 } else {
1727 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1728 if ( !upl_valid_page(pl, (int)(zero_off1 / PAGE_SIZE_64))) {
1729 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1730
1731 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1732 (int)upl_f_offset + io_offset,
1733 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1734 }
1735 }
1736 xfer_resid -= bytes_to_zero;
1737 zero_cnt1 -= bytes_to_zero;
1738 zero_off1 += bytes_to_zero;
1739 io_offset += bytes_to_zero;
1740 }
1741
1742 if (retval == 0) {
1743 int must_push;
1744 int can_delay;
1745
1746 io_size += start_offset;
1747
1748 if ((upl_f_offset + io_size) == newEOF && io_size < upl_size) {
1749 /*
1750 * if we're extending the file with this write
1751 * we'll zero fill the rest of the page so that
1752 * if the file gets extended again in such a way as to leave a
1753 * hole starting at this EOF, we'll have zeros in the correct spot
1754 */
1755 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1756
1757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1758 (int)upl_f_offset + io_size,
1759 upl_size - io_size, 0, 0, 0);
1760 }
1761 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1762 panic("cluster_write: kernel_upl_unmap failed\n");
1763
1764 io_size_before_rounding = io_size;
1765
1766 if (io_size & (devblocksize - 1))
1767 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
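/*
 * round the size we intend to push up to a device block
 * boundary; io_size_before_rounding is preserved so the test
 * below that decides whether to keep delaying writes for this
 * cluster still sees the unrounded size
 */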
1768
1769 must_push = 0;
1770 can_delay = 0;
1771
1772 if (vp->v_clen) {
1773 int newsize;
1774
1775 /*
1776 * we have an existing cluster... see if this write will extend it nicely
1777 */
1778 if (start_blkno >= vp->v_cstart) {
1779 if (last_blkno <= (vp->v_cstart + vp->v_clen)) {
1780 /*
1781 * we have a write that fits entirely
1782 * within the existing cluster limits
1783 */
1784 if (last_blkno >= vp->v_lastw) {
1785 /*
1786 * if we're extending the dirty region within the cluster
1787 * we need to update the cluster info... we check for blkno
1788 * equality because we may be extending the file with a
1789 * partial write.... this in turn changes our idea of how
1790 * much data to write out (v_ciosiz) for the last page
1791 */
1792 vp->v_lastw = last_blkno;
1793 newsize = io_size + ((start_blkno - vp->v_cstart) * PAGE_SIZE);
1794
1795 if (newsize > vp->v_ciosiz)
1796 vp->v_ciosiz = newsize;
1797 }
1798 can_delay = 1;
1799 goto finish_io;
1800 }
1801 if (start_blkno < (vp->v_cstart + vp->v_clen)) {
1802 /*
1803 * we have a write that starts in the middle of the current cluster
1804 * but extends beyond the cluster's limit
1805 * we'll clip the current cluster if we actually
1806 * overlap with the new write and then push it out
1807 * and start a new cluster with the current write
1808 */
1809 if (vp->v_lastw > start_blkno) {
1810 vp->v_lastw = start_blkno;
1811 vp->v_ciosiz = (vp->v_lastw - vp->v_cstart) * PAGE_SIZE;
1812 }
1813 }
1814 /*
1815 * we also get here for the case where the current write starts
1816 * beyond the limit of the existing cluster
1817 */
1818 must_push = 1;
1819 goto check_delay;
1820 }
1821 /*
1822 * the current write starts in front of the current cluster
1823 */
1824 if (last_blkno > vp->v_cstart) {
1825 /*
1826 * the current write extends into the existing cluster
1827 */
1828 if ((vp->v_lastw - start_blkno) > vp->v_clen) {
1829 /*
1830 * if we were to combine this write with the current cluster
1831 * we would exceed the cluster size limit....
1832 * clip the current cluster by moving the start position
1833 * to where the current write ends, and then push it
1834 */
1835 vp->v_ciosiz -= (last_blkno - vp->v_cstart) * PAGE_SIZE;
1836 vp->v_cstart = last_blkno;
1837
1838 /*
1839 * round up the io_size to the nearest page size
1840 * since we've coalesced with at least 1 pre-existing
1841 * page in the current cluster... this write may have ended in the
1842 * middle of the page which would cause io_size to give us an
1843 * inaccurate view of how much I/O we actually need to do
1844 */
1845 io_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1846
1847 must_push = 1;
1848 goto check_delay;
1849 }
1850 /*
1851 * we can coalesce the current write with the existing cluster
1852 * adjust the cluster info to reflect this
1853 */
1854 if (last_blkno > vp->v_lastw) {
1855 /*
1856 * the current write completely overlaps
1857 * the existing cluster
1858 */
1859 vp->v_lastw = last_blkno;
1860 vp->v_ciosiz = io_size;
1861 } else {
1862 vp->v_ciosiz += (vp->v_cstart - start_blkno) * PAGE_SIZE;
1863
1864 if (io_size > vp->v_ciosiz)
1865 vp->v_ciosiz = io_size;
1866 }
1867 vp->v_cstart = start_blkno;
1868 can_delay = 1;
1869 goto finish_io;
1870 }
1871 /*
1872 * this I/O range is entirely in front of the current cluster
1873 * so we need to push the current cluster out before beginning
1874 * a new one
1875 */
1876 must_push = 1;
1877 }
1878 check_delay:
1879 if (must_push)
1880 cluster_push(vp);
1881
1882 if (io_size_before_rounding < (MAX_UPL_TRANSFER * PAGE_SIZE) && !(flags & IO_SYNC)) {
1883 vp->v_clen = MAX_UPL_TRANSFER;
1884 vp->v_cstart = start_blkno;
1885 vp->v_lastw = last_blkno;
1886 vp->v_ciosiz = io_size;
1887
1888 can_delay = 1;
1889 }
1890 finish_io:
1891 if (can_delay) {
1892 ubc_upl_commit_range(upl, 0, upl_size,
1893 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1894 continue;
1895 }
1896 if (flags & IO_SYNC)
1897 io_flags = CL_COMMIT | CL_AGE;
1898 else
1899 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1900
1901 if (vp->v_flag & VNOCACHE_DATA)
1902 io_flags |= CL_DUMP;
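/*
 * CL_DUMP turns into B_NOCACHE in cluster_io, so when the vnode
 * is marked VNOCACHE_DATA the pages are dumped from the cache by
 * cluster_iodone once the write completes rather than being
 * committed back as clean
 */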
1903
1904 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1905 vp->v_flag |= VTHROTTLED;
1906 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1907 }
1908 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size,
1909 io_flags, (struct buf *)0);
1910 }
1911 }
1912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1913 retval, 0, 0, 0, 0);
1914
1915 return (retval);
1916 }
1917
1918 cluster_read(vp, uio, filesize, devblocksize, flags)
1919 struct vnode *vp;
1920 struct uio *uio;
1921 off_t filesize;
1922 int devblocksize;
1923 int flags;
1924 {
1925 int prev_resid;
1926 int clip_size;
1927 off_t max_io_size;
1928 struct iovec *iov;
1929 vm_offset_t upl_offset;
1930 int upl_size;
1931 int pages_in_pl;
1932 upl_page_info_t *pl;
1933 int upl_flags;
1934 upl_t upl;
1935 int retval = 0;
1936
1937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1938 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1939
1940 /*
1941 * We set a threshold of 4 pages to decide if the nocopy
1942 * read loop is worth the trouble...
1943 */
1944
1945 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1946 {
1947 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1948 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1949 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1950 return(retval);
1951 }
1952
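/*
 * for each iovec in the request, decide how to service it...
 * physically contiguous targets go to cluster_phys_read, small or
 * unaligned pieces fall back to the cached path (cluster_read_x),
 * and page aligned user buffers take the nocopy path
 * (cluster_nocopy_read) with any unaligned tail cleaned up afterwards
 */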
1953 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
1954 {
1955 /* we know we have a resid, so this is safe */
1956 iov = uio->uio_iov;
1957 while (iov->iov_len == 0) {
1958 uio->uio_iov++;
1959 uio->uio_iovcnt--;
1960 iov = uio->uio_iov;
1961 }
1962
1963 /*
1964 * We check every vector target and if it is physically
1965 * contiguous space, we skip the sanity checks.
1966 */
1967
1968 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1969 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1970 pages_in_pl = 0;
1971 upl_flags = UPL_QUERY_OBJECT_TYPE;
1972 if((vm_map_get_upl(current_map(),
1973 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1974 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1975 {
1976 /*
1977 * the user app must have passed in an invalid address
1978 */
1979 return (EFAULT);
1980 }
1981
1982 if (upl_flags & UPL_PHYS_CONTIG)
1983 {
1984 retval = cluster_phys_read(vp, uio, filesize);
1985 }
1986 else if (uio->uio_resid < 4 * PAGE_SIZE)
1987 {
1988 /*
1989 * We set a threshold of 4 pages to decide if the nocopy
1990 * read loop is worth the trouble...
1991 */
1992 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1994 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1995 return(retval);
1996 }
1997 else if (uio->uio_offset & PAGE_MASK_64)
1998 {
1999 /* Bring the file offset read up to a pagesize boundary */
2000 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2001 if (uio->uio_resid < clip_size)
2002 clip_size = uio->uio_resid;
2003 /*
2004 * Fake the resid going into the cluster_read_x call
2005 * and restore it on the way out.
2006 */
2007 prev_resid = uio->uio_resid;
2008 uio->uio_resid = clip_size;
2009 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2010 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2011 }
2012 else if ((int)iov->iov_base & PAGE_MASK_64)
2013 {
2014 clip_size = iov->iov_len;
2015 prev_resid = uio->uio_resid;
2016 uio->uio_resid = clip_size;
2017 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2018 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2019 }
2020 else
2021 {
2022 /*
2023 * If we come in here, we know the offset into
2024 * the file is on a pagesize boundary
2025 */
2026
2027 max_io_size = filesize - uio->uio_offset;
2028 clip_size = uio->uio_resid;
2029 if (iov->iov_len < clip_size)
2030 clip_size = iov->iov_len;
2031 if (max_io_size < clip_size)
2032 clip_size = (int)max_io_size;
2033
2034 if (clip_size < PAGE_SIZE)
2035 {
2036 /*
2037 * Take care of the tail end of the read in this vector.
2038 */
2039 prev_resid = uio->uio_resid;
2040 uio->uio_resid = clip_size;
2041 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2042 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2043 }
2044 else
2045 {
2046 /* round clip_size down to a multiple of pagesize */
2047 clip_size = clip_size & ~(PAGE_MASK);
2048 prev_resid = uio->uio_resid;
2049 uio->uio_resid = clip_size;
2050 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2051 if ((retval==0) && uio->uio_resid)
2052 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2053 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2054 }
2055 } /* end else */
2056 } /* end while */
2057
2058 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2059 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2060
2061 return(retval);
2062 }
2063
2064 static
2065 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2066 struct vnode *vp;
2067 struct uio *uio;
2068 off_t filesize;
2069 int devblocksize;
2070 int flags;
2071 {
2072 upl_page_info_t *pl;
2073 upl_t upl;
2074 vm_offset_t upl_offset;
2075 int upl_size;
2076 off_t upl_f_offset;
2077 int start_offset;
2078 int start_pg;
2079 int last_pg;
2080 int uio_last;
2081 int pages_in_upl;
2082 off_t max_size;
2083 int io_size;
2084 vm_offset_t io_address;
2085 kern_return_t kret;
2086 int segflg;
2087 int error = 0;
2088 int retval = 0;
2089 int b_lblkno;
2090 int e_lblkno;
2091
2092 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2093
2094 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2095 /*
2096 * compute the size of the upl needed to encompass
2097 * the requested read... limit each call to cluster_io
2098 * to the maximum UPL size... cluster_io will clip if
2099 * this exceeds the maximum io_size for the device...
2100 * make sure to account for
2101 * a starting offset that's not page aligned
2102 */
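/*
 * e.g. with 4K pages, a read at file offset 0x1800 yields
 * start_offset == 0x800 and upl_f_offset == 0x1000... the upl is
 * always built from a page aligned file offset
 */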
2103 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2104 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2105 max_size = filesize - uio->uio_offset;
2106
2107 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2108 io_size = uio->uio_resid;
2109 else
2110 io_size = max_size;
2111 #ifdef ppc
2112 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2113 segflg = uio->uio_segflg;
2114
2115 uio->uio_segflg = UIO_PHYS_USERSPACE;
2116
2117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2118 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2119
2120 while (io_size && retval == 0) {
2121 int xsize;
2122 vm_offset_t paddr;
2123
2124 if (ubc_page_op(vp,
2125 upl_f_offset,
2126 UPL_POP_SET | UPL_POP_BUSY,
2127 &paddr, 0) != KERN_SUCCESS)
2128 break;
2129
2130 xsize = PAGE_SIZE - start_offset;
2131
2132 if (xsize > io_size)
2133 xsize = io_size;
2134
2135 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2136
2137 ubc_page_op(vp, upl_f_offset,
2138 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2139
2140 io_size -= xsize;
2141 start_offset = (int)
2142 (uio->uio_offset & PAGE_MASK_64);
2143 upl_f_offset = uio->uio_offset - start_offset;
2144 }
2145 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2146 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2147
2148 uio->uio_segflg = segflg;
2149
2150 if (retval)
2151 break;
2152
2153 if (io_size == 0) {
2154 /*
2155 * we're already finished with this read request
2156 * let's see if we should do a read-ahead
2157 */
2158 e_lblkno = (int)
2159 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2160
2161 if (!(vp->v_flag & VRAOFF))
2162 /*
2163 * let's try to read ahead if we're in
2164 * a sequential access pattern
2165 */
2166 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2167 vp->v_lastr = e_lblkno;
2168
2169 break;
2170 }
2171 max_size = filesize - uio->uio_offset;
2172 }
2173 #endif
2174 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2175 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2176 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2177 pages_in_upl = upl_size / PAGE_SIZE;
2178
2179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2180 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2181
2182 kret = ubc_create_upl(vp,
2183 upl_f_offset,
2184 upl_size,
2185 &upl,
2186 &pl,
2187 UPL_FLAGS_NONE);
2188 if (kret != KERN_SUCCESS)
2189 panic("cluster_read: failed to get pagelist");
2190
2191 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2192 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2193
2194 /*
2195 * scan from the beginning of the upl looking for the first
2196 * non-valid page.... this will become the first page in
2197 * the request we're going to make to 'cluster_io'... if all
2198 * of the pages are valid, we won't call through to 'cluster_io'
2199 */
2200 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2201 if (!upl_valid_page(pl, start_pg))
2202 break;
2203 }
2204
2205 /*
2206 * scan from the starting invalid page looking for a valid
2207 * page before the end of the upl is reached, if we
2208 * find one, then it will be the last page of the request to
2209 * 'cluster_io'
2210 */
2211 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2212 if (upl_valid_page(pl, last_pg))
2213 break;
2214 }
2215
2216 if (start_pg < last_pg) {
2217 /*
2218 * we found a range of 'invalid' pages that must be filled...
2219 * if the last page in this range is the last page of the file,
2220 * we may have to clip the size of it to keep from reading past
2221 * the end of the last physical block associated with the file
2222 */
2223 upl_offset = start_pg * PAGE_SIZE;
2224 io_size = (last_pg - start_pg) * PAGE_SIZE;
2225
2226 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2227 io_size = filesize - (upl_f_offset + upl_offset);
2228 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2229 }
2230 /*
2231 * issue a synchronous read to cluster_io
2232 */
2233
2234 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2235 io_size, CL_READ, (struct buf *)0);
2236 }
2237 if (error == 0) {
2238 /*
2239 * if the read completed successfully, or there was no I/O request
2240 * issued, then map the upl into kernel address space and
2241 * move the data into user land.... we'll first add on any 'valid'
2242 * pages that were present in the upl when we acquired it.
2243 */
2244 u_int val_size;
2245 u_int size_of_prefetch;
2246
2247 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2248 if (!upl_valid_page(pl, uio_last))
2249 break;
2250 }
2251 /*
2252 * compute size to transfer this round, if uio->uio_resid is
2253 * still non-zero after this uiomove, we'll loop around and
2254 * set up for another I/O.
2255 */
2256 val_size = (uio_last * PAGE_SIZE) - start_offset;
2257
2258 if (max_size < val_size)
2259 val_size = max_size;
2260
2261 if (uio->uio_resid < val_size)
2262 val_size = uio->uio_resid;
2263
2264 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2265
2266 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2267 /*
2268 * if there's still I/O left to do for this request, then issue a
2269 * pre-fetch I/O... the I/O wait time will overlap
2270 * with the copying of the data
2271 */
2272 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2273 } else {
2274 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2275 /*
2276 * let's try to read ahead if we're in
2277 * a sequential access pattern
2278 */
2279 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2280 vp->v_lastr = e_lblkno;
2281 }
2282 #ifdef ppc
2283 if (uio->uio_segflg == UIO_USERSPACE) {
2284 int offset;
2285
2286 segflg = uio->uio_segflg;
2287
2288 uio->uio_segflg = UIO_PHYS_USERSPACE;
2289
2290
2291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2292 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2293
2294 offset = start_offset;
2295
2296 while (val_size && retval == 0) {
2297 int csize;
2298 int i;
2299 caddr_t paddr;
2300
2301 i = offset / PAGE_SIZE;
2302 csize = min(PAGE_SIZE - start_offset, val_size);
2303
2304 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2305
2306 retval = uiomove(paddr, csize, uio);
2307
2308 val_size -= csize;
2309 offset += csize;
2310 start_offset = offset & PAGE_MASK;
2311 }
2312 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2313 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2314
2315 uio->uio_segflg = segflg;
2316 } else
2317 #endif
2318 {
2319 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2320 panic("cluster_read: ubc_upl_map() failed\n");
2321
2322 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2323
2324 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2325 panic("cluster_read: ubc_upl_unmap() failed\n");
2326 }
2327 }
2328 if (start_pg < last_pg) {
2329 /*
2330 * compute the range of pages that we actually issued an I/O for
2331 * and either commit them as valid if the I/O succeeded
2332 * or abort them if the I/O failed
2333 */
2334 io_size = (last_pg - start_pg) * PAGE_SIZE;
2335
2336 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2337 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2338
2339 if (error || (vp->v_flag & VNOCACHE_DATA))
2340 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2341 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2342 else
2343 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2344 UPL_COMMIT_CLEAR_DIRTY
2345 | UPL_COMMIT_FREE_ON_EMPTY
2346 | UPL_COMMIT_INACTIVATE);
2347
2348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2349 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2350 }
2351 if ((last_pg - start_pg) < pages_in_upl) {
2352 int cur_pg;
2353 int commit_flags;
2354
2355 /*
2356 * the set of pages that we issued an I/O for did not encompass
2357 * the entire upl... so just release these without modifying
2358 * their state
2359 */
2360 if (error)
2361 ubc_upl_abort(upl, 0);
2362 else {
2363 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2364 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2365
2366 if (start_pg) {
2367 /*
2368 * we found some already valid pages at the beginning of
2369 * the upl... commit these back to the inactive list with
2370 * reference cleared
2371 */
2372 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2373 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2374 | UPL_COMMIT_INACTIVATE;
2375
2376 if (upl_dirty_page(pl, cur_pg))
2377 commit_flags |= UPL_COMMIT_SET_DIRTY;
2378
2379 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2380 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2381 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2382 else
2383 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2384 PAGE_SIZE, commit_flags);
2385 }
2386 }
2387 if (last_pg < uio_last) {
2388 /*
2389 * we found some already valid pages immediately after the
2390 * pages we issued I/O for... commit these back to the
2391 * inactive list with reference cleared
2392 */
2393 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2394 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2395 | UPL_COMMIT_INACTIVATE;
2396
2397 if (upl_dirty_page(pl, cur_pg))
2398 commit_flags |= UPL_COMMIT_SET_DIRTY;
2399
2400 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2401 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2402 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2403 else
2404 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2405 PAGE_SIZE, commit_flags);
2406 }
2407 }
2408 if (uio_last < pages_in_upl) {
2409 /*
2410 * there were some invalid pages beyond the valid pages
2411 * that we didn't issue an I/O for, just release them
2412 * unchanged
2413 */
2414 ubc_upl_abort(upl, 0);
2415 }
2416
2417 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2418 upl, -1, -1, 0, 0);
2419 }
2420 }
2421 if (retval == 0)
2422 retval = error;
2423 }
2424
2425 return (retval);
2426 }
2427
2428 static
2429 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2430 struct vnode *vp;
2431 struct uio *uio;
2432 off_t filesize;
2433 int devblocksize;
2434 int flags;
2435 {
2436 upl_t upl;
2437 upl_page_info_t *pl;
2438 off_t upl_f_offset;
2439 vm_offset_t upl_offset;
2440 off_t start_upl_f_offset;
2441 off_t max_io_size;
2442 int io_size;
2443 int upl_size;
2444 int upl_needed_size;
2445 int pages_in_pl;
2446 vm_offset_t paddr;
2447 int upl_flags;
2448 kern_return_t kret;
2449 int segflg;
2450 struct iovec *iov;
2451 int i;
2452 int force_data_sync;
2453 int error = 0;
2454 int retval = 0;
2455
2456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2457 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2458
2459 /*
2460 * When we enter this routine, we know
2461 * -- the offset into the file is on a pagesize boundary
2462 * -- the resid is a page multiple
2463 * -- the resid will not exceed iov_len
2464 */
2465
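/*
 * general plan for each pass of the loop below...
 * 1) copy any pages that are already resident in the cache straight
 *    to the user's buffer with ubc_page_op/uiomove
 * 2) measure the run of non-resident pages that follows
 * 3) grab a upl over the corresponding piece of the user's buffer
 *    with vm_map_get_upl and read directly into it via cluster_io,
 *    avoiding a copy through the page cache
 */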
2466 iov = uio->uio_iov;
2467 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2468
2469 max_io_size = filesize - uio->uio_offset;
2470
2471 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2472 io_size = max_io_size;
2473 else
2474 io_size = uio->uio_resid;
2475
2476 /*
2477 * We don't come into this routine unless
2478 * UIO_USERSPACE is set.
2479 */
2480 segflg = uio->uio_segflg;
2481
2482 uio->uio_segflg = UIO_PHYS_USERSPACE;
2483
2484 /*
2485 * First look for pages already in the cache
2486 * and move them to user space.
2487 */
2488 while (io_size && (retval == 0)) {
2489 upl_f_offset = uio->uio_offset;
2490
2491 /*
2492 * If this call fails, it means the page is not
2493 * in the page cache.
2494 */
2495 if (ubc_page_op(vp, upl_f_offset,
2496 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2497 break;
2498
2499 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2500
2501 ubc_page_op(vp, upl_f_offset,
2502 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2503
2504 io_size -= PAGE_SIZE;
2505 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2506 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2507 }
2508
2509 uio->uio_segflg = segflg;
2510
2511 if (retval)
2512 {
2513 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2514 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2515 return(retval);
2516 }
2517
2518 /* If we are already finished with this read, then return */
2519 if (io_size == 0)
2520 {
2521
2522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2523 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2524 return(0);
2525 }
2526
2527 max_io_size = io_size;
2528 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2529 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2530
2531 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2532 upl_f_offset = start_upl_f_offset;
2533 io_size = 0;
2534
2535 while(io_size < max_io_size)
2536 {
2537
2538 if(ubc_page_op(vp, upl_f_offset,
2539 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2540 {
2541 ubc_page_op(vp, upl_f_offset,
2542 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2543 break;
2544 }
2545
2546 /*
2547 * Build up the io request parameters.
2548 */
2549
2550 io_size += PAGE_SIZE;
2551 upl_f_offset += PAGE_SIZE;
2552 }
2553
2554 if (io_size == 0)
2555 return(retval);
2556
2557 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2558 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2559
2560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2561 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
2562
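/*
 * try to get a upl that covers the user's buffer... if some of the
 * pages come back non-valid we abort the partial upl and retry with
 * a larger force_data_sync value (up to 3 attempts) before giving
 * up... presumably each retry asks the VM to do more work to bring
 * the backing pages into a usable state
 */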
2563 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2564 {
2565 pages_in_pl = 0;
2566 upl_size = upl_needed_size;
2567 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2568
2569 kret = vm_map_get_upl(current_map(),
2570 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2571 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2572
2573 if (kret != KERN_SUCCESS)
2574 {
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2576 (int)upl_offset, upl_size, io_size, kret, 0);
2577
2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2579 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2580
2581 /* cluster_nocopy_read: failed to get pagelist */
2582 /* do not return kret here */
2583 return(retval);
2584 }
2585
2586 pages_in_pl = upl_size / PAGE_SIZE;
2587 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2588
2589 for(i=0; i < pages_in_pl; i++)
2590 {
2591 if (!upl_valid_page(pl, i))
2592 break;
2593 }
2594 if (i == pages_in_pl)
2595 break;
2596
2597 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2598 UPL_ABORT_FREE_ON_EMPTY);
2599 }
2600
2601 if (force_data_sync >= 3)
2602 {
2603 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2604 (int)upl_offset, upl_size, io_size, kret, 0);
2605
2606 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2607 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2608 return(retval);
2609 }
2610 /*
2611 * Consider the possibility that upl_size wasn't satisfied.
2612 */
2613 if (upl_size != upl_needed_size)
2614 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2615
2616 if (io_size == 0)
2617 {
2618 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2619 UPL_ABORT_FREE_ON_EMPTY);
2620 return(retval);
2621 }
2622
2623 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2624 (int)upl_offset, upl_size, io_size, kret, 0);
2625
2626 /*
2627 * issue a synchronous read to cluster_io
2628 */
2629
2630 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2631 upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2632
2633 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2634 io_size, CL_READ| CL_NOZERO, (struct buf *)0);
2635
2636 if (error == 0) {
2637 /*
2638 * The cluster_io read completed successfully,
2639 * update the uio structure and commit.
2640 */
2641
2642 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2643 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2644
2645 iov->iov_base += io_size;
2646 iov->iov_len -= io_size;
2647 uio->uio_resid -= io_size;
2648 uio->uio_offset += io_size;
2649 }
2650 else {
2651 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2652 UPL_ABORT_FREE_ON_EMPTY);
2653 }
2654
2655 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2656 upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2657
2658 if (retval == 0)
2659 retval = error;
2660
2661 } /* end while */
2662
2663
2664 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2665 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2666
2667 return (retval);
2668 }
2669
2670
2671 static
2672 cluster_phys_read(vp, uio, filesize)
2673 struct vnode *vp;
2674 struct uio *uio;
2675 off_t filesize;
2676 {
2677 upl_t upl;
2678 vm_offset_t upl_offset;
2679 off_t max_size;
2680 int io_size;
2681 int upl_size;
2682 int upl_needed_size;
2683 int pages_in_pl;
2684 int upl_flags;
2685 kern_return_t kret;
2686 struct iovec *iov;
2687 int error;
2688
2689 /*
2690 * When we enter this routine, we know
2691 * -- the resid will not exceed iov_len
2692 * -- the target address is physically contiguous
2693 */
2694
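/*
 * the target is a single physically contiguous buffer, so one
 * vm_map_get_upl over iov_base followed by one cluster_io with
 * CL_DEV_MEMORY is enough... no page level bookkeeping is needed here
 */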
2695 iov = uio->uio_iov;
2696
2697 max_size = filesize - uio->uio_offset;
2698
2699 if (max_size < (off_t)((unsigned int)iov->iov_len))
2700 io_size = max_size;
2701 else
2702 io_size = iov->iov_len;
2703
2704 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2705 upl_needed_size = upl_offset + io_size;
2706
2707 pages_in_pl = 0;
2708 upl_size = upl_needed_size;
2709 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2710
2711 kret = vm_map_get_upl(current_map(),
2712 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2713 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2714
2715 if (kret != KERN_SUCCESS)
2716 {
2717 /* cluster_phys_read: failed to get pagelist */
2718 return(EINVAL);
2719 }
2720
2721 /*
2722 * Consider the possibility that upl_size wasn't satisfied.
2723 */
2724 if (upl_size < upl_needed_size)
2725 {
2726 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2727 return(EINVAL);
2728 }
2729
2730 /*
2731 * issue a synchronous read to cluster_io
2732 */
2733
2734 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2735 io_size, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2736
2737 if (error == 0)
2738 {
2739 /*
2740 * The cluster_io read completed successfully,
2741 * update the uio structure and commit.
2742 */
2743
2744 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2745
2746 iov->iov_base += io_size;
2747 iov->iov_len -= io_size;
2748 uio->uio_resid -= io_size;
2749 uio->uio_offset += io_size;
2750 }
2751 else
2752 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2753
2754 return (error);
2755 }
2756
2757 /*
2758 * generate advisory I/O's in the largest chunks possible
2759 * the completed pages will be released into the VM cache
2760 */
2761 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2762 struct vnode *vp;
2763 off_t filesize;
2764 off_t f_offset;
2765 int resid;
2766 int devblocksize;
2767 {
2768 upl_page_info_t *pl;
2769 upl_t upl;
2770 vm_offset_t upl_offset;
2771 int upl_size;
2772 off_t upl_f_offset;
2773 int start_offset;
2774 int start_pg;
2775 int last_pg;
2776 int pages_in_upl;
2777 off_t max_size;
2778 int io_size;
2779 kern_return_t kret;
2780 int retval = 0;
2781
2782
2783 if (!UBCINFOEXISTS(vp))
2784 return(EINVAL);
2785
2786 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2787 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2788
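/*
 * work through the range one upl sized chunk at a time... within each
 * chunk only the first run of non-resident pages gets an I/O: it is
 * issued asynchronously (CL_ASYNC | CL_READ) with CL_COMMIT so that
 * completion releases those pages into the cache... pages that were
 * already valid, and anything beyond that first run, are simply
 * aborted back untouched and the next chunk starts there
 */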
2789 while (resid && f_offset < filesize && retval == 0) {
2790 /*
2791 * compute the size of the upl needed to encompass
2792 * the requested read... limit each call to cluster_io
2793 * to the maximum UPL size... cluster_io will clip if
2794 * this exceeds the maximum io_size for the device...
2795 * make sure to account for
2796 * a starting offset that's not page aligned
2797 */
2798 start_offset = (int)(f_offset & PAGE_MASK_64);
2799 upl_f_offset = f_offset - (off_t)start_offset;
2800 max_size = filesize - f_offset;
2801
2802 if (resid < max_size)
2803 io_size = resid;
2804 else
2805 io_size = max_size;
2806
2807 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2808 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2809 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2810 pages_in_upl = upl_size / PAGE_SIZE;
2811
2812 kret = ubc_create_upl(vp,
2813 upl_f_offset,
2814 upl_size,
2815 &upl,
2816 &pl,
2817 UPL_FLAGS_NONE);
2818 if (kret != KERN_SUCCESS)
2819 panic("advisory_read: failed to get pagelist");
2820
2821
2822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2823 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2824
2825 /*
2826 * scan from the beginning of the upl looking for the first
2827 * non-valid page.... this will become the first page in
2828 * the request we're going to make to 'cluster_io'... if all
2829 * of the pages are valid, we won't call through to 'cluster_io'
2830 */
2831 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2832 if (!upl_valid_page(pl, start_pg))
2833 break;
2834 }
2835
2836 /*
2837 * scan from the starting invalid page looking for a valid
2838 * page before the end of the upl is reached, if we
2839 * find one, then it will be the last page of the request to
2840 * 'cluster_io'
2841 */
2842 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2843 if (upl_valid_page(pl, last_pg))
2844 break;
2845 }
2846
2847 if (start_pg < last_pg) {
2848 /*
2849 * we found a range of 'invalid' pages that must be filled...
2850 * if the last page in this range is the last page of the file,
2851 * we may have to clip the size of it to keep from reading past
2852 * the end of the last physical block associated with the file
2853 */
2854 upl_offset = start_pg * PAGE_SIZE;
2855 io_size = (last_pg - start_pg) * PAGE_SIZE;
2856
2857 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2858 io_size = filesize - (upl_f_offset + upl_offset);
2859 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2860 }
2861 /*
2862 * issue an asynchronous read to cluster_io
2863 */
2864 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
2865 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2866 }
2867 if (start_pg) {
2868 /*
2869 * start_pg of non-zero indicates we found some already valid pages
2870 * at the beginning of the upl.... we need to release these without
2871 * modifying their state
2872 */
2873 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE,
2874 UPL_ABORT_FREE_ON_EMPTY);
2875
2876 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 62)) | DBG_FUNC_NONE,
2877 upl, 0, start_pg * PAGE_SIZE, 0, 0);
2878 }
2879 if (last_pg < pages_in_upl) {
2880 /*
2881 * the set of pages that we issued an I/O for did not extend all the
2882 * way to the end of the upl... so just release them without modifying
2883 * their state
2884 */
2885 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
2886 UPL_ABORT_FREE_ON_EMPTY);
2887
2888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 63)) | DBG_FUNC_NONE,
2889 upl, last_pg * PAGE_SIZE,
2890 (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
2891 }
2892 io_size = (last_pg * PAGE_SIZE) - start_offset;
2893
2894 if (io_size > resid)
2895 io_size = resid;
2896 f_offset += io_size;
2897 resid -= io_size;
2898 }
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2900 (int)f_offset, resid, retval, 0, 0);
2901
2902 return(retval);
2903 }
2904
2905
2906 cluster_push(vp)
2907 struct vnode *vp;
2908 {
2909 upl_page_info_t *pl;
2910 upl_t upl;
2911 vm_offset_t upl_offset;
2912 int upl_size;
2913 off_t upl_f_offset;
2914 int pages_in_upl;
2915 int start_pg;
2916 int last_pg;
2917 int io_size;
2918 int io_flags;
2919 int size;
2920 kern_return_t kret;
2921
2922
2923 if (!UBCINFOEXISTS(vp))
2924 return(0);
2925
2926 if (vp->v_clen == 0 || (pages_in_upl = vp->v_lastw - vp->v_cstart) == 0)
2927 return (0);
2928 upl_size = pages_in_upl * PAGE_SIZE;
2929 upl_f_offset = ((off_t)vp->v_cstart) * PAGE_SIZE_64;
2930 size = vp->v_ciosiz;
2931 vp->v_clen = 0;
2932
2933 if (size > upl_size || (upl_size - size) > PAGE_SIZE)
2934 panic("cluster_push: v_ciosiz doesn't match size of cluster\n");
2935
2936 kret = ubc_create_upl(vp,
2937 upl_f_offset,
2938 upl_size,
2939 &upl,
2940 &pl,
2941 UPL_FLAGS_NONE);
2942 if (kret != KERN_SUCCESS)
2943 panic("cluster_push: failed to get pagelist");
2944
2945 last_pg = 0;
2946
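/*
 * walk the upl alternating between runs of pages we don't need to
 * write (not valid or not dirty... just aborted back) and runs of
 * dirty pages, each of which is handed to cluster_io as one
 * asynchronous write, throttled against v_numoutput
 */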
2947 while (size) {
2948
2949 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2950 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
2951 break;
2952 }
2953 if (start_pg > last_pg) {
2954 io_size = (start_pg - last_pg) * PAGE_SIZE;
2955
2956 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
2957 UPL_ABORT_FREE_ON_EMPTY);
2958
2959 if (io_size < size)
2960 size -= io_size;
2961 else
2962 break;
2963 }
2964 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2965 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
2966 break;
2967 }
2968 upl_offset = start_pg * PAGE_SIZE;
2969
2970 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
2971
2972 if (vp->v_flag & VNOCACHE_DATA)
2973 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
2974 else
2975 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2976
2977 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2978 vp->v_flag |= VTHROTTLED;
2979 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
2980 }
2981 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (struct buf *)0);
2982
2983 size -= io_size;
2984 }
2985 return(1);
2986 }