bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * Redistribution and use in source and binary forms, with or without
  28  * modification, are permitted provided that the following conditions
  29  * are met:
  30  * 1. Redistributions of source code must retain the above copyright
  31  *    notice, this list of conditions and the following disclaimer.
  32  * 2. Redistributions in binary form must reproduce the above copyright
  33  *    notice, this list of conditions and the following disclaimer in the
  34  *    documentation and/or other materials provided with the distribution.
  35  * 3. All advertising materials mentioning features or use of this software
  36  *    must display the following acknowledgement:
  37  *      This product includes software developed by the University of
  38  *      California, Berkeley and its contributors.
  39  * 4. Neither the name of the University nor the names of its contributors
  40  *    may be used to endorse or promote products derived from this software
  41  *    without specific prior written permission.
  42  *
  43  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  46  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53  * SUCH DAMAGE.
  54  *
  55  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  56  */
  57
  58 #include <sys/param.h>
  59 #include <sys/proc.h>
  60 #include <sys/buf.h>
  61 #include <sys/vnode.h>
  62 #include <sys/mount.h>
  63 #include <sys/trace.h>
  64 #include <sys/malloc.h>
  65 #include <sys/resourcevar.h>
  66 #include <libkern/libkern.h>
  67
  68 #include <sys/ubc.h>
  69 #include <vm/vm_pageout.h>
  70
  71 #include <sys/kdebug.h>
  72
  73 #define CL_READ      0x01
  74 #define CL_ASYNC     0x02
  75 #define CL_COMMIT    0x04
  76 #define CL_NOMAP     0x08
  77 #define CL_PAGEOUT   0x10
  78 #define CL_AGE       0x20
  79 #define CL_DUMP      0x40
  80 #define CL_NOZERO    0x80
  81 #define CL_PAGEIN    0x100
  82 #define CL_DEV_MEMORY 0x200
  83
  84 /*
  85  * throttle the number of async writes that
  86  * can be outstanding on a single vnode
  87  * before we issue a synchronous write
  88  */
  89 #define ASYNC_THROTTLE  6
  90
  91 static int
  92 cluster_iodone(bp)
  93         struct buf *bp;
  94 {
  95         int         b_flags;
  96         int         error;
  97         int         total_size;
  98         int         total_resid;
  99         int         upl_offset;
 100         upl_t       upl;
 101         struct buf *cbp;
 102         struct buf *cbp_head;
 103         struct buf *cbp_next;
 104         struct buf *real_bp;
 105         struct vnode *vp;
 106         int         commit_size;
 107         int         pg_offset;
 108
 109
 110         cbp_head = (struct buf *)(bp->b_trans_head);
 111
 112         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 113                      cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 114
 115         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 116                 /*
 117                  * all I/O requests that are part of this transaction
 118                  * have to complete before we can process it
 119                  */
 120                 if ( !(cbp->b_flags & B_DONE)) {
 121
 122                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 123                                      cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
 124
 125                         return 0;
 126                 }
 127         }
 128         error       = 0;
 129         total_size  = 0;
 130         total_resid = 0;
 131
 132         cbp        = cbp_head;
 133         upl_offset = cbp->b_uploffset;
 134         upl        = cbp->b_pagelist;
 135         b_flags    = cbp->b_flags;
 136         real_bp    = cbp->b_real_bp;
 137         vp         = cbp->b_vp;
 138
 139         while (cbp) {
 140                 if (cbp->b_vectorcount > 1)
 141                         _FREE(cbp->b_vectorlist, M_SEGMENT);
 142
 143                 if ((cbp->b_flags & B_ERROR) && error == 0)
 144                         error = cbp->b_error;
 145
 146                 total_resid += cbp->b_resid;
 147                 total_size  += cbp->b_bcount;
 148
 149                 cbp_next = cbp->b_trans_next;
 150
 151                 free_io_buf(cbp);
 152
 153                 cbp = cbp_next;
 154         }
 155         if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
 156                 vp->v_flag &= ~VTHROTTLED;
 157                 wakeup((caddr_t)&vp->v_numoutput);
 158         }
 159         if ((b_flags & B_NEED_IODONE) && real_bp) {
 160                 if (error) {
 161                         real_bp->b_flags |= B_ERROR;
 162                         real_bp->b_error = error;
 163                 }
 164                 real_bp->b_resid = total_resid;
 165
 166                 biodone(real_bp);
 167         }
 168         if (error == 0 && total_resid)
 169                 error = EIO;
 170
 171         if (b_flags & B_COMMIT_UPL) {
 172                 pg_offset   = upl_offset & PAGE_MASK;
 173                 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 174
 175                 if (error || (b_flags & B_NOCACHE)) {
 176                         int upl_abort_code;
 177
 178                         if (b_flags & B_PAGEOUT)
 179                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 180                         else if (b_flags & B_PGIN)
 181                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 182                         else
 183                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 184
 185                         ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
 186                                         upl_abort_code);
 187
 188                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 189                                      upl, upl_offset - pg_offset, commit_size,
 190                                      0x80000000|upl_abort_code, 0);
 191
 192                 } else {
 193                         int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 194
 195                         if ( !(b_flags & B_PAGEOUT))
 196                                 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
 197                         if (b_flags & B_AGE)
 198                                 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 199
 200                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
 201                                         upl_commit_flags);
 202
 203                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 204                                      upl, upl_offset - pg_offset, commit_size,
 205                                      upl_commit_flags, 0);
 206                 }
 207         } else
 208                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 209                              upl, upl_offset, 0, error, 0);
 210
 211         return (error);
 212 }
 213
 214
 215 static void
 216 cluster_zero(upl, upl_offset, size, flags, bp)
 217         upl_t         upl;
 218         vm_offset_t   upl_offset;
 219         int           size;
 220         int           flags;
 221         struct buf   *bp;
 222 {
 223         vm_offset_t   io_addr = 0;
 224         kern_return_t kret;
 225
 226         if ( !(flags & CL_NOMAP)) {
 227                 kret = ubc_upl_map(upl, &io_addr);
 228
 229                 if (kret != KERN_SUCCESS)
 230                         panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
 231                 if (io_addr == 0)
 232                         panic("cluster_zero: ubc_upl_map() mapped 0");
 233         } else
 234                 io_addr = (vm_offset_t)bp->b_data;
 235         bzero((caddr_t)(io_addr + upl_offset), size);
 236
 237         if ( !(flags & CL_NOMAP)) {
 238                 kret = ubc_upl_unmap(upl);
 239
 240                 if (kret != KERN_SUCCESS)
 241                         panic("cluster_zero: kernel_upl_unmap failed");
 242         }
 243 }
 244
 245 static int
 246 cluster_io(vp, upl, upl_offset, f_offset, size, flags, real_bp)
 247         struct vnode *vp;
 248         upl_t         upl;
 249         vm_offset_t   upl_offset;
 250         off_t         f_offset;
 251         int           size;
 252         int           flags;
 253         struct buf   *real_bp;
 254 {
 255         struct buf   *cbp;
 256         struct iovec *iovp;
 257         int           io_flags;
 258         int           error = 0;
 259         int           retval = 0;
 260         struct buf   *cbp_head = 0;
 261         struct buf   *cbp_tail = 0;
 262         upl_page_info_t *pl;
 263         int pg_count;
 264         int pg_offset;
 265         int max_iosize;
 266         int max_vectors;
 267         int priv;
 268
 269         if (flags & CL_READ) {
 270                 io_flags = (B_VECTORLIST | B_READ);
 271
 272                 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
 273         } else {
 274                 io_flags = (B_VECTORLIST | B_WRITEINPROG);
 275
 276                 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
 277         }
 278         pl = ubc_upl_pageinfo(upl);
 279
 280         if (flags & CL_ASYNC)
 281                 io_flags |= (B_CALL | B_ASYNC);
 282         if (flags & CL_AGE)
 283                 io_flags |= B_AGE;
 284         if (flags & CL_DUMP)
 285                 io_flags |= B_NOCACHE;
 286         if (flags & CL_PAGEIN)
 287                 io_flags |= B_PGIN;
 288
 289
 290         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
 291                      (int)f_offset, size, upl_offset, flags, 0);
 292
 293         if ((flags & CL_READ) && ((upl_offset + size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 294                 /*
 295                  * then we are going to end up
 296                  * with a page that we can't complete (the file size wasn't a multiple
 297                  * of PAGE_SIZE and we're trying to read to the end of the file
 298                  * so we'll go ahead and zero out the portion of the page we can't
 299                  * read in from the file
 300                  */
 301                 cluster_zero(upl, upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK), flags, real_bp);
 302
 303                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
 304                              upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK),
 305                              flags, real_bp, 0);
 306         }
 307         while (size) {
 308                 size_t io_size;
 309                 int vsize;
 310                 int i;
 311                 int pl_index;
 312                 int pg_resid;
 313                 int num_contig;
 314                 daddr_t lblkno;
 315                 daddr_t blkno;
 316
 317                 if (size > max_iosize)
 318                         io_size = max_iosize;
 319                 else
 320                         io_size = size;
 321
 322                 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
 323                         if (error == EOPNOTSUPP)
 324                                 panic("VOP_CMAP Unimplemented");
 325                         break;
 326                 }
 327
 328                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 329                              (int)f_offset, (int)blkno, io_size, 0, 0);
 330
 331                 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
 332                         if (flags & CL_PAGEOUT) {
 333                                 error = EINVAL;
 334                                 break;
 335                         };
 336
 337                         /* Try paging out the page individually before
 338                            giving up entirely and dumping it (it could
 339                            be mapped in a "hole" and require allocation
 340                            before the I/O:
 341                          */
 342                          ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
 343                          if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
 344                                 error = EINVAL;
 345                                 break;
 346                          };
 347
 348                         upl_offset += PAGE_SIZE_64;
 349                         f_offset   += PAGE_SIZE_64;
 350                         size       -= PAGE_SIZE_64;
 351                         continue;
 352                 }
 353                 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
 354                 /*
 355                  * we have now figured out how much I/O we can do - this is in 'io_size'
 356                  * pl_index represents the first page in the 'upl' that the I/O will occur for
 357                  * pg_offset is the starting point in the first page for the I/O
 358                  * pg_count is the number of full and partial pages that 'io_size' encompasses
 359                  */
 360                 pl_index  = upl_offset / PAGE_SIZE;
 361                 pg_offset = upl_offset & PAGE_MASK;
 362                 pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
 363
 364                 if (flags & CL_DEV_MEMORY) {
 365                         /*
 366                          * currently, can't deal with reading 'holes' in file
 367                          */
 368                         if ((long)blkno == -1) {
 369                                 error = EINVAL;
 370                                 break;
 371                         }
 372                         /*
 373                          * treat physical requests as one 'giant' page
 374                          */
 375                         pg_count = 1;
 376                 }
 377                 if ((flags & CL_READ) && (long)blkno == -1) {
 378                         /*
 379                          * if we're reading and blkno == -1, then we've got a
 380                          * 'hole' in the file that we need to deal with by zeroing
 381                          * out the affected area in the upl
 382                          */
 383                         cluster_zero(upl, upl_offset, io_size, flags, real_bp);
 384
 385                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
 386                                      upl_offset, io_size, flags, real_bp, 0);
 387
 388                         pg_count = (io_size - pg_offset) / PAGE_SIZE;
 389
 390                         if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
 391                                 pg_count++;
 392
 393                         if (pg_count) {
 394                                 if (pg_offset)
 395                                         pg_resid = PAGE_SIZE - pg_offset;
 396                                 else
 397                                         pg_resid = 0;
 398                                 if (flags & CL_COMMIT)
 399                                         ubc_upl_commit_range(upl,
 400                                                         upl_offset + pg_resid,
 401                                                         pg_count * PAGE_SIZE,
 402                                                         UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
 403                         }
 404                         upl_offset += io_size;
 405                         f_offset   += io_size;
 406                         size       -= io_size;
 407
 408                         if (cbp_head && pg_count)
 409                                 goto start_io;
 410                         continue;
 411                 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
 412                         real_bp->b_blkno = blkno;
 413                 }
 414
 415                 if (pg_count > 1) {
 416                         if (pg_count > max_vectors) {
 417                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
 418
 419                                 if (io_size < 0) {
 420                                         io_size = PAGE_SIZE - pg_offset;
 421                                         pg_count = 1;
 422                                 } else
 423                                         pg_count = max_vectors;
 424                         }
 425                         /*
 426                          * we need to allocate space for the vector list
 427                          */
 428                         if (pg_count > 1) {
 429                                 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
 430                                                                M_SEGMENT, M_NOWAIT);
 431
 432                                 if (iovp == (struct iovec *) 0) {
 433                                         /*
 434                                          * if the allocation fails, then throttle down to a single page
 435                                          */
 436                                         io_size = PAGE_SIZE - pg_offset;
 437                                         pg_count = 1;
 438                                 }
 439                         }
 440                 }
 441
 442                 /* Throttle the speculative IO */
 443                 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
 444                         priv = 0;
 445                 else
 446                         priv = 1;
 447
 448                 cbp = alloc_io_buf(vp, priv);
 449
 450                 if (pg_count == 1)
 451                         /*
 452                          * we use the io vector that's reserved in the buffer header
 453                          * this insures we can always issue an I/O even in a low memory
 454                          * condition that prevents the _MALLOC from succeeding... this
 455                          * is necessary to prevent deadlocks with the pager
 456                          */
 457                         iovp = (struct iovec *)(&cbp->b_vects[0]);
 458
 459                 cbp->b_vectorlist  = (void *)iovp;
 460                 cbp->b_vectorcount = pg_count;
 461
 462                 if (flags & CL_DEV_MEMORY) {
 463
 464                         iovp->iov_len  = io_size;
 465                         iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
 466
 467                         if (iovp->iov_base == (caddr_t) 0) {
 468                                 free_io_buf(cbp);
 469                                 error = EINVAL;
 470                         } else
 471                                 iovp->iov_base += upl_offset;
 472                 } else {
 473
 474                   for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
 475                         int     psize;
 476
 477                         psize = PAGE_SIZE - pg_offset;
 478
 479                         if (psize > vsize)
 480                                 psize = vsize;
 481
 482                         iovp->iov_len  = psize;
 483                         iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
 484
 485                         if (iovp->iov_base == (caddr_t) 0) {
 486                                 if (pg_count > 1)
 487                                         _FREE(cbp->b_vectorlist, M_SEGMENT);
 488                                 free_io_buf(cbp);
 489
 490                                 error = EINVAL;
 491                                 break;
 492                         }
 493                         iovp->iov_base += pg_offset;
 494                         pg_offset = 0;
 495
 496                         if (flags & CL_PAGEOUT) {
 497                                 int         s;
 498                                 struct buf *bp;
 499
 500                                 s = splbio();
 501                                 if (bp = incore(vp, lblkno + i)) {
 502                                         if (!ISSET(bp->b_flags, B_BUSY)) {
 503                                                 bremfree(bp);
 504                                                 SET(bp->b_flags, (B_BUSY | B_INVAL));
 505                                                 splx(s);
 506                                                 brelse(bp);
 507                                         } else
 508                                                 panic("BUSY bp found in cluster_io");
 509                                 }
 510                                 splx(s);
 511                         }
 512                         vsize -= psize;
 513                     }
 514                 }
 515                 if (error)
 516                         break;
 517
 518                 if (flags & CL_ASYNC)
 519                         cbp->b_iodone = (void *)cluster_iodone;
 520                 cbp->b_flags |= io_flags;
 521
 522                 cbp->b_lblkno = lblkno;
 523                 cbp->b_blkno  = blkno;
 524                 cbp->b_bcount = io_size;
 525                 cbp->b_pagelist  = upl;
 526                 cbp->b_uploffset = upl_offset;
 527                 cbp->b_trans_next = (struct buf *)0;
 528
 529                 if (flags & CL_READ)
 530                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
 531                                      cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
 532                 else
 533                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
 534                                      cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
 535
 536                 if (cbp_head) {
 537                         cbp_tail->b_trans_next = cbp;
 538                         cbp_tail = cbp;
 539                 } else {
 540                         cbp_head = cbp;
 541                         cbp_tail = cbp;
 542                 }
 543                 (struct buf *)(cbp->b_trans_head) = cbp_head;
 544
 545                 upl_offset += io_size;
 546                 f_offset   += io_size;
 547                 size       -= io_size;
 548
 549                 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY)) || size == 0) {
 550                         /*
 551                          * if we have no more I/O to issue or
 552                          * the current I/O we've prepared fully
 553                          * completes the last page in this request
 554                          * or it's been completed via a zero-fill
 555                          * due to a 'hole' in the file
 556                          * then go ahead and issue the I/O
 557                          */
 558 start_io:
 559                         if (flags & CL_COMMIT)
 560                                 cbp_head->b_flags |= B_COMMIT_UPL;
 561                         if (flags & CL_PAGEOUT)
 562                                 cbp_head->b_flags |= B_PAGEOUT;
 563                         if (flags & CL_PAGEIN)
 564                                 cbp_head->b_flags |= B_PGIN;
 565
 566                         if (real_bp) {
 567                                 cbp_head->b_flags |= B_NEED_IODONE;
 568                                 cbp_head->b_real_bp = real_bp;
 569                         }
 570
 571                         for (cbp = cbp_head; cbp;) {
 572                                 struct buf * cbp_next;
 573
 574                                 if (io_flags & B_WRITEINPROG)
 575                                         cbp->b_vp->v_numoutput++;
 576
 577                                 cbp_next = cbp->b_trans_next;
 578
 579                                 (void) VOP_STRATEGY(cbp);
 580                                 cbp = cbp_next;
 581                         }
 582                         if ( !(flags & CL_ASYNC)) {
 583                                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 584                                         biowait(cbp);
 585
 586                                 if (error = cluster_iodone(cbp_head)) {
 587                                         retval = error;
 588                                         error  = 0;
 589                                 }
 590                         }
 591                         cbp_head = (struct buf *)0;
 592                         cbp_tail = (struct buf *)0;
 593                 }
 594         }
 595         if (error) {
 596                 int abort_size;
 597
 598                 for (cbp = cbp_head; cbp;) {
 599                         struct buf * cbp_next;
 600
 601                         if (cbp->b_vectorcount > 1)
 602                                 _FREE(cbp->b_vectorlist, M_SEGMENT);
 603                         upl_offset -= cbp->b_bcount;
 604                         size       += cbp->b_bcount;
 605
 606                         cbp_next = cbp->b_trans_next;
 607                         free_io_buf(cbp);
 608                         cbp = cbp_next;
 609                 }
 610                 pg_offset  = upl_offset & PAGE_MASK;
 611                 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
 612
 613                 if (flags & CL_COMMIT) {
 614                         int upl_abort_code;
 615
 616                         if (flags & CL_PAGEOUT)
 617                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 618                         else if (flags & CL_PAGEIN)
 619                             upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 620                         else
 621                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 622
 623                         ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
 624                                                 upl_abort_code);
 625
 626                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
 627                                      upl, upl_offset - pg_offset, abort_size, error, 0);
 628                 }
 629                 if (real_bp) {
 630                         real_bp->b_flags |= B_ERROR;
 631                         real_bp->b_error  = error;
 632
 633                         biodone(real_bp);
 634                 }
 635                 if (retval == 0)
 636                         retval = error;
 637         }
 638         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
 639                      (int)f_offset, size, upl_offset, retval, 0);
 640
 641         return (retval);
 642 }
 643
 644
 645 static int
 646 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
 647         struct vnode *vp;
 648         off_t         f_offset;
 649         u_int         size;
 650         off_t         filesize;
 651         int           devblocksize;
 652 {
 653         upl_t         upl;
 654         upl_page_info_t *pl;
 655         int           pages_in_upl;
 656         int           start_pg;
 657         int           last_pg;
 658         int           last_valid;
 659         int           io_size;
 660
 661
 662         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
 663                      (int)f_offset, size, (int)filesize, 0, 0);
 664
 665         if (f_offset >= filesize) {
 666                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
 667                              (int)f_offset, 0, 0, 0, 0);
 668                 return(0);
 669         }
 670         if (ubc_page_op(vp, f_offset, 0, 0, 0) == KERN_SUCCESS) {
 671                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
 672                              (int)f_offset, 0, 0, 0, 0);
 673                 return(0);
 674         }
 675         if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
 676                 size = MAX_UPL_TRANSFER * PAGE_SIZE;
 677         else
 678                 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
 679
 680         if ((off_t)size > (filesize - f_offset))
 681                 size = ((filesize - f_offset) + (devblocksize - 1)) & ~(devblocksize - 1);
 682
 683         pages_in_upl = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
 684
 685         ubc_create_upl(vp,
 686                         f_offset,
 687                         pages_in_upl * PAGE_SIZE,
 688                                 &upl,
 689                                 &pl,
 690                                 UPL_FLAGS_NONE);
 691
 692         if (upl == (upl_t) 0)
 693                 return(0);
 694
 695         /*
 696          * scan from the beginning of the upl looking for the first
 697          * non-valid page.... this will become the first page in
 698          * the request we're going to make to 'cluster_io'... if all
 699          * of the pages are valid, we won't call through to 'cluster_io'
 700          */
 701         for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
 702                 if (!upl_valid_page(pl, start_pg))
 703                         break;
 704         }
 705
 706         /*
 707          * scan from the starting invalid page looking for a valid
 708          * page before the end of the upl is reached, if we
 709          * find one, then it will be the last page of the request to
 710          * 'cluster_io'
 711          */
 712         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
 713                 if (upl_valid_page(pl, last_pg))
 714                         break;
 715         }
 716
 717         /*
 718          * if we find any more free valid pages at the tail of the upl
 719          * than update maxra accordingly....
 720          */
 721         for (last_valid = last_pg; last_valid < pages_in_upl; last_valid++) {
 722                 if (!upl_valid_page(pl, last_valid))
 723                         break;
 724         }
 725         if (start_pg < last_pg) {
 726                 vm_offset_t   upl_offset;
 727
 728                 /*
 729                  * we found a range of 'invalid' pages that must be filled
 730                  * 'size' has already been clipped to the LEOF
 731                  * make sure it's at least a multiple of the device block size
 732                  */
 733                 upl_offset = start_pg * PAGE_SIZE;
 734                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
 735
 736                 if ((upl_offset + io_size) > size) {
 737                         io_size = size - upl_offset;
 738
 739                         KERNEL_DEBUG(0xd001000, upl_offset, size, io_size, 0, 0);
 740                 }
 741                 cluster_io(vp, upl, upl_offset, f_offset + upl_offset, io_size,
 742                            CL_READ | CL_COMMIT | CL_ASYNC | CL_AGE, (struct buf *)0);
 743         }
 744         if (start_pg) {
 745                 /*
 746                  * start_pg of non-zero indicates we found some already valid pages
 747                  * at the beginning of the upl.... we need to release these without
 748                  * modifying there state
 749                  */
 750                 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
 751
 752                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
 753                              upl, 0, start_pg * PAGE_SIZE, 0, 0);
 754         }
 755         if (last_pg < pages_in_upl) {
 756                 /*
 757                  * the set of pages that we issued an I/O for did not extend all the
 758                  * way to the end of the upl... so just release them without modifying
 759                  * there state
 760                  */
 761                 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
 762                                 UPL_ABORT_FREE_ON_EMPTY);
 763
 764                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
 765                              upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
 766         }
 767
 768         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
 769                      (int)f_offset + (last_valid * PAGE_SIZE), 0, 0, 0, 0);
 770
 771         return(last_valid);
 772 }
 773
 774
 775
 776 static void
 777 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
 778         struct vnode *vp;
 779         daddr_t       b_lblkno;
 780         daddr_t       e_lblkno;
 781         off_t         filesize;
 782         int           devblocksize;
 783 {
 784         daddr_t       r_lblkno;
 785         off_t         f_offset;
 786         int           size_of_prefetch;
 787         int           max_iosize;
 788         int           max_pages;
 789
 790         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
 791                      b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
 792
 793         if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
 794                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 795                              vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
 796                 return;
 797         }
 798
 799         if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && b_lblkno != (vp->v_maxra + 1))) {
 800                 vp->v_ralen = 0;
 801                 vp->v_maxra = 0;
 802
 803                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 804                              vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
 805
 806                 return;
 807         }
 808         vfs_io_attributes(vp, B_READ, &max_iosize, &max_pages);
 809
 810         if ((max_iosize / PAGE_SIZE) < max_pages)
 811                 max_pages = max_iosize / PAGE_SIZE;
 812         if (max_pages > MAX_UPL_TRANSFER)
 813                 max_pages = MAX_UPL_TRANSFER;
 814
 815         vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
 816
 817         if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
 818                 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
 819
 820         if (e_lblkno < vp->v_maxra) {
 821                 if ((vp->v_maxra - e_lblkno) > (max_pages / 4)) {
 822
 823                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 824                                      vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
 825                         return;
 826                 }
 827         }
 828         r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
 829         f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
 830
 831         size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
 832
 833         if (size_of_prefetch)
 834                 vp->v_maxra = r_lblkno + (size_of_prefetch - 1);
 835
 836         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
 837                      vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
 838 }
 839
 840
 841 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
 842         struct vnode *vp;
 843         upl_t         upl;
 844         vm_offset_t   upl_offset;
 845         off_t         f_offset;
 846         int           size;
 847         off_t         filesize;
 848         int           devblocksize;
 849         int           flags;
 850 {
 851         int           io_size;
 852         int           pg_size;
 853         off_t         max_size;
 854         int local_flags = CL_PAGEOUT;
 855
 856         if ((flags & UPL_IOSYNC) == 0)
 857                 local_flags |= CL_ASYNC;
 858         if ((flags & UPL_NOCOMMIT) == 0)
 859                 local_flags |= CL_COMMIT;
 860
 861         if (upl == (upl_t) 0)
 862                 panic("cluster_pageout: can't handle NULL upl yet\n");
 863
 864
 865         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
 866                      (int)f_offset, size, (int)filesize, local_flags, 0);
 867
 868         /*
 869          * If they didn't specify any I/O, then we are done...
 870          * we can't issue an abort because we don't know how
 871          * big the upl really is
 872          */
 873         if (size <= 0)
 874                 return (EINVAL);
 875
 876         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
 877                 if (local_flags & CL_COMMIT)
 878                         ubc_upl_abort_range(upl, upl_offset, size,
 879                                                 UPL_ABORT_FREE_ON_EMPTY);
 880                 return (EROFS);
 881         }
 882         /*
 883          * can't page-in from a negative offset
 884          * or if we're starting beyond the EOF
 885          * or if the file offset isn't page aligned
 886          * or the size requested isn't a multiple of PAGE_SIZE
 887          */
 888         if (f_offset < 0 || f_offset >= filesize ||
 889            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
 890                 if (local_flags & CL_COMMIT)
 891                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
 892                 return (EINVAL);
 893         }
 894         max_size = filesize - f_offset;
 895
 896         if (size < max_size)
 897                 io_size = size;
 898         else
 899                 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
 900
 901         pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 902
 903         if (size > pg_size) {
 904                 if (local_flags & CL_COMMIT)
 905                         ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
 906                                         UPL_ABORT_FREE_ON_EMPTY);
 907         }
 908         while (vp->v_numoutput >= ASYNC_THROTTLE) {
 909                 vp->v_flag |= VTHROTTLED;
 910                 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
 911         }
 912
 913         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
 914                            local_flags, (struct buf *)0));
 915 }
 916
 917
 918 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
 919         struct vnode *vp;
 920         upl_t         upl;
 921         vm_offset_t   upl_offset;
 922         off_t         f_offset;
 923         int           size;
 924         off_t         filesize;
 925         int           devblocksize;
 926         int           flags;
 927 {
 928         u_int         io_size;
 929         int           pg_size;
 930         off_t         max_size;
 931         int           retval;
 932         int           local_flags = 0;
 933
 934
 935         /*
 936          * If they didn't ask for any data, then we are done...
 937          * we can't issue an abort because we don't know how
 938          * big the upl really is
 939          */
 940         if (size <= 0)
 941                 return (EINVAL);
 942
 943         if ((flags & UPL_NOCOMMIT) == 0)
 944                 local_flags = CL_COMMIT;
 945
 946         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
 947                      (int)f_offset, size, (int)filesize, local_flags, 0);
 948
 949         /*
 950          * can't page-in from a negative offset
 951          * or if we're starting beyond the EOF
 952          * or if the file offset isn't page aligned
 953          * or the size requested isn't a multiple of PAGE_SIZE
 954          */
 955         if (f_offset < 0 || f_offset >= filesize ||
 956            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
 957                 if (local_flags & CL_COMMIT)
 958                         ubc_upl_abort_range(upl, upl_offset, size,
 959                                         UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
 960                 return (EINVAL);
 961         }
 962         max_size = filesize - f_offset;
 963
 964         if (size < max_size)
 965                 io_size = size;
 966         else
 967                 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
 968
 969         pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 970
 971         if (upl == (upl_t) 0) {
 972                 ubc_create_upl( vp,
 973                                                 f_offset,
 974                                                 pg_size,
 975                                                 &upl,
 976                                                 NULL,
 977                                                 UPL_FLAGS_NONE);
 978
 979                 if (upl == (upl_t) 0)
 980                         return (EINVAL);
 981
 982                 upl_offset = (vm_offset_t)0;
 983                 size = pg_size;
 984         }
 985         if (size > pg_size) {
 986                 if (local_flags & CL_COMMIT)
 987                         ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
 988                                         UPL_ABORT_FREE_ON_EMPTY);
 989         }
 990
 991         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
 992                             local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
 993
 994         if (retval == 0) {
 995                 int b_lblkno;
 996                 int e_lblkno;
 997
 998                 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
 999                 e_lblkno = (int)
1000                         ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1001
1002                 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1003                         /*
1004                          * we haven't read the last page in of the file yet
1005                          * so let's try to read ahead if we're in
1006                          * a sequential access pattern
1007                          */
1008                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1009                 }
1010                 vp->v_lastr = e_lblkno;
1011         }
1012         return (retval);
1013 }
1014
1015
1016 cluster_bp(bp)
1017         struct buf *bp;
1018 {
1019         off_t  f_offset;
1020         int    flags;
1021
1022         if (bp->b_pagelist == (upl_t) 0)
1023                 panic("cluster_bp: can't handle NULL upl yet\n");
1024         if (bp->b_flags & B_READ)
1025                 flags = CL_ASYNC | CL_NOMAP | CL_READ;
1026         else
1027                 flags = CL_ASYNC | CL_NOMAP;
1028
1029         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1030
1031         return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, flags, bp));
1032 }
1033
1034
1035 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1036         struct vnode *vp;
1037         struct uio   *uio;
1038         off_t         oldEOF;
1039         off_t         newEOF;
1040         off_t         headOff;
1041         off_t         tailOff;
1042         int           devblocksize;
1043         int           flags;
1044 {
1045         int           prev_resid;
1046         int           clip_size;
1047         off_t         max_io_size;
1048         struct iovec  *iov;
1049         vm_offset_t   upl_offset;
1050         int           upl_size;
1051         int           pages_in_pl;
1052         upl_page_info_t *pl;
1053         int           upl_flags;
1054         upl_t         upl;
1055         int           retval = 0;
1056
1057
1058         if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1059           {
1060             retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1061             return(retval);
1062           }
1063
1064         while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1065           {
1066             /* we know we have a resid, so this is safe */
1067             iov = uio->uio_iov;
1068             while (iov->iov_len == 0) {
1069               uio->uio_iov++;
1070               uio->uio_iovcnt--;
1071               iov = uio->uio_iov;
1072             }
1073
1074             /*
1075              * We check every vector target and if it is physically
1076              * contiguous space, we skip the sanity checks.
1077              */
1078
1079             upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1080             upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1081             pages_in_pl = 0;
1082             upl_flags = UPL_QUERY_OBJECT_TYPE;
1083             if ((vm_map_get_upl(current_map(),
1084                                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1085                                &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1086               {
1087                 /*
1088                  * the user app must have passed in an invalid address
1089                  */
1090                 return (EFAULT);
1091               }
1092
1093             if (upl_flags & UPL_PHYS_CONTIG)
1094               {
1095                 /*
1096                  * since the interface to the IOKit below us uses physical block #'s and
1097                  * block counts to specify the I/O, we can't handle anything that isn't
1098                  * devblocksize aligned
1099                  */
1100                 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1101                     return(EINVAL);
1102
1103                 if (flags & IO_HEADZEROFILL)
1104                   {
1105                     flags &= ~IO_HEADZEROFILL;
1106
1107                     if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1108                         return(retval);
1109                   }
1110
1111                 retval = cluster_phys_write(vp, uio);
1112
1113                 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1114                   {
1115                     retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1116                     return(retval);
1117                   }
1118               }
1119             else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1120               {
1121                 /*
1122                  * We set a threshhold of 4 pages to decide if the nocopy
1123                  * write loop is worth the trouble...
1124                  * we also come here if we're trying to zero the head and/or tail
1125                  * of a partially written page, and the user source is not a physically contiguous region
1126                  */
1127                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1128                 return(retval);
1129               }
1130             else if (uio->uio_offset & PAGE_MASK_64)
1131               {
1132                 /* Bring the file offset write up to a pagesize boundary */
1133                 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1134                 if (uio->uio_resid < clip_size)
1135                   clip_size = uio->uio_resid;
1136                 /*
1137                  * Fake the resid going into the cluster_write_x call
1138                  * and restore it on the way out.
1139                  */
1140                 prev_resid = uio->uio_resid;
1141                 uio->uio_resid = clip_size;
1142                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1143                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1144               }
1145             else if ((int)iov->iov_base & PAGE_MASK_64)
1146               {
1147                 clip_size = iov->iov_len;
1148                 prev_resid = uio->uio_resid;
1149                 uio->uio_resid = clip_size;
1150                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1151                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1152               }
1153             else
1154               {
1155                 /*
1156                  * If we come in here, we know the offset into
1157                  * the file is on a pagesize boundary
1158                  */
1159
1160                 max_io_size = newEOF - uio->uio_offset;
1161                 clip_size = uio->uio_resid;
1162                 if (iov->iov_len < clip_size)
1163                   clip_size = iov->iov_len;
1164                 if (max_io_size < clip_size)
1165                   clip_size = max_io_size;
1166
1167                 if (clip_size < PAGE_SIZE)
1168                   {
1169                     /*
1170                      * Take care of tail end of write in this vector
1171                      */
1172                     prev_resid = uio->uio_resid;
1173                     uio->uio_resid = clip_size;
1174                     retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1175                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1176                   }
1177                 else
1178                   {
1179                     /* round clip_size down to a multiple of pagesize */
1180                     clip_size = clip_size & ~(PAGE_MASK);
1181                     prev_resid = uio->uio_resid;
1182                     uio->uio_resid = clip_size;
1183                     retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1184                     if ((retval == 0) && uio->uio_resid)
1185                       retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187                   }
1188               } /* end else */
1189           } /* end while */
1190         return(retval);
1191 }
1192
1193 static
1194 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1195         struct vnode *vp;
1196         struct uio   *uio;
1197         off_t         newEOF;
1198         int           devblocksize;
1199         int           flags;
1200 {
1201         upl_t            upl;
1202         upl_page_info_t  *pl;
1203         off_t            upl_f_offset;
1204         vm_offset_t      upl_offset;
1205         off_t            max_io_size;
1206         int              io_size;
1207         int              upl_size;
1208         int              upl_needed_size;
1209         int              pages_in_pl;
1210         int              upl_flags;
1211         kern_return_t    kret;
1212         struct iovec     *iov;
1213         int              i;
1214         int              force_data_sync;
1215         int              error  = 0;
1216
1217         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1218                      (int)uio->uio_offset, (int)uio->uio_resid,
1219                      (int)newEOF, devblocksize, 0);
1220
1221         /*
1222          * When we enter this routine, we know
1223          *  -- the offset into the file is on a pagesize boundary
1224          *  -- the resid is a page multiple
1225          *  -- the resid will not exceed iov_len
1226          */
1227
1228         iov = uio->uio_iov;
1229
1230         while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1231           io_size = uio->uio_resid;
1232
1233           if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1234             io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1235
1236           upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1237           upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1238
1239           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1240                        (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
1241
1242           for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1243             {
1244               pages_in_pl = 0;
1245               upl_size = upl_needed_size;
1246               upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1247
1248               kret = vm_map_get_upl(current_map(),
1249                                     (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1250                                     &upl_size,
1251                                         &upl,
1252                                         NULL,
1253                                         &pages_in_pl,
1254                                         &upl_flags,
1255                                         force_data_sync);
1256
1257               if (kret != KERN_SUCCESS)
1258                 {
1259                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1260                                0, 0, 0, kret, 0);
1261
1262                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1263                                (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1264
1265                   /* cluster_nocopy_write: failed to get pagelist */
1266                   /* do not return kret here */
1267                   return(0);
1268                 }
1269
1270               pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1271               pages_in_pl = upl_size / PAGE_SIZE;
1272
1273               for(i=0; i < pages_in_pl; i++)
1274                 {
1275                   if (!upl_valid_page(pl, i))
1276                     break;
1277                 }
1278
1279               if (i == pages_in_pl)
1280                 break;
1281
1282                 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1283                                 UPL_ABORT_FREE_ON_EMPTY);
1284             }
1285
1286           if (force_data_sync >= 3)
1287             {
1288               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1289                            i, pages_in_pl, upl_size, kret, 0);
1290
1291               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1292                            (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1293               return(0);
1294             }
1295
1296           /*
1297            * Consider the possibility that upl_size wasn't satisfied.
1298            */
1299           if (upl_size != upl_needed_size)
1300             io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1301
1302           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1303                        (int)upl_offset, upl_size, iov->iov_base, io_size, 0);
1304
1305           if (io_size == 0)
1306             {
1307               ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1308                                    UPL_ABORT_FREE_ON_EMPTY);
1309               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1310                      (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1311
1312               return(0);
1313             }
1314
1315           /*
1316            * Now look for pages already in the cache
1317            * and throw them away.
1318            */
1319
1320           upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
1321           max_io_size = io_size;
1322
1323           while (max_io_size) {
1324
1325             /*
1326              * Flag UPL_POP_DUMP says if the page is found
1327              * in the page cache it must be thrown away.
1328              */
1329             ubc_page_op(vp,
1330                         upl_f_offset,
1331                         UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1332                         0, 0);
1333             max_io_size  -= PAGE_SIZE;
1334             upl_f_offset += PAGE_SIZE;
1335           }
1336
1337           /*
1338            * issue a synchronous write to cluster_io
1339            */
1340
1341           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1342                        (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1343
1344           error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1345                              io_size, 0, (struct buf *)0);
1346
1347           if (error == 0) {
1348             /*
1349              * The cluster_io write completed successfully,
1350              * update the uio structure and commit.
1351              */
1352
1353             ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1354                                  UPL_COMMIT_FREE_ON_EMPTY);
1355
1356             iov->iov_base += io_size;
1357             iov->iov_len -= io_size;
1358             uio->uio_resid -= io_size;
1359             uio->uio_offset += io_size;
1360           }
1361           else {
1362             ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1363                                    UPL_ABORT_FREE_ON_EMPTY);
1364           }
1365
1366           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1367                        (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1368
1369         } /* end while */
1370
1371
1372         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1373                      (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1374
1375         return (error);
1376 }
1377
1378 static
1379 cluster_phys_write(vp, uio)
1380         struct vnode *vp;
1381         struct uio   *uio;
1382 {
1383         upl_t            upl;
1384         vm_offset_t      upl_offset;
1385         int              io_size;
1386         int              upl_size;
1387         int              upl_needed_size;
1388         int              pages_in_pl;
1389         int              upl_flags;
1390         kern_return_t    kret;
1391         struct iovec     *iov;
1392         int              error  = 0;
1393
1394         /*
1395          * When we enter this routine, we know
1396          *  -- the resid will not exceed iov_len
1397          *  -- the vector target address is physcially contiguous
1398          */
1399
1400         iov = uio->uio_iov;
1401         io_size = iov->iov_len;
1402         upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1403         upl_needed_size = upl_offset + io_size;
1404
1405         pages_in_pl = 0;
1406         upl_size = upl_needed_size;
1407         upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1408
1409         kret = vm_map_get_upl(current_map(),
1410                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1411                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1412
1413         if (kret != KERN_SUCCESS)
1414           {
1415             /* cluster_phys_write: failed to get pagelist */
1416               /* note: return kret here */
1417               return(EINVAL);
1418           }
1419
1420         /*
1421          * Consider the possibility that upl_size wasn't satisfied.
1422          * This is a failure in the physical memory case.
1423          */
1424         if (upl_size < upl_needed_size)
1425           {
1426             kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1427             return(EINVAL);
1428           }
1429
1430         /*
1431          * issue a synchronous write to cluster_io
1432          */
1433
1434         error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1435                            io_size, CL_DEV_MEMORY, (struct buf *)0);
1436
1437         if (error == 0) {
1438           /*
1439            * The cluster_io write completed successfully,
1440            * update the uio structure and commit.
1441            */
1442
1443           ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1444
1445           iov->iov_base += io_size;
1446           iov->iov_len -= io_size;
1447           uio->uio_resid -= io_size;
1448           uio->uio_offset += io_size;
1449         }
1450         else
1451           ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1452
1453         return (error);
1454 }
1455
1456 static
1457 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1458         struct vnode *vp;
1459         struct uio   *uio;
1460         off_t         oldEOF;
1461         off_t         newEOF;
1462         off_t         headOff;
1463         off_t         tailOff;
1464         int           devblocksize;
1465         int           flags;
1466 {
1467         upl_page_info_t *pl;
1468         upl_t            upl;
1469         vm_offset_t      upl_offset;
1470         int              upl_size;
1471         off_t            upl_f_offset;
1472         int              pages_in_upl;
1473         int              start_offset;
1474         int              xfer_resid;
1475         int              io_size;
1476         int              io_size_before_rounding;
1477         int              io_flags;
1478         vm_offset_t      io_address;
1479         int              io_offset;
1480         int              bytes_to_zero;
1481         int              bytes_to_move;
1482         kern_return_t    kret;
1483         int              retval = 0;
1484         int              uio_resid;
1485         long long        total_size;
1486         long long        zero_cnt;
1487         off_t            zero_off;
1488         long long        zero_cnt1;
1489         off_t            zero_off1;
1490         daddr_t          start_blkno;
1491         daddr_t          last_blkno;
1492
1493         if (uio) {
1494                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1495                              (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1496
1497                 uio_resid = uio->uio_resid;
1498         } else {
1499                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1500                              0, 0, (int)oldEOF, (int)newEOF, 0);
1501
1502                 uio_resid = 0;
1503         }
1504         zero_cnt  = 0;
1505         zero_cnt1 = 0;
1506
1507         if (flags & IO_HEADZEROFILL) {
1508                 /*
1509                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1510                  * so we zero fill the intervening space between the old EOF and the offset
1511                  * where the next chunk of real data begins.... ftruncate will also use this
1512                  * routine to zero fill to the new EOF when growing a file... in this case, the
1513                  * uio structure will not be provided
1514                  */
1515                 if (uio) {
1516                         if (headOff < uio->uio_offset) {
1517                                 zero_cnt = uio->uio_offset - headOff;
1518                                 zero_off = headOff;
1519                         }
1520                 } else if (headOff < newEOF) {
1521                         zero_cnt = newEOF - headOff;
1522                         zero_off = headOff;
1523                 }
1524         }
1525         if (flags & IO_TAILZEROFILL) {
1526                 if (uio) {
1527                         zero_off1 = uio->uio_offset + uio->uio_resid;
1528
1529                         if (zero_off1 < tailOff)
1530                                 zero_cnt1 = tailOff - zero_off1;
1531                 }
1532         }
1533         if (zero_cnt == 0 && uio == (struct uio *) 0)
1534           {
1535             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1536                          retval, 0, 0, 0, 0);
1537             return (0);
1538           }
1539
1540         while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1541                 /*
1542                  * for this iteration of the loop, figure out where our starting point is
1543                  */
1544                 if (zero_cnt) {
1545                         start_offset = (int)(zero_off & PAGE_MASK_64);
1546                         upl_f_offset = zero_off - start_offset;
1547                 } else if (uio_resid) {
1548                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1549                         upl_f_offset = uio->uio_offset - start_offset;
1550                 } else {
1551                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1552                         upl_f_offset = zero_off1 - start_offset;
1553                 }
1554                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1555                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1556
1557                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1558                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1559
1560                 /*
1561                  * compute the size of the upl needed to encompass
1562                  * the requested write... limit each call to cluster_io
1563                  * to the maximum UPL size... cluster_io will clip if
1564                  * this exceeds the maximum io_size for the device,
1565                  * make sure to account for
1566                  * a starting offset that's not page aligned
1567                  */
1568                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1569
1570                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1571                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1572
1573                 pages_in_upl = upl_size / PAGE_SIZE;
1574                 io_size      = upl_size - start_offset;
1575
1576                 if ((long long)io_size > total_size)
1577                         io_size = total_size;
1578
1579                 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1580                 last_blkno  = start_blkno + pages_in_upl;
1581
1582                 kret = ubc_create_upl(vp,
1583                                                         upl_f_offset,
1584                                                         upl_size,
1585                                                         &upl,
1586                                                         &pl,
1587                                                         UPL_FLAGS_NONE);
1588                 if (kret != KERN_SUCCESS)
1589                         panic("cluster_write: failed to get pagelist");
1590
1591                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1592                         upl, (int)upl_f_offset, upl_size, start_offset, 0);
1593
1594                 if (start_offset && !upl_valid_page(pl, 0)) {
1595                         int   read_size;
1596
1597                         /*
1598                          * we're starting in the middle of the first page of the upl
1599                          * and the page isn't currently valid, so we're going to have
1600                          * to read it in first... this is a synchronous operation
1601                          */
1602                         read_size = PAGE_SIZE;
1603
1604                         if ((upl_f_offset + read_size) > newEOF) {
1605                                 read_size = newEOF - upl_f_offset;
1606                                 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1607                         }
1608                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
1609                                             CL_READ, (struct buf *)0);
1610                         if (retval) {
1611                                 /*
1612                                  * we had an error during the read which causes us to abort
1613                                  * the current cluster_write request... before we do, we need
1614                                  * to release the rest of the pages in the upl without modifying
1615                                  * there state and mark the failed page in error
1616                                  */
1617                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1618                                 ubc_upl_abort(upl, 0);
1619
1620                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1621                                              upl, 0, 0, retval, 0);
1622                                 break;
1623                         }
1624                 }
1625                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1626                         /*
1627                          * the last offset we're writing to in this upl does not end on a page
1628                          * boundary... if it's not beyond the old EOF, then we'll also need to
1629                          * pre-read this page in if it isn't already valid
1630                          */
1631                         upl_offset = upl_size - PAGE_SIZE;
1632
1633                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1634                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1635                                 int   read_size;
1636
1637                                 read_size = PAGE_SIZE;
1638
1639                                 if ((upl_f_offset + upl_offset + read_size) > newEOF) {
1640                                         read_size = newEOF - (upl_f_offset + upl_offset);
1641                                         read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1642                                 }
1643                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
1644                                                     CL_READ, (struct buf *)0);
1645                                 if (retval) {
1646                                         /*
1647                                          * we had an error during the read which causes us to abort
1648                                          * the current cluster_write request... before we do, we
1649                                          * need to release the rest of the pages in the upl without
1650                                          * modifying there state and mark the failed page in error
1651                                          */
1652                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE,
1653                                                         UPL_ABORT_DUMP_PAGES);
1654                                         ubc_upl_abort(upl, 0);
1655
1656                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1657                                                      upl, 0, 0, retval, 0);
1658                                         break;
1659                                 }
1660                         }
1661                 }
1662                 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1663                         panic("cluster_write: ubc_upl_map failed\n");
1664                 xfer_resid = io_size;
1665                 io_offset = start_offset;
1666
1667                 while (zero_cnt && xfer_resid) {
1668
1669                         if (zero_cnt < (long long)xfer_resid)
1670                                 bytes_to_zero = zero_cnt;
1671                         else
1672                                 bytes_to_zero = xfer_resid;
1673
1674                         if ( !(flags & IO_NOZEROVALID)) {
1675                                 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1676
1677                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1678                                              (int)upl_f_offset + io_offset, bytes_to_zero,
1679                                              (int)zero_cnt, xfer_resid, 0);
1680                         } else {
1681                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1682
1683                                 if ( !upl_valid_page(pl, (int)(zero_off / PAGE_SIZE_64))) {
1684                                         bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1685
1686                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1687                                                      (int)upl_f_offset + io_offset, bytes_to_zero,
1688                                                      (int)zero_cnt, xfer_resid, 0);
1689                                 }
1690                         }
1691                         xfer_resid -= bytes_to_zero;
1692                         zero_cnt   -= bytes_to_zero;
1693                         zero_off   += bytes_to_zero;
1694                         io_offset  += bytes_to_zero;
1695                 }
1696                 if (xfer_resid && uio_resid) {
1697                         bytes_to_move = min(uio_resid, xfer_resid);
1698
1699                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1700                                      (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1701
1702                         retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1703
1704                         if (retval) {
1705                                 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1706                                         panic("cluster_write: kernel_upl_unmap failed\n");
1707                                 ubc_upl_abort(upl, UPL_ABORT_DUMP_PAGES);
1708
1709                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1710                                              upl, 0, 0, retval, 0);
1711                         } else {
1712                                 uio_resid  -= bytes_to_move;
1713                                 xfer_resid -= bytes_to_move;
1714                                 io_offset  += bytes_to_move;
1715                         }
1716                 }
1717                 while (xfer_resid && zero_cnt1 && retval == 0) {
1718
1719                         if (zero_cnt1 < (long long)xfer_resid)
1720                                 bytes_to_zero = zero_cnt1;
1721                         else
1722                                 bytes_to_zero = xfer_resid;
1723
1724                         if ( !(flags & IO_NOZEROVALID)) {
1725                                 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1726
1727                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1728                                              (int)upl_f_offset + io_offset,
1729                                              bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1730                         } else {
1731                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1732                                 if ( !upl_valid_page(pl, (int)(zero_off1 / PAGE_SIZE_64))) {
1733                                         bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1734
1735                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1736                                                      (int)upl_f_offset + io_offset,
1737                                                      bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1738                                 }
1739                         }
1740                         xfer_resid -= bytes_to_zero;
1741                         zero_cnt1  -= bytes_to_zero;
1742                         zero_off1  += bytes_to_zero;
1743                         io_offset  += bytes_to_zero;
1744                 }
1745
1746                 if (retval == 0) {
1747                         int must_push;
1748                         int can_delay;
1749
1750                         io_size += start_offset;
1751
1752                         if ((upl_f_offset + io_size) == newEOF && io_size < upl_size) {
1753                                 /*
1754                                  * if we're extending the file with this write
1755                                  * we'll zero fill the rest of the page so that
1756                                  * if the file gets extended again in such a way as to leave a
1757                                  * hole starting at this EOF, we'll have zero's in the correct spot
1758                                  */
1759                                 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1760
1761                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1762                                              (int)upl_f_offset + io_size,
1763                                              upl_size - io_size, 0, 0, 0);
1764                         }
1765                         if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1766                                 panic("cluster_write: kernel_upl_unmap failed\n");
1767
1768                         io_size_before_rounding = io_size;
1769
1770                         if (io_size & (devblocksize - 1))
1771                                 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1772
1773                         must_push = 0;
1774                         can_delay = 0;
1775
1776                         if (vp->v_clen) {
1777                                 int newsize;
1778
1779                                 /*
1780                                  * we have an existing cluster... see if this write will extend it nicely
1781                                  */
1782                                 if (start_blkno >= vp->v_cstart) {
1783                                         if (last_blkno <= (vp->v_cstart + vp->v_clen)) {
1784                                                 /*
1785                                                  * we have a write that fits entirely
1786                                                  * within the existing cluster limits
1787                                                  */
1788                                                 if (last_blkno >= vp->v_lastw) {
1789                                                         /*
1790                                                          * if we're extending the dirty region within the cluster
1791                                                          * we need to update the cluster info... we check for blkno
1792                                                          * equality because we may be extending the file with a
1793                                                          * partial write.... this in turn changes our idea of how
1794                                                          * much data to write out (v_ciosiz) for the last page
1795                                                          */
1796                                                         vp->v_lastw = last_blkno;
1797                                                         newsize = io_size + ((start_blkno - vp->v_cstart) * PAGE_SIZE);
1798
1799                                                         if (newsize > vp->v_ciosiz)
1800                                                                 vp->v_ciosiz = newsize;
1801                                                 }
1802                                                 can_delay = 1;
1803                                                 goto finish_io;
1804                                         }
1805                                         if (start_blkno < (vp->v_cstart + vp->v_clen)) {
1806                                                 /*
1807                                                  * we have a write that starts in the middle of the current cluster
1808                                                  * but extends beyond the cluster's limit
1809                                                  * we'll clip the current cluster if we actually
1810                                                  * overlap with the new write and then push it out
1811                                                  * and start a new cluster with the current write
1812                                                  */
1813                                                  if (vp->v_lastw > start_blkno) {
1814                                                         vp->v_lastw = start_blkno;
1815                                                         vp->v_ciosiz = (vp->v_lastw - vp->v_cstart) * PAGE_SIZE;
1816                                                  }
1817                                         }
1818                                         /*
1819                                          * we also get here for the case where the current write starts
1820                                          * beyond the limit of the existing cluster
1821                                          */
1822                                         must_push = 1;
1823                                         goto check_delay;
1824                                 }
1825                                 /*
1826                                  * the current write starts in front of the current cluster
1827                                  */
1828                                 if (last_blkno > vp->v_cstart) {
1829                                         /*
1830                                          * the current write extends into the existing cluster
1831                                          */
1832                                         if ((vp->v_lastw - start_blkno) > vp->v_clen) {
1833                                                 /*
1834                                                  * if we were to combine this write with the current cluster
1835                                                  * we would exceed the cluster size limit....
1836                                                  * clip the current cluster by moving the start position
1837                                                  * to where the current write ends, and then push it
1838                                                  */
1839                                                 vp->v_ciosiz -= (last_blkno - vp->v_cstart) * PAGE_SIZE;
1840                                                 vp->v_cstart = last_blkno;
1841
1842                                                 /*
1843                                                  * round up the io_size to the nearest page size
1844                                                  * since we've coalesced with at least 1 pre-existing
1845                                                  * page in the current cluster... this write may have ended in the
1846                                                  * middle of the page which would cause io_size to give us an
1847                                                  * inaccurate view of how much I/O we actually need to do
1848                                                  */
1849                                                 io_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1850
1851                                                 must_push = 1;
1852                                                 goto check_delay;
1853                                         }
1854                                         /*
1855                                          * we can coalesce the current write with the existing cluster
1856                                          * adjust the cluster info to reflect this
1857                                          */
1858                                         if (last_blkno > vp->v_lastw) {
1859                                                 /*
1860                                                  * the current write completey overlaps
1861                                                  * the existing cluster
1862                                                  */
1863                                                 vp->v_lastw = last_blkno;
1864                                                 vp->v_ciosiz = io_size;
1865                                         } else {
1866                                                 vp->v_ciosiz += (vp->v_cstart - start_blkno) * PAGE_SIZE;
1867
1868                                                 if (io_size > vp->v_ciosiz)
1869                                                         vp->v_ciosiz = io_size;
1870                                         }
1871                                         vp->v_cstart = start_blkno;
1872                                         can_delay = 1;
1873                                         goto finish_io;
1874                                 }
1875                                 /*
1876                                  * this I/O range is entirely in front of the current cluster
1877                                  * so we need to push the current cluster out before beginning
1878                                  * a new one
1879                                  */
1880                                 must_push = 1;
1881                         }
1882 check_delay:
1883                         if (must_push)
1884                                 cluster_push(vp);
1885
1886                         if (io_size_before_rounding < (MAX_UPL_TRANSFER * PAGE_SIZE) && !(flags & IO_SYNC)) {
1887                                 vp->v_clen = MAX_UPL_TRANSFER;
1888                                 vp->v_cstart = start_blkno;
1889                                 vp->v_lastw  = last_blkno;
1890                                 vp->v_ciosiz = io_size;
1891
1892                                 can_delay = 1;
1893                         }
1894 finish_io:
1895                         if (can_delay) {
1896                                 ubc_upl_commit_range(upl, 0, upl_size,
1897                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1898                                 continue;
1899                         }
1900                         if (flags & IO_SYNC)
1901                                 io_flags = CL_COMMIT | CL_AGE;
1902                         else
1903                                 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1904
1905                         if (vp->v_flag & VNOCACHE_DATA)
1906                                 io_flags |= CL_DUMP;
1907
1908                         while (vp->v_numoutput >= ASYNC_THROTTLE) {
1909                                 vp->v_flag |= VTHROTTLED;
1910                                 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1911                         }
1912                         retval = cluster_io(vp, upl, 0, upl_f_offset, io_size,
1913                                             io_flags, (struct buf *)0);
1914                 }
1915         }
1916         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1917                      retval, 0, 0, 0, 0);
1918
1919         return (retval);
1920 }
1921
1922 cluster_read(vp, uio, filesize, devblocksize, flags)
1923         struct vnode *vp;
1924         struct uio   *uio;
1925         off_t         filesize;
1926         int           devblocksize;
1927         int           flags;
1928 {
1929         int           prev_resid;
1930         int           clip_size;
1931         off_t         max_io_size;
1932         struct iovec  *iov;
1933         vm_offset_t   upl_offset;
1934         int           upl_size;
1935         int           pages_in_pl;
1936         upl_page_info_t *pl;
1937         int           upl_flags;
1938         upl_t         upl;
1939         int           retval = 0;
1940
1941         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1942                      (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1943
1944         /*
1945          * We set a threshhold of 4 pages to decide if the nocopy
1946          * read loop is worth the trouble...
1947          */
1948
1949         if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1950           {
1951             retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1952             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1953                          (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1954             return(retval);
1955           }
1956
1957         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
1958           {
1959             /* we know we have a resid, so this is safe */
1960             iov = uio->uio_iov;
1961             while (iov->iov_len == 0) {
1962               uio->uio_iov++;
1963               uio->uio_iovcnt--;
1964               iov = uio->uio_iov;
1965             }
1966
1967             /*
1968              * We check every vector target and if it is physically
1969              * contiguous space, we skip the sanity checks.
1970              */
1971
1972             upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1973             upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1974             pages_in_pl = 0;
1975             upl_flags = UPL_QUERY_OBJECT_TYPE;
1976             if((vm_map_get_upl(current_map(),
1977                                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1978                                &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1979               {
1980                 /*
1981                  * the user app must have passed in an invalid address
1982                  */
1983                 return (EFAULT);
1984               }
1985
1986             if (upl_flags & UPL_PHYS_CONTIG)
1987               {
1988                 retval = cluster_phys_read(vp, uio, filesize);
1989               }
1990             else if (uio->uio_resid < 4 * PAGE_SIZE)
1991               {
1992                 /*
1993                  * We set a threshhold of 4 pages to decide if the nocopy
1994                  * read loop is worth the trouble...
1995                  */
1996                 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1997                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1998                              (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1999                 return(retval);
2000               }
2001             else if (uio->uio_offset & PAGE_MASK_64)
2002               {
2003                 /* Bring the file offset read up to a pagesize boundary */
2004                 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2005                 if (uio->uio_resid < clip_size)
2006                   clip_size = uio->uio_resid;
2007                 /*
2008                  * Fake the resid going into the cluster_read_x call
2009                  * and restore it on the way out.
2010                  */
2011                 prev_resid = uio->uio_resid;
2012                 uio->uio_resid = clip_size;
2013                 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2014                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2015               }
2016             else if ((int)iov->iov_base & PAGE_MASK_64)
2017               {
2018                 clip_size = iov->iov_len;
2019                 prev_resid = uio->uio_resid;
2020                 uio->uio_resid = clip_size;
2021                 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2022                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2023               }
2024             else
2025               {
2026                 /*
2027                  * If we come in here, we know the offset into
2028                  * the file is on a pagesize boundary
2029                  */
2030
2031                 max_io_size = filesize - uio->uio_offset;
2032                 clip_size = uio->uio_resid;
2033                 if (iov->iov_len < clip_size)
2034                   clip_size = iov->iov_len;
2035                 if (max_io_size < clip_size)
2036                   clip_size = (int)max_io_size;
2037
2038                 if (clip_size < PAGE_SIZE)
2039                   {
2040                     /*
2041                      * Take care of the tail end of the read in this vector.
2042                      */
2043                     prev_resid = uio->uio_resid;
2044                     uio->uio_resid = clip_size;
2045                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2046                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2047                   }
2048                 else
2049                   {
2050                     /* round clip_size down to a multiple of pagesize */
2051                     clip_size = clip_size & ~(PAGE_MASK);
2052                     prev_resid = uio->uio_resid;
2053                     uio->uio_resid = clip_size;
2054                     retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2055                     if ((retval==0) && uio->uio_resid)
2056                       retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2057                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2058                   }
2059               } /* end else */
2060           } /* end while */
2061
2062         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2063                      (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2064
2065         return(retval);
2066 }
2067
2068 static
2069 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2070         struct vnode *vp;
2071         struct uio   *uio;
2072         off_t         filesize;
2073         int           devblocksize;
2074         int           flags;
2075 {
2076         upl_page_info_t *pl;
2077         upl_t            upl;
2078         vm_offset_t      upl_offset;
2079         int              upl_size;
2080         off_t            upl_f_offset;
2081         int              start_offset;
2082         int              start_pg;
2083         int              last_pg;
2084         int              uio_last;
2085         int              pages_in_upl;
2086         off_t            max_size;
2087         int              io_size;
2088         vm_offset_t      io_address;
2089         kern_return_t    kret;
2090         int              segflg;
2091         int              error  = 0;
2092         int              retval = 0;
2093         int              b_lblkno;
2094         int              e_lblkno;
2095
2096         b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2097
2098         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2099                 /*
2100                  * compute the size of the upl needed to encompass
2101                  * the requested read... limit each call to cluster_io
2102                  * to the maximum UPL size... cluster_io will clip if
2103                  * this exceeds the maximum io_size for the device,
2104                  * make sure to account for
2105                  * a starting offset that's not page aligned
2106                  */
2107                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2108                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2109                 max_size     = filesize - uio->uio_offset;
2110
2111                 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2112                         io_size = uio->uio_resid;
2113                 else
2114                         io_size = max_size;
2115 #ifdef ppc
2116                 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2117                         segflg = uio->uio_segflg;
2118
2119                         uio->uio_segflg = UIO_PHYS_USERSPACE;
2120
2121                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2122                                      (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2123
2124                         while (io_size && retval == 0) {
2125                                 int         xsize;
2126                                 vm_offset_t paddr;
2127
2128                                 if (ubc_page_op(vp,
2129                                                 upl_f_offset,
2130                                                 UPL_POP_SET | UPL_POP_BUSY,
2131                                                 &paddr, 0) != KERN_SUCCESS)
2132                                         break;
2133
2134                                 xsize = PAGE_SIZE - start_offset;
2135
2136                                 if (xsize > io_size)
2137                                         xsize = io_size;
2138
2139                                 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2140
2141                                 ubc_page_op(vp, upl_f_offset,
2142                                             UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2143
2144                                 io_size     -= xsize;
2145                                 start_offset = (int)
2146                                         (uio->uio_offset & PAGE_MASK_64);
2147                                 upl_f_offset = uio->uio_offset - start_offset;
2148                         }
2149                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2150                                      (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2151
2152                         uio->uio_segflg = segflg;
2153
2154                         if (retval)
2155                                 break;
2156
2157                         if (io_size == 0) {
2158                                 /*
2159                                  * we're already finished with this read request
2160                                  * let's see if we should do a read-ahead
2161                                  */
2162                                 e_lblkno = (int)
2163                                         ((uio->uio_offset - 1) / PAGE_SIZE_64);
2164
2165                                 if (!(vp->v_flag & VRAOFF))
2166                                         /*
2167                                          * let's try to read ahead if we're in
2168                                          * a sequential access pattern
2169                                          */
2170                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2171                                 vp->v_lastr = e_lblkno;
2172
2173                                 break;
2174                         }
2175                         max_size = filesize - uio->uio_offset;
2176                 }
2177 #endif
2178                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2179                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2180                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2181                 pages_in_upl = upl_size / PAGE_SIZE;
2182
2183                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2184                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
2185
2186                 kret = ubc_create_upl(vp,
2187                                                 upl_f_offset,
2188                                                 upl_size,
2189                                                 &upl,
2190                                                 &pl,
2191                                                 UPL_FLAGS_NONE);
2192                 if (kret != KERN_SUCCESS)
2193                         panic("cluster_read: failed to get pagelist");
2194
2195                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2196                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
2197
2198                 /*
2199                  * scan from the beginning of the upl looking for the first
2200                  * non-valid page.... this will become the first page in
2201                  * the request we're going to make to 'cluster_io'... if all
2202                  * of the pages are valid, we won't call through to 'cluster_io'
2203                  */
2204                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2205                         if (!upl_valid_page(pl, start_pg))
2206                                 break;
2207                 }
2208
2209                 /*
2210                  * scan from the starting invalid page looking for a valid
2211                  * page before the end of the upl is reached, if we
2212                  * find one, then it will be the last page of the request to
2213                  * 'cluster_io'
2214                  */
2215                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2216                         if (upl_valid_page(pl, last_pg))
2217                                 break;
2218                 }
2219
2220                 if (start_pg < last_pg) {
2221                         /*
2222                          * we found a range of 'invalid' pages that must be filled
2223                          * if the last page in this range is the last page of the file
2224                          * we may have to clip the size of it to keep from reading past
2225                          * the end of the last physical block associated with the file
2226                          */
2227                         upl_offset = start_pg * PAGE_SIZE;
2228                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2229
2230                         if ((upl_f_offset + upl_offset + io_size) > filesize) {
2231                                 io_size = filesize - (upl_f_offset + upl_offset);
2232                                 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2233                         }
2234                         /*
2235                          * issue a synchronous read to cluster_io
2236                          */
2237
2238                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2239                                            io_size, CL_READ, (struct buf *)0);
2240                 }
2241                 if (error == 0) {
2242                         /*
2243                          * if the read completed successfully, or there was no I/O request
2244                          * issued, than map the upl into kernel address space and
2245                          * move the data into user land.... we'll first add on any 'valid'
2246                          * pages that were present in the upl when we acquired it.
2247                          */
2248                         u_int  val_size;
2249                         u_int  size_of_prefetch;
2250
2251                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2252                                 if (!upl_valid_page(pl, uio_last))
2253                                         break;
2254                         }
2255                         /*
2256                          * compute size to transfer this round,  if uio->uio_resid is
2257                          * still non-zero after this uiomove, we'll loop around and
2258                          * set up for another I/O.
2259                          */
2260                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2261
2262                         if (max_size < val_size)
2263                                 val_size = max_size;
2264
2265                         if (uio->uio_resid < val_size)
2266                                 val_size = uio->uio_resid;
2267
2268                         e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2269
2270                         if (size_of_prefetch = (uio->uio_resid - val_size)) {
2271                                 /*
2272                                  * if there's still I/O left to do for this request, then issue a
2273                                  * pre-fetch I/O... the I/O wait time will overlap
2274                                  * with the copying of the data
2275                                  */
2276                                 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2277                         } else {
2278                                 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2279                                         /*
2280                                          * let's try to read ahead if we're in
2281                                          * a sequential access pattern
2282                                          */
2283                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2284                                 vp->v_lastr = e_lblkno;
2285                         }
2286 #ifdef ppc
2287                         if (uio->uio_segflg == UIO_USERSPACE) {
2288                                 int       offset;
2289
2290                                 segflg = uio->uio_segflg;
2291
2292                                 uio->uio_segflg = UIO_PHYS_USERSPACE;
2293
2294
2295                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2296                                              (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2297
2298                                 offset = start_offset;
2299
2300                                 while (val_size && retval == 0) {
2301                                         int       csize;
2302                                         int       i;
2303                                         caddr_t   paddr;
2304
2305                                         i = offset / PAGE_SIZE;
2306                                         csize = min(PAGE_SIZE - start_offset, val_size);
2307
2308                                         paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2309
2310                                         retval = uiomove(paddr, csize, uio);
2311
2312                                         val_size    -= csize;
2313                                         offset      += csize;
2314                                         start_offset = offset & PAGE_MASK;
2315                                 }
2316                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2317                                              (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2318
2319                                 uio->uio_segflg = segflg;
2320                         } else
2321 #endif
2322                         {
2323                                 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2324                                         panic("cluster_read: ubc_upl_map() failed\n");
2325
2326                                 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2327
2328                                 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2329                                         panic("cluster_read: ubc_upl_unmap() failed\n");
2330                         }
2331                 }
2332                 if (start_pg < last_pg) {
2333                         /*
2334                          * compute the range of pages that we actually issued an I/O for
2335                          * and either commit them as valid if the I/O succeeded
2336                          * or abort them if the I/O failed
2337                          */
2338                         io_size = (last_pg - start_pg) * PAGE_SIZE;
2339
2340                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2341                                      upl, start_pg * PAGE_SIZE, io_size, error, 0);
2342
2343                         if (error || (vp->v_flag & VNOCACHE_DATA))
2344                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2345                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2346                         else
2347                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2348                                                 UPL_COMMIT_CLEAR_DIRTY
2349                                                 | UPL_COMMIT_FREE_ON_EMPTY
2350                                                 | UPL_COMMIT_INACTIVATE);
2351
2352                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2353                                      upl, start_pg * PAGE_SIZE, io_size, error, 0);
2354                 }
2355                 if ((last_pg - start_pg) < pages_in_upl) {
2356                         int cur_pg;
2357                         int commit_flags;
2358
2359                         /*
2360                          * the set of pages that we issued an I/O for did not encompass
2361                          * the entire upl... so just release these without modifying
2362                          * there state
2363                          */
2364                         if (error)
2365                                 ubc_upl_abort(upl, 0);
2366                         else {
2367                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2368                                              upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2369
2370                                 if (start_pg) {
2371                                         /*
2372                                          * we found some already valid pages at the beginning of
2373                                          * the upl commit these back to the inactive list with
2374                                          * reference cleared
2375                                          */
2376                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2377                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2378                                                                    | UPL_COMMIT_INACTIVATE;
2379
2380                                                 if (upl_dirty_page(pl, cur_pg))
2381                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2382
2383                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2384                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2385                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2386                                                 else
2387                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2388                                                                 PAGE_SIZE, commit_flags);
2389                                         }
2390                                 }
2391                                 if (last_pg < uio_last) {
2392                                         /*
2393                                          * we found some already valid pages immediately after the
2394                                          * pages we issued I/O for, commit these back to the
2395                                          * inactive list with reference cleared
2396                                          */
2397                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2398                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
2399                                                                                 | UPL_COMMIT_INACTIVATE;
2400
2401                                                 if (upl_dirty_page(pl, cur_pg))
2402                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2403
2404                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2405                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2406                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2407                                                 else
2408                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2409                                                                 PAGE_SIZE, commit_flags);
2410                                         }
2411                                 }
2412                                 if (uio_last < pages_in_upl) {
2413                                         /*
2414                                          * there were some invalid pages beyond the valid pages
2415                                          * that we didn't issue an I/O for, just release them
2416                                          * unchanged
2417                                          */
2418                                         ubc_upl_abort(upl, 0);
2419                                 }
2420
2421                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2422                                         upl, -1, -1, 0, 0);
2423                         }
2424                 }
2425                 if (retval == 0)
2426                         retval = error;
2427         }
2428
2429         return (retval);
2430 }
2431
2432 static
2433 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2434         struct vnode *vp;
2435         struct uio   *uio;
2436         off_t         filesize;
2437         int           devblocksize;
2438         int           flags;
2439 {
2440         upl_t            upl;
2441         upl_page_info_t  *pl;
2442         off_t            upl_f_offset;
2443         vm_offset_t      upl_offset;
2444         off_t            start_upl_f_offset;
2445         off_t            max_io_size;
2446         int              io_size;
2447         int              upl_size;
2448         int              upl_needed_size;
2449         int              pages_in_pl;
2450         vm_offset_t      paddr;
2451         int              upl_flags;
2452         kern_return_t    kret;
2453         int              segflg;
2454         struct iovec     *iov;
2455         int              i;
2456         int              force_data_sync;
2457         int              error  = 0;
2458         int              retval = 0;
2459
2460         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2461                      (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2462
2463         /*
2464          * When we enter this routine, we know
2465          *  -- the offset into the file is on a pagesize boundary
2466          *  -- the resid is a page multiple
2467          *  -- the resid will not exceed iov_len
2468          */
2469
2470         iov = uio->uio_iov;
2471         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2472
2473           max_io_size = filesize - uio->uio_offset;
2474
2475           if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2476               io_size = max_io_size;
2477           else
2478               io_size = uio->uio_resid;
2479
2480           /*
2481            * We don't come into this routine unless
2482            * UIO_USERSPACE is set.
2483            */
2484           segflg = uio->uio_segflg;
2485
2486           uio->uio_segflg = UIO_PHYS_USERSPACE;
2487
2488           /*
2489            * First look for pages already in the cache
2490            * and move them to user space.
2491            */
2492           while (io_size && (retval == 0)) {
2493             upl_f_offset = uio->uio_offset;
2494
2495             /*
2496              * If this call fails, it means the page is not
2497              * in the page cache.
2498              */
2499             if (ubc_page_op(vp, upl_f_offset,
2500                             UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2501               break;
2502
2503             retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2504
2505             ubc_page_op(vp, upl_f_offset,
2506                         UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2507
2508             io_size     -= PAGE_SIZE;
2509             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2510                            (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2511           }
2512
2513           uio->uio_segflg = segflg;
2514
2515           if (retval)
2516             {
2517               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2518                            (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2519               return(retval);
2520             }
2521
2522           /* If we are already finished with this read, then return */
2523           if (io_size == 0)
2524             {
2525
2526               KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2527                            (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2528               return(0);
2529             }
2530
2531           max_io_size = io_size;
2532           if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2533             max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2534
2535           start_upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
2536           upl_f_offset = start_upl_f_offset;
2537           io_size = 0;
2538
2539           while(io_size < max_io_size)
2540             {
2541
2542               if(ubc_page_op(vp, upl_f_offset,
2543                                 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2544               {
2545                         ubc_page_op(vp, upl_f_offset,
2546                             UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2547                         break;
2548               }
2549
2550                   /*
2551                    * Build up the io request parameters.
2552                    */
2553
2554                   io_size += PAGE_SIZE;
2555                   upl_f_offset += PAGE_SIZE;
2556                 }
2557
2558               if (io_size == 0)
2559                 return(retval);
2560
2561           upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2562           upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2563
2564           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2565                        (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
2566
2567           for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2568             {
2569               pages_in_pl = 0;
2570               upl_size = upl_needed_size;
2571               upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2572
2573               kret = vm_map_get_upl(current_map(),
2574                                     (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2575                                     &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2576
2577               if (kret != KERN_SUCCESS)
2578                 {
2579                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2580                                (int)upl_offset, upl_size, io_size, kret, 0);
2581
2582                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2583                                (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2584
2585                   /* cluster_nocopy_read: failed to get pagelist */
2586                   /* do not return kret here */
2587                   return(retval);
2588                 }
2589
2590               pages_in_pl = upl_size / PAGE_SIZE;
2591               pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2592
2593               for(i=0; i < pages_in_pl; i++)
2594                 {
2595                   if (!upl_valid_page(pl, i))
2596                     break;
2597                 }
2598               if (i == pages_in_pl)
2599                 break;
2600
2601               ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2602                                   UPL_ABORT_FREE_ON_EMPTY);
2603             }
2604
2605           if (force_data_sync >= 3)
2606             {
2607                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2608                                (int)upl_offset, upl_size, io_size, kret, 0);
2609
2610                   KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2611                                (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2612               return(retval);
2613             }
2614           /*
2615            * Consider the possibility that upl_size wasn't satisfied.
2616            */
2617           if (upl_size != upl_needed_size)
2618             io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2619
2620           if (io_size == 0)
2621             {
2622               ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2623                                    UPL_ABORT_FREE_ON_EMPTY);
2624               return(retval);
2625             }
2626
2627           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2628                        (int)upl_offset, upl_size, io_size, kret, 0);
2629
2630           /*
2631            * issue a synchronous read to cluster_io
2632            */
2633
2634           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2635                        upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2636
2637           error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2638                              io_size, CL_READ| CL_NOZERO, (struct buf *)0);
2639
2640           if (error == 0) {
2641             /*
2642              * The cluster_io read completed successfully,
2643              * update the uio structure and commit.
2644              */
2645
2646             ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2647                                         UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2648
2649             iov->iov_base += io_size;
2650             iov->iov_len -= io_size;
2651             uio->uio_resid -= io_size;
2652             uio->uio_offset += io_size;
2653           }
2654           else {
2655             ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2656                                    UPL_ABORT_FREE_ON_EMPTY);
2657           }
2658
2659           KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2660                        upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2661
2662           if (retval == 0)
2663             retval = error;
2664
2665         } /* end while */
2666
2667
2668         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2669                      (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2670
2671         return (retval);
2672 }
2673
2674
2675 static
2676 cluster_phys_read(vp, uio, filesize)
2677         struct vnode *vp;
2678         struct uio   *uio;
2679         off_t        filesize;
2680 {
2681         upl_t            upl;
2682         vm_offset_t      upl_offset;
2683         off_t            max_size;
2684         int              io_size;
2685         int              upl_size;
2686         int              upl_needed_size;
2687         int              pages_in_pl;
2688         int              upl_flags;
2689         kern_return_t    kret;
2690         struct iovec     *iov;
2691         int              error;
2692
2693         /*
2694          * When we enter this routine, we know
2695          *  -- the resid will not exceed iov_len
2696          *  -- the target address is physically contiguous
2697          */
2698
2699         iov = uio->uio_iov;
2700
2701         max_size = filesize - uio->uio_offset;
2702
2703         if (max_size < (off_t)((unsigned int)iov->iov_len))
2704             io_size = max_size;
2705         else
2706             io_size = iov->iov_len;
2707
2708         upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2709         upl_needed_size = upl_offset + io_size;
2710
2711         pages_in_pl = 0;
2712         upl_size = upl_needed_size;
2713         upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2714
2715         kret = vm_map_get_upl(current_map(),
2716                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2717                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2718
2719         if (kret != KERN_SUCCESS)
2720           {
2721             /* cluster_phys_read: failed to get pagelist */
2722             return(EINVAL);
2723           }
2724
2725         /*
2726          * Consider the possibility that upl_size wasn't satisfied.
2727          */
2728         if (upl_size < upl_needed_size)
2729           {
2730             ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2731             return(EINVAL);
2732           }
2733
2734         /*
2735          * issue a synchronous read to cluster_io
2736          */
2737
2738         error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2739                            io_size,  CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2740
2741         if (error == 0)
2742           {
2743             /*
2744              * The cluster_io read completed successfully,
2745              * update the uio structure and commit.
2746              */
2747
2748             ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2749
2750             iov->iov_base += io_size;
2751             iov->iov_len -= io_size;
2752             uio->uio_resid -= io_size;
2753             uio->uio_offset += io_size;
2754           }
2755         else
2756             ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2757
2758         return (error);
2759 }
2760
2761 /*
2762  * generate advisory I/O's in the largest chunks possible
2763  * the completed pages will be released into the VM cache
2764  */
2765 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2766         struct vnode *vp;
2767         off_t         filesize;
2768         off_t         f_offset;
2769         int           resid;
2770         int           devblocksize;
2771 {
2772         upl_page_info_t *pl;
2773         upl_t            upl;
2774         vm_offset_t      upl_offset;
2775         int              upl_size;
2776         off_t            upl_f_offset;
2777         int              start_offset;
2778         int              start_pg;
2779         int              last_pg;
2780         int              pages_in_upl;
2781         off_t            max_size;
2782         int              io_size;
2783         kern_return_t    kret;
2784         int              retval = 0;
2785
2786
2787         if (!UBCINFOEXISTS(vp))
2788                 return(EINVAL);
2789
2790         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2791                      (int)f_offset, resid, (int)filesize, devblocksize, 0);
2792
2793         while (resid && f_offset < filesize && retval == 0) {
2794                 /*
2795                  * compute the size of the upl needed to encompass
2796                  * the requested read... limit each call to cluster_io
2797                  * to the maximum UPL size... cluster_io will clip if
2798                  * this exceeds the maximum io_size for the device,
2799                  * make sure to account for
2800                  * a starting offset that's not page aligned
2801                  */
2802                 start_offset = (int)(f_offset & PAGE_MASK_64);
2803                 upl_f_offset = f_offset - (off_t)start_offset;
2804                 max_size     = filesize - f_offset;
2805
2806                 if (resid < max_size)
2807                         io_size = resid;
2808                 else
2809                         io_size = max_size;
2810
2811                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2812                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2813                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2814                 pages_in_upl = upl_size / PAGE_SIZE;
2815
2816                 kret = ubc_create_upl(vp,
2817                                                 upl_f_offset,
2818                                                 upl_size,
2819                                                 &upl,
2820                                                 &pl,
2821                                                 UPL_FLAGS_NONE);
2822                 if (kret != KERN_SUCCESS)
2823                         panic("advisory_read: failed to get pagelist");
2824
2825
2826                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2827                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
2828
2829                 /*
2830                  * scan from the beginning of the upl looking for the first
2831                  * non-valid page.... this will become the first page in
2832                  * the request we're going to make to 'cluster_io'... if all
2833                  * of the pages are valid, we won't call through to 'cluster_io'
2834                  */
2835                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2836                         if (!upl_valid_page(pl, start_pg))
2837                                 break;
2838                 }
2839
2840                 /*
2841                  * scan from the starting invalid page looking for a valid
2842                  * page before the end of the upl is reached, if we
2843                  * find one, then it will be the last page of the request to
2844                  * 'cluster_io'
2845                  */
2846                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2847                         if (upl_valid_page(pl, last_pg))
2848                                 break;
2849                 }
2850
2851                 if (start_pg < last_pg) {
2852                         /*
2853                          * we found a range of 'invalid' pages that must be filled
2854                          * if the last page in this range is the last page of the file
2855                          * we may have to clip the size of it to keep from reading past
2856                          * the end of the last physical block associated with the file
2857                          */
2858                         upl_offset = start_pg * PAGE_SIZE;
2859                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2860
2861                         if ((upl_f_offset + upl_offset + io_size) > filesize) {
2862                                 io_size = filesize - (upl_f_offset + upl_offset);
2863                                 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2864                         }
2865                         /*
2866                          * issue an asynchronous read to cluster_io
2867                          */
2868                         retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
2869                                           CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2870                 }
2871                 if (start_pg) {
2872                         /*
2873                          * start_pg of non-zero indicates we found some already valid pages
2874                          * at the beginning of the upl.... we need to release these without
2875                          * modifying there state
2876                          */
2877                         ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE,
2878                                         UPL_ABORT_FREE_ON_EMPTY);
2879
2880                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 62)) | DBG_FUNC_NONE,
2881                                     upl, 0, start_pg * PAGE_SIZE, 0, 0);
2882                 }
2883                 if (last_pg < pages_in_upl) {
2884                         /*
2885                          * the set of pages that we issued an I/O for did not extend all the
2886                          * way to the end of the upl..so just release them without modifying
2887                          * there state
2888                          */
2889                         ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
2890                                         UPL_ABORT_FREE_ON_EMPTY);
2891
2892                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 63)) | DBG_FUNC_NONE,
2893                                      upl, last_pg * PAGE_SIZE,
2894                                      (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
2895                 }
2896                 io_size = (last_pg * PAGE_SIZE) - start_offset;
2897
2898                 if (io_size > resid)
2899                         io_size = resid;
2900                 f_offset += io_size;
2901                 resid    -= io_size;
2902         }
2903         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2904                      (int)f_offset, resid, retval, 0, 0);
2905
2906         return(retval);
2907 }
2908
2909
2910 cluster_push(vp)
2911         struct vnode *vp;
2912 {
2913         upl_page_info_t *pl;
2914         upl_t            upl;
2915         vm_offset_t      upl_offset;
2916         int              upl_size;
2917         off_t            upl_f_offset;
2918         int              pages_in_upl;
2919         int              start_pg;
2920         int              last_pg;
2921         int              io_size;
2922         int              io_flags;
2923         int              size;
2924         kern_return_t    kret;
2925
2926
2927         if (!UBCINFOEXISTS(vp))
2928                 return(0);
2929
2930         if (vp->v_clen == 0 || (pages_in_upl = vp->v_lastw - vp->v_cstart) == 0)
2931                 return (0);
2932         upl_size = pages_in_upl * PAGE_SIZE;
2933         upl_f_offset = ((off_t)vp->v_cstart) * PAGE_SIZE_64;
2934         size = vp->v_ciosiz;
2935         vp->v_clen = 0;
2936
2937         if (size > upl_size || (upl_size - size) > PAGE_SIZE)
2938                 panic("cluster_push: v_ciosiz doesn't match size of cluster\n");
2939
2940         kret = ubc_create_upl(vp,
2941                                 upl_f_offset,
2942                                 upl_size,
2943                                 &upl,
2944                                         &pl,
2945                                         UPL_FLAGS_NONE);
2946         if (kret != KERN_SUCCESS)
2947                 panic("cluster_push: failed to get pagelist");
2948
2949         last_pg = 0;
2950
2951         while (size) {
2952
2953                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2954                         if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
2955                                 break;
2956                 }
2957                 if (start_pg > last_pg) {
2958                         io_size = (start_pg - last_pg) * PAGE_SIZE;
2959
2960                         ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
2961                                         UPL_ABORT_FREE_ON_EMPTY);
2962
2963                         if (io_size < size)
2964                                 size -= io_size;
2965                         else
2966                                 break;
2967                 }
2968                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2969                         if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
2970                                 break;
2971                 }
2972                 upl_offset = start_pg * PAGE_SIZE;
2973
2974                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
2975
2976                 if (vp->v_flag & VNOCACHE_DATA)
2977                         io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
2978                 else
2979                         io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2980
2981                 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2982                         vp->v_flag |= VTHROTTLED;
2983                         tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
2984                 }
2985                 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (struct buf *)0);
2986
2987                 size -= io_size;
2988         }
2989         return(1);
2990 }