1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_NOMAP 0x08
77 #define CL_PAGEOUT 0x10
78 #define CL_AGE 0x20
79 #define CL_DUMP 0x40
80 #define CL_NOZERO 0x80
81 #define CL_PAGEIN 0x100
82 #define CL_DEV_MEMORY 0x200
83
84 /*
85 * throttle the number of async writes that
86 * can be outstanding on a single vnode
87 * before we issue a synchronous write
88 */
89 #define ASYNC_THROTTLE 6
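/*
 * a rough sketch of how this throttle plays out below (assuming the
 * value of 6 above): cluster_pageout (and, with a different wmesg,
 * cluster_write_x) waits with
 *
 *	while (vp->v_numoutput >= ASYNC_THROTTLE) {
 *	        vp->v_flag |= VTHROTTLED;
 *	        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
 *	}
 *
 * and cluster_iodone clears VTHROTTLED and wakes the sleepers once
 * v_numoutput has drained to ASYNC_THROTTLE / 3 (i.e. 2) or fewer
 */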
90
91 static int
92 cluster_iodone(bp)
93 struct buf *bp;
94 {
95 int b_flags;
96 int error;
97 int total_size;
98 int total_resid;
99 int upl_offset;
100 upl_t upl;
101 struct buf *cbp;
102 struct buf *cbp_head;
103 struct buf *cbp_next;
104 struct buf *real_bp;
105 struct vnode *vp;
106 int commit_size;
107 int pg_offset;
108
109
110 cbp_head = (struct buf *)(bp->b_trans_head);
111
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
113 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
114
115 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
116 /*
117 * all I/O requests that are part of this transaction
118 * have to complete before we can process it
119 */
120 if ( !(cbp->b_flags & B_DONE)) {
121
122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
123 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
124
125 return 0;
126 }
127 }
128 error = 0;
129 total_size = 0;
130 total_resid = 0;
131
132 cbp = cbp_head;
133 upl_offset = cbp->b_uploffset;
134 upl = cbp->b_pagelist;
135 b_flags = cbp->b_flags;
136 real_bp = cbp->b_real_bp;
137 vp = cbp->b_vp;
138
139 while (cbp) {
140 if (cbp->b_vectorcount > 1)
141 _FREE(cbp->b_vectorlist, M_SEGMENT);
142
143 if ((cbp->b_flags & B_ERROR) && error == 0)
144 error = cbp->b_error;
145
146 total_resid += cbp->b_resid;
147 total_size += cbp->b_bcount;
148
149 cbp_next = cbp->b_trans_next;
150
151 free_io_buf(cbp);
152
153 cbp = cbp_next;
154 }
155 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
156 vp->v_flag &= ~VTHROTTLED;
157 wakeup((caddr_t)&vp->v_numoutput);
158 }
159 if ((b_flags & B_NEED_IODONE) && real_bp) {
160 if (error) {
161 real_bp->b_flags |= B_ERROR;
162 real_bp->b_error = error;
163 }
164 real_bp->b_resid = total_resid;
165
166 biodone(real_bp);
167 }
168 if (error == 0 && total_resid)
169 error = EIO;
170
171 if (b_flags & B_COMMIT_UPL) {
172 pg_offset = upl_offset & PAGE_MASK;
173 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
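		/*
		 * worked example, assuming 4K pages (PAGE_SIZE 0x1000): with an
		 * upl_offset of 0x1200 and a total_size of 0x2200, pg_offset is
		 * 0x200 and commit_size rounds 0x2400 up to 0x3000... the
		 * commit/abort below therefore starts at upl_offset - pg_offset
		 * (0x1000) and spans the 3 pages the transaction actually touched
		 */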
174
175 if (error || (b_flags & B_NOCACHE)) {
176 int upl_abort_code;
177
178 if (b_flags & B_PAGEOUT)
179 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
180 else if (b_flags & B_PGIN)
181 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
182 else
183 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
184
185 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
186 upl_abort_code);
187
188 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
189 upl, upl_offset - pg_offset, commit_size,
190 0x80000000|upl_abort_code, 0);
191
192 } else {
193 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
194
195 if ( !(b_flags & B_PAGEOUT))
196 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
197 if (b_flags & B_AGE)
198 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
199
200 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
201 upl_commit_flags);
202
203 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
204 upl, upl_offset - pg_offset, commit_size,
205 upl_commit_flags, 0);
206 }
207 } else
208 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
209 upl, upl_offset, 0, error, 0);
210
211 return (error);
212 }
213
214
215 static void
216 cluster_zero(upl, upl_offset, size, flags, bp)
217 upl_t upl;
218 vm_offset_t upl_offset;
219 int size;
220 int flags;
221 struct buf *bp;
222 {
223 vm_offset_t io_addr = 0;
224 kern_return_t kret;
225
226 if ( !(flags & CL_NOMAP)) {
227 kret = ubc_upl_map(upl, &io_addr);
228
229 if (kret != KERN_SUCCESS)
230 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
231 if (io_addr == 0)
232 panic("cluster_zero: ubc_upl_map() mapped 0");
233 } else
234 io_addr = (vm_offset_t)bp->b_data;
235 bzero((caddr_t)(io_addr + upl_offset), size);
236
237 if ( !(flags & CL_NOMAP)) {
238 kret = ubc_upl_unmap(upl);
239
240 if (kret != KERN_SUCCESS)
241 panic("cluster_zero: kernel_upl_unmap failed");
242 }
243 }
244
245 static int
246 cluster_io(vp, upl, upl_offset, f_offset, size, flags, real_bp)
247 struct vnode *vp;
248 upl_t upl;
249 vm_offset_t upl_offset;
250 off_t f_offset;
251 int size;
252 int flags;
253 struct buf *real_bp;
254 {
255 struct buf *cbp;
256 struct iovec *iovp;
257 int io_flags;
258 int error = 0;
259 int retval = 0;
260 struct buf *cbp_head = 0;
261 struct buf *cbp_tail = 0;
262 upl_page_info_t *pl;
263 int pg_count;
264 int pg_offset;
265 int max_iosize;
266 int max_vectors;
267 int priv;
268
269 if (flags & CL_READ) {
270 io_flags = (B_VECTORLIST | B_READ);
271
272 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
273 } else {
274 io_flags = (B_VECTORLIST | B_WRITEINPROG);
275
276 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
277 }
278 pl = ubc_upl_pageinfo(upl);
279
280 if (flags & CL_ASYNC)
281 io_flags |= (B_CALL | B_ASYNC);
282 if (flags & CL_AGE)
283 io_flags |= B_AGE;
284 if (flags & CL_DUMP)
285 io_flags |= B_NOCACHE;
286 if (flags & CL_PAGEIN)
287 io_flags |= B_PGIN;
288
289
290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
291 (int)f_offset, size, upl_offset, flags, 0);
292
293 if ((flags & CL_READ) && ((upl_offset + size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
294 /*
 295          * the read doesn't end on a page boundary, so we're going to
 296          * end up with a page that we can't complete (the file size wasn't
 297          * a multiple of PAGE_SIZE and we're trying to read to the end of
 298          * the file), so we'll go ahead and zero out the portion of the
 299          * page we can't read in from the file
300 */
301 cluster_zero(upl, upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK), flags, real_bp);
302
303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
304 upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK),
305 flags, real_bp, 0);
306 }
307 while (size) {
308 size_t io_size;
309 int vsize;
310 int i;
311 int pl_index;
312 int pg_resid;
313 int num_contig;
314 daddr_t lblkno;
315 daddr_t blkno;
316
317 if (size > max_iosize)
318 io_size = max_iosize;
319 else
320 io_size = size;
321
322 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
323 if (error == EOPNOTSUPP)
324 panic("VOP_CMAP Unimplemented");
325 break;
326 }
327
328 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
329 (int)f_offset, (int)blkno, io_size, 0, 0);
330
331 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
332 if (flags & CL_PAGEOUT) {
333 error = EINVAL;
334 break;
335 };
336
337 /* Try paging out the page individually before
338 giving up entirely and dumping it (it could
339 be mapped in a "hole" and require allocation
 340                            before the I/O).
341 */
342 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
343 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
344 error = EINVAL;
345 break;
346 };
347
348 upl_offset += PAGE_SIZE_64;
349 f_offset += PAGE_SIZE_64;
350 size -= PAGE_SIZE_64;
351 continue;
352 }
353 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
354 /*
355 * we have now figured out how much I/O we can do - this is in 'io_size'
356 * pl_index represents the first page in the 'upl' that the I/O will occur for
357 * pg_offset is the starting point in the first page for the I/O
358 * pg_count is the number of full and partial pages that 'io_size' encompasses
359 */
360 pl_index = upl_offset / PAGE_SIZE;
361 pg_offset = upl_offset & PAGE_MASK;
362 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
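		/*
		 * for example, assuming 4K pages: with an upl_offset of 0x1800 and
		 * an io_size of 0x2400, pl_index is 1, pg_offset is 0x800 and
		 * pg_count works out to (0x2400 + 0x800 + 0xfff) / 0x1000 == 3...
		 * i.e. the transfer touches upl pages 1, 2 and 3
		 */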
363
364 if (flags & CL_DEV_MEMORY) {
365 /*
366 * currently, can't deal with reading 'holes' in file
367 */
368 if ((long)blkno == -1) {
369 error = EINVAL;
370 break;
371 }
372 /*
373 * treat physical requests as one 'giant' page
374 */
375 pg_count = 1;
376 }
377 if ((flags & CL_READ) && (long)blkno == -1) {
378 /*
379 * if we're reading and blkno == -1, then we've got a
380 * 'hole' in the file that we need to deal with by zeroing
381 * out the affected area in the upl
382 */
383 cluster_zero(upl, upl_offset, io_size, flags, real_bp);
384
385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
386 upl_offset, io_size, flags, real_bp, 0);
387
388 pg_count = (io_size - pg_offset) / PAGE_SIZE;
389
390 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
391 pg_count++;
392
393 if (pg_count) {
394 if (pg_offset)
395 pg_resid = PAGE_SIZE - pg_offset;
396 else
397 pg_resid = 0;
398 if (flags & CL_COMMIT)
399 ubc_upl_commit_range(upl,
400 upl_offset + pg_resid,
401 pg_count * PAGE_SIZE,
402 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
403 }
404 upl_offset += io_size;
405 f_offset += io_size;
406 size -= io_size;
407
408 if (cbp_head && pg_count)
409 goto start_io;
410 continue;
411 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
412 real_bp->b_blkno = blkno;
413 }
414
415 if (pg_count > 1) {
416 if (pg_count > max_vectors) {
417 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
418
419 if (io_size < 0) {
420 io_size = PAGE_SIZE - pg_offset;
421 pg_count = 1;
422 } else
423 pg_count = max_vectors;
424 }
425 /*
426 * we need to allocate space for the vector list
427 */
428 if (pg_count > 1) {
429 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
430 M_SEGMENT, M_NOWAIT);
431
432 if (iovp == (struct iovec *) 0) {
433 /*
434 * if the allocation fails, then throttle down to a single page
435 */
436 io_size = PAGE_SIZE - pg_offset;
437 pg_count = 1;
438 }
439 }
440 }
441
442 /* Throttle the speculative IO */
443 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
444 priv = 0;
445 else
446 priv = 1;
447
448 cbp = alloc_io_buf(vp, priv);
449
450 if (pg_count == 1)
451 /*
452 * we use the io vector that's reserved in the buffer header
 453            * this ensures we can always issue an I/O even in a low memory
454 * condition that prevents the _MALLOC from succeeding... this
455 * is necessary to prevent deadlocks with the pager
456 */
457 iovp = (struct iovec *)(&cbp->b_vects[0]);
458
459 cbp->b_vectorlist = (void *)iovp;
460 cbp->b_vectorcount = pg_count;
461
462 if (flags & CL_DEV_MEMORY) {
463
464 iovp->iov_len = io_size;
465 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
466
467 if (iovp->iov_base == (caddr_t) 0) {
468 free_io_buf(cbp);
469 error = EINVAL;
470 } else
471 iovp->iov_base += upl_offset;
472 } else {
473
474 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
475 int psize;
476
477 psize = PAGE_SIZE - pg_offset;
478
479 if (psize > vsize)
480 psize = vsize;
481
482 iovp->iov_len = psize;
483 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
484
485 if (iovp->iov_base == (caddr_t) 0) {
486 if (pg_count > 1)
487 _FREE(cbp->b_vectorlist, M_SEGMENT);
488 free_io_buf(cbp);
489
490 error = EINVAL;
491 break;
492 }
493 iovp->iov_base += pg_offset;
494 pg_offset = 0;
495
496 if (flags & CL_PAGEOUT) {
497 int s;
498 struct buf *bp;
499
500 s = splbio();
501 if (bp = incore(vp, lblkno + i)) {
502 if (!ISSET(bp->b_flags, B_BUSY)) {
503 bremfree(bp);
504 SET(bp->b_flags, (B_BUSY | B_INVAL));
505 splx(s);
506 brelse(bp);
507 } else
508 panic("BUSY bp found in cluster_io");
509 }
510 splx(s);
511 }
512 vsize -= psize;
513 }
514 }
515 if (error)
516 break;
517
518 if (flags & CL_ASYNC)
519 cbp->b_iodone = (void *)cluster_iodone;
520 cbp->b_flags |= io_flags;
521
522 cbp->b_lblkno = lblkno;
523 cbp->b_blkno = blkno;
524 cbp->b_bcount = io_size;
525 cbp->b_pagelist = upl;
526 cbp->b_uploffset = upl_offset;
527 cbp->b_trans_next = (struct buf *)0;
528
529 if (flags & CL_READ)
530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
531 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
532 else
533 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
534 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
535
536 if (cbp_head) {
537 cbp_tail->b_trans_next = cbp;
538 cbp_tail = cbp;
539 } else {
540 cbp_head = cbp;
541 cbp_tail = cbp;
542 }
543 (struct buf *)(cbp->b_trans_head) = cbp_head;
544
545 upl_offset += io_size;
546 f_offset += io_size;
547 size -= io_size;
548
549 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY)) || size == 0) {
550 /*
551 * if we have no more I/O to issue or
552 * the current I/O we've prepared fully
553 * completes the last page in this request
554 * or it's been completed via a zero-fill
555 * due to a 'hole' in the file
556 * then go ahead and issue the I/O
557 */
558 start_io:
559 if (flags & CL_COMMIT)
560 cbp_head->b_flags |= B_COMMIT_UPL;
561 if (flags & CL_PAGEOUT)
562 cbp_head->b_flags |= B_PAGEOUT;
563 if (flags & CL_PAGEIN)
564 cbp_head->b_flags |= B_PGIN;
565
566 if (real_bp) {
567 cbp_head->b_flags |= B_NEED_IODONE;
568 cbp_head->b_real_bp = real_bp;
569 }
570
571 for (cbp = cbp_head; cbp;) {
572 struct buf * cbp_next;
573
574 if (io_flags & B_WRITEINPROG)
575 cbp->b_vp->v_numoutput++;
576
577 cbp_next = cbp->b_trans_next;
578
579 (void) VOP_STRATEGY(cbp);
580 cbp = cbp_next;
581 }
582 if ( !(flags & CL_ASYNC)) {
583 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
584 biowait(cbp);
585
586 if (error = cluster_iodone(cbp_head)) {
587 retval = error;
588 error = 0;
589 }
590 }
591 cbp_head = (struct buf *)0;
592 cbp_tail = (struct buf *)0;
593 }
594 }
595 if (error) {
596 int abort_size;
597
598 for (cbp = cbp_head; cbp;) {
599 struct buf * cbp_next;
600
601 if (cbp->b_vectorcount > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 upl_offset -= cbp->b_bcount;
604 size += cbp->b_bcount;
605
606 cbp_next = cbp->b_trans_next;
607 free_io_buf(cbp);
608 cbp = cbp_next;
609 }
610 pg_offset = upl_offset & PAGE_MASK;
611 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
612
613 if (flags & CL_COMMIT) {
614 int upl_abort_code;
615
616 if (flags & CL_PAGEOUT)
617 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
618 else if (flags & CL_PAGEIN)
619 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
620 else
621 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
622
623 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
624 upl_abort_code);
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
627 upl, upl_offset - pg_offset, abort_size, error, 0);
628 }
629 if (real_bp) {
630 real_bp->b_flags |= B_ERROR;
631 real_bp->b_error = error;
632
633 biodone(real_bp);
634 }
635 if (retval == 0)
636 retval = error;
637 }
638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
639 (int)f_offset, size, upl_offset, retval, 0);
640
641 return (retval);
642 }
643
644
645 static int
646 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
647 struct vnode *vp;
648 off_t f_offset;
649 u_int size;
650 off_t filesize;
651 int devblocksize;
652 {
653 upl_t upl;
654 upl_page_info_t *pl;
655 int pages_in_upl;
656 int start_pg;
657 int last_pg;
658 int last_valid;
659 int io_size;
660
661
662 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
663 (int)f_offset, size, (int)filesize, 0, 0);
664
665 if (f_offset >= filesize) {
666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
667 (int)f_offset, 0, 0, 0, 0);
668 return(0);
669 }
670 if (ubc_page_op(vp, f_offset, 0, 0, 0) == KERN_SUCCESS) {
671 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
672 (int)f_offset, 0, 0, 0, 0);
673 return(0);
674 }
675 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
676 size = MAX_UPL_TRANSFER * PAGE_SIZE;
677 else
678 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
679
680 if ((off_t)size > (filesize - f_offset))
681 size = ((filesize - f_offset) + (devblocksize - 1)) & ~(devblocksize - 1);
682
683 pages_in_upl = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
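	/*
	 * for example, with 4K pages and a 512 byte devblocksize: if only
	 * 10000 bytes remain before filesize, 'size' gets rounded up to 10240
	 * and pages_in_upl becomes 3
	 */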
684
685 ubc_create_upl(vp,
686 f_offset,
687 pages_in_upl * PAGE_SIZE,
688 &upl,
689 &pl,
690 UPL_FLAGS_NONE);
691
692 if (upl == (upl_t) 0)
693 return(0);
694
695 /*
696 * scan from the beginning of the upl looking for the first
697 * non-valid page.... this will become the first page in
698 * the request we're going to make to 'cluster_io'... if all
699 * of the pages are valid, we won't call through to 'cluster_io'
700 */
701 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
702 if (!upl_valid_page(pl, start_pg))
703 break;
704 }
705
706 /*
707 * scan from the starting invalid page looking for a valid
708 * page before the end of the upl is reached, if we
709 * find one, then it will be the last page of the request to
710 * 'cluster_io'
711 */
712 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
713 if (upl_valid_page(pl, last_pg))
714 break;
715 }
716
717 /*
 718  * if we find any more valid pages at the tail of the upl
 719  * then update maxra accordingly....
720 */
721 for (last_valid = last_pg; last_valid < pages_in_upl; last_valid++) {
722 if (!upl_valid_page(pl, last_valid))
723 break;
724 }
725 if (start_pg < last_pg) {
726 vm_offset_t upl_offset;
727
728 /*
729 * we found a range of 'invalid' pages that must be filled
730 * 'size' has already been clipped to the LEOF
731 * make sure it's at least a multiple of the device block size
732 */
733 upl_offset = start_pg * PAGE_SIZE;
734 io_size = (last_pg - start_pg) * PAGE_SIZE;
735
736 if ((upl_offset + io_size) > size) {
737 io_size = size - upl_offset;
738
739 KERNEL_DEBUG(0xd001000, upl_offset, size, io_size, 0, 0);
740 }
741 cluster_io(vp, upl, upl_offset, f_offset + upl_offset, io_size,
742 CL_READ | CL_COMMIT | CL_ASYNC | CL_AGE, (struct buf *)0);
743 }
744 if (start_pg) {
745 /*
 746  * a non-zero start_pg indicates that we found some already valid pages
 747  * at the beginning of the upl.... we need to release these without
 748  * modifying their state
749 */
750 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
751
752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
753 upl, 0, start_pg * PAGE_SIZE, 0, 0);
754 }
755 if (last_pg < pages_in_upl) {
756 /*
757 * the set of pages that we issued an I/O for did not extend all the
758 * way to the end of the upl... so just release them without modifying
 759  * their state
760 */
761 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
762 UPL_ABORT_FREE_ON_EMPTY);
763
764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
765 upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
766 }
767
768 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
769 (int)f_offset + (last_valid * PAGE_SIZE), 0, 0, 0, 0);
770
771 return(last_valid);
772 }
773
774
775
776 static void
777 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
778 struct vnode *vp;
779 daddr_t b_lblkno;
780 daddr_t e_lblkno;
781 off_t filesize;
782 int devblocksize;
783 {
784 daddr_t r_lblkno;
785 off_t f_offset;
786 int size_of_prefetch;
787 int max_iosize;
788 int max_pages;
789
790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
791 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
792
793 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
794 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
795 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
796 return;
797 }
798
799 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && b_lblkno != (vp->v_maxra + 1))) {
800 vp->v_ralen = 0;
801 vp->v_maxra = 0;
802
803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
804 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
805
806 return;
807 }
808 vfs_io_attributes(vp, B_READ, &max_iosize, &max_pages);
809
810 if ((max_iosize / PAGE_SIZE) < max_pages)
811 max_pages = max_iosize / PAGE_SIZE;
812 if (max_pages > MAX_UPL_TRANSFER)
813 max_pages = MAX_UPL_TRANSFER;
814
815 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
816
817 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
818 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
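	/*
	 * the net effect of the two adjustments above: the read-ahead window
	 * roughly doubles on each sequential pass (1, 2, 4, 8, ... pages) and
	 * is widened to at least the span of the current request, with both
	 * adjustments capped at max_pages
	 */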
819
820 if (e_lblkno < vp->v_maxra) {
821 if ((vp->v_maxra - e_lblkno) > (max_pages / 4)) {
822
823 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
824 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
825 return;
826 }
827 }
828 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
829 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
830
831 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
832
833 if (size_of_prefetch)
834 vp->v_maxra = r_lblkno + (size_of_prefetch - 1);
835
836 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
837 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
838 }
839
840
841 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
842 struct vnode *vp;
843 upl_t upl;
844 vm_offset_t upl_offset;
845 off_t f_offset;
846 int size;
847 off_t filesize;
848 int devblocksize;
849 int flags;
850 {
851 int io_size;
852 int pg_size;
853 off_t max_size;
854 int local_flags = CL_PAGEOUT;
855
856 if ((flags & UPL_IOSYNC) == 0)
857 local_flags |= CL_ASYNC;
858 if ((flags & UPL_NOCOMMIT) == 0)
859 local_flags |= CL_COMMIT;
860
861 if (upl == (upl_t) 0)
862 panic("cluster_pageout: can't handle NULL upl yet\n");
863
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
866 (int)f_offset, size, (int)filesize, local_flags, 0);
867
868 /*
869 * If they didn't specify any I/O, then we are done...
870 * we can't issue an abort because we don't know how
871 * big the upl really is
872 */
873 if (size <= 0)
874 return (EINVAL);
875
876 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
877 if (local_flags & CL_COMMIT)
878 ubc_upl_abort_range(upl, upl_offset, size,
879 UPL_ABORT_FREE_ON_EMPTY);
880 return (EROFS);
881 }
882 /*
 883  * can't page-out to a negative offset
884 * or if we're starting beyond the EOF
885 * or if the file offset isn't page aligned
886 * or the size requested isn't a multiple of PAGE_SIZE
887 */
888 if (f_offset < 0 || f_offset >= filesize ||
889 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
890 if (local_flags & CL_COMMIT)
891 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
892 return (EINVAL);
893 }
894 max_size = filesize - f_offset;
895
896 if (size < max_size)
897 io_size = size;
898 else
899 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
900
901 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
902
903 if (size > pg_size) {
904 if (local_flags & CL_COMMIT)
905 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
906 UPL_ABORT_FREE_ON_EMPTY);
907 }
908 while (vp->v_numoutput >= ASYNC_THROTTLE) {
909 vp->v_flag |= VTHROTTLED;
910 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
911 }
912
913 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
914 local_flags, (struct buf *)0));
915 }
916
917
918 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
919 struct vnode *vp;
920 upl_t upl;
921 vm_offset_t upl_offset;
922 off_t f_offset;
923 int size;
924 off_t filesize;
925 int devblocksize;
926 int flags;
927 {
928 u_int io_size;
929 int pg_size;
930 off_t max_size;
931 int retval;
932 int local_flags = 0;
933
934
935 /*
936 * If they didn't ask for any data, then we are done...
937 * we can't issue an abort because we don't know how
938 * big the upl really is
939 */
940 if (size <= 0)
941 return (EINVAL);
942
943 if ((flags & UPL_NOCOMMIT) == 0)
944 local_flags = CL_COMMIT;
945
946 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
947 (int)f_offset, size, (int)filesize, local_flags, 0);
948
949 /*
950 * can't page-in from a negative offset
951 * or if we're starting beyond the EOF
952 * or if the file offset isn't page aligned
953 * or the size requested isn't a multiple of PAGE_SIZE
954 */
955 if (f_offset < 0 || f_offset >= filesize ||
956 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
957 if (local_flags & CL_COMMIT)
958 ubc_upl_abort_range(upl, upl_offset, size,
959 UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
960 return (EINVAL);
961 }
962 max_size = filesize - f_offset;
963
964 if (size < max_size)
965 io_size = size;
966 else
967 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
968
969 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
970
971 if (upl == (upl_t) 0) {
972 ubc_create_upl( vp,
973 f_offset,
974 pg_size,
975 &upl,
976 NULL,
977 UPL_FLAGS_NONE);
978
979 if (upl == (upl_t) 0)
980 return (EINVAL);
981
982 upl_offset = (vm_offset_t)0;
983 size = pg_size;
984 }
985 if (size > pg_size) {
986 if (local_flags & CL_COMMIT)
987 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
988 UPL_ABORT_FREE_ON_EMPTY);
989 }
990
991 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
992 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
993
994 if (retval == 0) {
995 int b_lblkno;
996 int e_lblkno;
997
998 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
999 e_lblkno = (int)
1000 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
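		/*
		 * e.g. with 4K pages, an f_offset of 0x3000 and an io_size of
		 * 0x2800 give b_lblkno == 3 and e_lblkno == 5... these page-sized
		 * block numbers drive the sequential-access detection done by
		 * cluster_rd_ahead
		 */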
1001
1002 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1003 /*
1004          * we haven't read the last page of the file in yet
1005 * so let's try to read ahead if we're in
1006 * a sequential access pattern
1007 */
1008 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1009 }
1010 vp->v_lastr = e_lblkno;
1011 }
1012 return (retval);
1013 }
1014
1015
1016 cluster_bp(bp)
1017 struct buf *bp;
1018 {
1019 off_t f_offset;
1020 int flags;
1021
1022 if (bp->b_pagelist == (upl_t) 0)
1023 panic("cluster_bp: can't handle NULL upl yet\n");
1024 if (bp->b_flags & B_READ)
1025 flags = CL_ASYNC | CL_NOMAP | CL_READ;
1026 else
1027 flags = CL_ASYNC | CL_NOMAP;
1028
1029 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1030
1031 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, flags, bp));
1032 }
1033
1034
1035 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1036 struct vnode *vp;
1037 struct uio *uio;
1038 off_t oldEOF;
1039 off_t newEOF;
1040 off_t headOff;
1041 off_t tailOff;
1042 int devblocksize;
1043 int flags;
1044 {
1045 int prev_resid;
1046 int clip_size;
1047 off_t max_io_size;
1048 struct iovec *iov;
1049 vm_offset_t upl_offset;
1050 int upl_size;
1051 int pages_in_pl;
1052 upl_page_info_t *pl;
1053 int upl_flags;
1054 upl_t upl;
1055 int retval = 0;
1056
1057
1058 if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1059 {
1060 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1061 return(retval);
1062 }
1063
1064 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1065 {
1066 /* we know we have a resid, so this is safe */
1067 iov = uio->uio_iov;
1068 while (iov->iov_len == 0) {
1069 uio->uio_iov++;
1070 uio->uio_iovcnt--;
1071 iov = uio->uio_iov;
1072 }
1073
1074 /*
1075 * We check every vector target and if it is physically
1076 * contiguous space, we skip the sanity checks.
1077 */
1078
1079 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1080 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1081 pages_in_pl = 0;
1082 upl_flags = UPL_QUERY_OBJECT_TYPE;
1083 if ((vm_map_get_upl(current_map(),
1084 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1085 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1086 {
1087 /*
1088 * the user app must have passed in an invalid address
1089 */
1090 return (EFAULT);
1091 }
1092
1093 if (upl_flags & UPL_PHYS_CONTIG)
1094 {
1095 /*
1096 * since the interface to the IOKit below us uses physical block #'s and
1097 * block counts to specify the I/O, we can't handle anything that isn't
1098 * devblocksize aligned
1099 */
1100 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1101 return(EINVAL);
1102
1103 if (flags & IO_HEADZEROFILL)
1104 {
1105 flags &= ~IO_HEADZEROFILL;
1106
1107 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1108 return(retval);
1109 }
1110
1111 retval = cluster_phys_write(vp, uio);
1112
1113 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1114 {
1115 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1116 return(retval);
1117 }
1118 }
1119 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1120 {
1121 /*
1122            * We set a threshold of 4 pages to decide if the nocopy
1123 * write loop is worth the trouble...
1124 * we also come here if we're trying to zero the head and/or tail
1125 * of a partially written page, and the user source is not a physically contiguous region
1126 */
1127 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1128 return(retval);
1129 }
1130 else if (uio->uio_offset & PAGE_MASK_64)
1131 {
1132 /* Bring the file offset write up to a pagesize boundary */
1133 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1134 if (uio->uio_resid < clip_size)
1135 clip_size = uio->uio_resid;
1136 /*
1137 * Fake the resid going into the cluster_write_x call
1138 * and restore it on the way out.
1139 */
1140 prev_resid = uio->uio_resid;
1141 uio->uio_resid = clip_size;
1142 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1143 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
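			/*
			 * e.g. if the caller's resid was 10000 and clip_size is 2048,
			 * and cluster_write_x consumes 1000 of those bytes, it leaves
			 * uio_resid at 1048 and the line above restores the resid to
			 * 10000 - (2048 - 1048) == 9000... only the bytes actually
			 * moved are charged against the original request
			 */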
1144 }
1145 else if ((int)iov->iov_base & PAGE_MASK_64)
1146 {
1147 clip_size = iov->iov_len;
1148 prev_resid = uio->uio_resid;
1149 uio->uio_resid = clip_size;
1150 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1151 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1152 }
1153 else
1154 {
1155 /*
1156 * If we come in here, we know the offset into
1157 * the file is on a pagesize boundary
1158 */
1159
1160 max_io_size = newEOF - uio->uio_offset;
1161 clip_size = uio->uio_resid;
1162 if (iov->iov_len < clip_size)
1163 clip_size = iov->iov_len;
1164 if (max_io_size < clip_size)
1165 clip_size = max_io_size;
1166
1167 if (clip_size < PAGE_SIZE)
1168 {
1169 /*
1170 * Take care of tail end of write in this vector
1171 */
1172 prev_resid = uio->uio_resid;
1173 uio->uio_resid = clip_size;
1174 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1175 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1176 }
1177 else
1178 {
1179 /* round clip_size down to a multiple of pagesize */
1180 clip_size = clip_size & ~(PAGE_MASK);
1181 prev_resid = uio->uio_resid;
1182 uio->uio_resid = clip_size;
1183 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1184 if ((retval == 0) && uio->uio_resid)
1185 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187 }
1188 } /* end else */
1189 } /* end while */
1190 return(retval);
1191 }
1192
1193 static
1194 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1195 struct vnode *vp;
1196 struct uio *uio;
1197 off_t newEOF;
1198 int devblocksize;
1199 int flags;
1200 {
1201 upl_t upl;
1202 upl_page_info_t *pl;
1203 off_t upl_f_offset;
1204 vm_offset_t upl_offset;
1205 off_t max_io_size;
1206 int io_size;
1207 int upl_size;
1208 int upl_needed_size;
1209 int pages_in_pl;
1210 int upl_flags;
1211 kern_return_t kret;
1212 struct iovec *iov;
1213 int i;
1214 int force_data_sync;
1215 int error = 0;
1216
1217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1218 (int)uio->uio_offset, (int)uio->uio_resid,
1219 (int)newEOF, devblocksize, 0);
1220
1221 /*
1222 * When we enter this routine, we know
1223 * -- the offset into the file is on a pagesize boundary
1224 * -- the resid is a page multiple
1225 * -- the resid will not exceed iov_len
1226 */
1227
1228 iov = uio->uio_iov;
1229
1230 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1231 io_size = uio->uio_resid;
1232
1233 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1234 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1235
1236 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1237 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1238
1239 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1240 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
1241
1242 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1243 {
1244 pages_in_pl = 0;
1245 upl_size = upl_needed_size;
1246 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1247
1248 kret = vm_map_get_upl(current_map(),
1249 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1250 &upl_size,
1251 &upl,
1252 NULL,
1253 &pages_in_pl,
1254 &upl_flags,
1255 force_data_sync);
1256
1257 if (kret != KERN_SUCCESS)
1258 {
1259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1260 0, 0, 0, kret, 0);
1261
1262 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1263 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1264
1265 /* cluster_nocopy_write: failed to get pagelist */
1266 /* do not return kret here */
1267 return(0);
1268 }
1269
1270 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1271 pages_in_pl = upl_size / PAGE_SIZE;
1272
1273 for(i=0; i < pages_in_pl; i++)
1274 {
1275 if (!upl_valid_page(pl, i))
1276 break;
1277 }
1278
1279 if (i == pages_in_pl)
1280 break;
1281
1282 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1283 UPL_ABORT_FREE_ON_EMPTY);
1284 }
1285
1286 if (force_data_sync >= 3)
1287 {
1288 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1289 i, pages_in_pl, upl_size, kret, 0);
1290
1291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1292 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1293 return(0);
1294 }
1295
1296 /*
1297 * Consider the possibility that upl_size wasn't satisfied.
1298 */
1299 if (upl_size != upl_needed_size)
1300 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1301
1302 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1303 (int)upl_offset, upl_size, iov->iov_base, io_size, 0);
1304
1305 if (io_size == 0)
1306 {
1307 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1308 UPL_ABORT_FREE_ON_EMPTY);
1309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1310 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1311
1312 return(0);
1313 }
1314
1315 /*
1316 * Now look for pages already in the cache
1317 * and throw them away.
1318 */
1319
1320 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1321 max_io_size = io_size;
1322
1323 while (max_io_size) {
1324
1325 /*
1326 * Flag UPL_POP_DUMP says if the page is found
1327 * in the page cache it must be thrown away.
1328 */
1329 ubc_page_op(vp,
1330 upl_f_offset,
1331 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1332 0, 0);
1333 max_io_size -= PAGE_SIZE;
1334 upl_f_offset += PAGE_SIZE;
1335 }
1336
1337 /*
1338 * issue a synchronous write to cluster_io
1339 */
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1342 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1343
1344 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1345 io_size, 0, (struct buf *)0);
1346
1347 if (error == 0) {
1348 /*
1349 * The cluster_io write completed successfully,
1350 * update the uio structure.
1351 */
1352 iov->iov_base += io_size;
1353 iov->iov_len -= io_size;
1354 uio->uio_resid -= io_size;
1355 uio->uio_offset += io_size;
1356 }
1357 /*
1358 * always 'commit' the I/O via the abort primitive whether the I/O
1359          * succeeded cleanly or not... this is necessary to ensure that
1360 * we preserve the state of the DIRTY flag on the pages used to
1361 * provide the data for the I/O... the state of this flag SHOULD
1362 * NOT be changed by a write
1363 */
1364 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1365 UPL_ABORT_FREE_ON_EMPTY);
1366
1367
1368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1369 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1370
1371 } /* end while */
1372
1373
1374 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1375 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1376
1377 return (error);
1378 }
1379
1380 static
1381 cluster_phys_write(vp, uio)
1382 struct vnode *vp;
1383 struct uio *uio;
1384 {
1385 upl_t upl;
1386 vm_offset_t upl_offset;
1387 int io_size;
1388 int upl_size;
1389 int upl_needed_size;
1390 int pages_in_pl;
1391 int upl_flags;
1392 kern_return_t kret;
1393 struct iovec *iov;
1394 int error = 0;
1395
1396 /*
1397 * When we enter this routine, we know
1398 * -- the resid will not exceed iov_len
1399  * -- the vector target address is physically contiguous
1400 */
1401
1402 iov = uio->uio_iov;
1403 io_size = iov->iov_len;
1404 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1405 upl_needed_size = upl_offset + io_size;
1406
1407 pages_in_pl = 0;
1408 upl_size = upl_needed_size;
1409 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1410
1411 kret = vm_map_get_upl(current_map(),
1412 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1413 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1414
1415 if (kret != KERN_SUCCESS)
1416 {
1417 /* cluster_phys_write: failed to get pagelist */
1418 /* note: return kret here */
1419 return(EINVAL);
1420 }
1421
1422 /*
1423 * Consider the possibility that upl_size wasn't satisfied.
1424 * This is a failure in the physical memory case.
1425 */
1426 if (upl_size < upl_needed_size)
1427 {
1428 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1429 return(EINVAL);
1430 }
1431
1432 /*
1433 * issue a synchronous write to cluster_io
1434 */
1435
1436 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1437 io_size, CL_DEV_MEMORY, (struct buf *)0);
1438
1439 if (error == 0) {
1440 /*
1441 * The cluster_io write completed successfully,
1442 * update the uio structure and commit.
1443 */
1444
1445 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1446
1447 iov->iov_base += io_size;
1448 iov->iov_len -= io_size;
1449 uio->uio_resid -= io_size;
1450 uio->uio_offset += io_size;
1451 }
1452 else
1453 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1454
1455 return (error);
1456 }
1457
1458 static
1459 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1460 struct vnode *vp;
1461 struct uio *uio;
1462 off_t oldEOF;
1463 off_t newEOF;
1464 off_t headOff;
1465 off_t tailOff;
1466 int devblocksize;
1467 int flags;
1468 {
1469 upl_page_info_t *pl;
1470 upl_t upl;
1471 vm_offset_t upl_offset;
1472 int upl_size;
1473 off_t upl_f_offset;
1474 int pages_in_upl;
1475 int start_offset;
1476 int xfer_resid;
1477 int io_size;
1478 int io_size_before_rounding;
1479 int io_flags;
1480 vm_offset_t io_address;
1481 int io_offset;
1482 int bytes_to_zero;
1483 int bytes_to_move;
1484 kern_return_t kret;
1485 int retval = 0;
1486 int uio_resid;
1487 long long total_size;
1488 long long zero_cnt;
1489 off_t zero_off;
1490 long long zero_cnt1;
1491 off_t zero_off1;
1492 daddr_t start_blkno;
1493 daddr_t last_blkno;
1494
1495 if (uio) {
1496 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1497 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1498
1499 uio_resid = uio->uio_resid;
1500 } else {
1501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1502 0, 0, (int)oldEOF, (int)newEOF, 0);
1503
1504 uio_resid = 0;
1505 }
1506 zero_cnt = 0;
1507 zero_cnt1 = 0;
1508
1509 if (flags & IO_HEADZEROFILL) {
1510 /*
1511 * some filesystems (HFS is one) don't support unallocated holes within a file...
1512 * so we zero fill the intervening space between the old EOF and the offset
1513 * where the next chunk of real data begins.... ftruncate will also use this
1514 * routine to zero fill to the new EOF when growing a file... in this case, the
1515 * uio structure will not be provided
1516 */
1517 if (uio) {
1518 if (headOff < uio->uio_offset) {
1519 zero_cnt = uio->uio_offset - headOff;
1520 zero_off = headOff;
1521 }
1522 } else if (headOff < newEOF) {
1523 zero_cnt = newEOF - headOff;
1524 zero_off = headOff;
1525 }
1526 }
1527 if (flags & IO_TAILZEROFILL) {
1528 if (uio) {
1529 zero_off1 = uio->uio_offset + uio->uio_resid;
1530
1531 if (zero_off1 < tailOff)
1532 zero_cnt1 = tailOff - zero_off1;
1533 }
1534 }
1535 if (zero_cnt == 0 && uio == (struct uio *) 0)
1536 {
1537 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1538 retval, 0, 0, 0, 0);
1539 return (0);
1540 }
1541
1542 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1543 /*
1544 * for this iteration of the loop, figure out where our starting point is
1545 */
1546 if (zero_cnt) {
1547 start_offset = (int)(zero_off & PAGE_MASK_64);
1548 upl_f_offset = zero_off - start_offset;
1549 } else if (uio_resid) {
1550 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1551 upl_f_offset = uio->uio_offset - start_offset;
1552 } else {
1553 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1554 upl_f_offset = zero_off1 - start_offset;
1555 }
1556 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1557 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1558
1559 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1560 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1561
1562 /*
1563 * compute the size of the upl needed to encompass
1564 * the requested write... limit each call to cluster_io
1565 * to the maximum UPL size... cluster_io will clip if
1566           * this exceeds the maximum io_size for the device...
1567 * make sure to account for
1568 * a starting offset that's not page aligned
1569 */
1570 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1571
1572 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1573 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1574
1575 pages_in_upl = upl_size / PAGE_SIZE;
1576 io_size = upl_size - start_offset;
1577
1578 if ((long long)io_size > total_size)
1579 io_size = total_size;
1580
1581 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1582 last_blkno = start_blkno + pages_in_upl;
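		/*
		 * e.g. with 4K pages, an upl_f_offset of 0x10000 and 8 pages in the
		 * upl give start_blkno 16 and last_blkno 24... the delayed-write
		 * cluster state below (v_cstart, v_lastw, v_clen) is kept in these
		 * same page-sized block units
		 */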
1583
1584 kret = ubc_create_upl(vp,
1585 upl_f_offset,
1586 upl_size,
1587 &upl,
1588 &pl,
1589 UPL_FLAGS_NONE);
1590 if (kret != KERN_SUCCESS)
1591 panic("cluster_write: failed to get pagelist");
1592
1593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1594 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1595
1596 if (start_offset && !upl_valid_page(pl, 0)) {
1597 int read_size;
1598
1599 /*
1600 * we're starting in the middle of the first page of the upl
1601 * and the page isn't currently valid, so we're going to have
1602 * to read it in first... this is a synchronous operation
1603 */
1604 read_size = PAGE_SIZE;
1605
1606 if ((upl_f_offset + read_size) > newEOF) {
1607 read_size = newEOF - upl_f_offset;
1608 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1609 }
1610 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
1611 CL_READ, (struct buf *)0);
1612 if (retval) {
1613 /*
1614 * we had an error during the read which causes us to abort
1615 * the current cluster_write request... before we do, we need
1616 * to release the rest of the pages in the upl without modifying
1617 * there state and mark the failed page in error
1618          * their state and mark the failed page in error
1619 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1620 ubc_upl_abort(upl, 0);
1621
1622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1623 upl, 0, 0, retval, 0);
1624 break;
1625 }
1626 }
1627 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1628 /*
1629 * the last offset we're writing to in this upl does not end on a page
1630 * boundary... if it's not beyond the old EOF, then we'll also need to
1631 * pre-read this page in if it isn't already valid
1632 */
1633 upl_offset = upl_size - PAGE_SIZE;
1634
1635 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1636 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1637 int read_size;
1638
1639 read_size = PAGE_SIZE;
1640
1641 if ((upl_f_offset + upl_offset + read_size) > newEOF) {
1642 read_size = newEOF - (upl_f_offset + upl_offset);
1643 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1644 }
1645 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
1646 CL_READ, (struct buf *)0);
1647 if (retval) {
1648 /*
1649 * we had an error during the read which causes us to abort
1650 * the current cluster_write request... before we do, we
1651 * need to release the rest of the pages in the upl without
1652          * modifying their state and mark the failed page in error
1653 */
1654 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE,
1655 UPL_ABORT_DUMP_PAGES);
1656 ubc_upl_abort(upl, 0);
1657
1658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1659 upl, 0, 0, retval, 0);
1660 break;
1661 }
1662 }
1663 }
1664 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1665 panic("cluster_write: ubc_upl_map failed\n");
1666 xfer_resid = io_size;
1667 io_offset = start_offset;
1668
1669 while (zero_cnt && xfer_resid) {
1670
1671 if (zero_cnt < (long long)xfer_resid)
1672 bytes_to_zero = zero_cnt;
1673 else
1674 bytes_to_zero = xfer_resid;
1675
1676 if ( !(flags & IO_NOZEROVALID)) {
1677 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1678
1679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1680 (int)upl_f_offset + io_offset, bytes_to_zero,
1681 (int)zero_cnt, xfer_resid, 0);
1682 } else {
1683 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1684
1685 if ( !upl_valid_page(pl, (int)(zero_off / PAGE_SIZE_64))) {
1686 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1687
1688 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1689 (int)upl_f_offset + io_offset, bytes_to_zero,
1690 (int)zero_cnt, xfer_resid, 0);
1691 }
1692 }
1693 xfer_resid -= bytes_to_zero;
1694 zero_cnt -= bytes_to_zero;
1695 zero_off += bytes_to_zero;
1696 io_offset += bytes_to_zero;
1697 }
1698 if (xfer_resid && uio_resid) {
1699 bytes_to_move = min(uio_resid, xfer_resid);
1700
1701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1702 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1703
1704 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1705
1706 if (retval) {
1707 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1708 panic("cluster_write: kernel_upl_unmap failed\n");
1709 ubc_upl_abort(upl, UPL_ABORT_DUMP_PAGES);
1710
1711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1712 upl, 0, 0, retval, 0);
1713 } else {
1714 uio_resid -= bytes_to_move;
1715 xfer_resid -= bytes_to_move;
1716 io_offset += bytes_to_move;
1717 }
1718 }
1719 while (xfer_resid && zero_cnt1 && retval == 0) {
1720
1721 if (zero_cnt1 < (long long)xfer_resid)
1722 bytes_to_zero = zero_cnt1;
1723 else
1724 bytes_to_zero = xfer_resid;
1725
1726 if ( !(flags & IO_NOZEROVALID)) {
1727 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1728
1729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1730 (int)upl_f_offset + io_offset,
1731 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1732 } else {
1733 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1734 if ( !upl_valid_page(pl, (int)(zero_off1 / PAGE_SIZE_64))) {
1735 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1738 (int)upl_f_offset + io_offset,
1739 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1740 }
1741 }
1742 xfer_resid -= bytes_to_zero;
1743 zero_cnt1 -= bytes_to_zero;
1744 zero_off1 += bytes_to_zero;
1745 io_offset += bytes_to_zero;
1746 }
1747
1748 if (retval == 0) {
1749 int must_push;
1750 int can_delay;
1751
1752 io_size += start_offset;
1753
1754 if ((upl_f_offset + io_size) == newEOF && io_size < upl_size) {
1755 /*
1756 * if we're extending the file with this write
1757 * we'll zero fill the rest of the page so that
1758 * if the file gets extended again in such a way as to leave a
1759          * hole starting at this EOF, we'll have zeros in the correct spot
1760 */
1761 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1762
1763 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1764 (int)upl_f_offset + io_size,
1765 upl_size - io_size, 0, 0, 0);
1766 }
1767 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1768 panic("cluster_write: kernel_upl_unmap failed\n");
1769
1770 io_size_before_rounding = io_size;
1771
1772 if (io_size & (devblocksize - 1))
1773 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1774
1775 must_push = 0;
1776 can_delay = 0;
1777
1778 if (vp->v_clen) {
1779 int newsize;
1780
1781 /*
1782 * we have an existing cluster... see if this write will extend it nicely
1783 */
1784 if (start_blkno >= vp->v_cstart) {
1785 if (last_blkno <= (vp->v_cstart + vp->v_clen)) {
1786 /*
1787 * we have a write that fits entirely
1788 * within the existing cluster limits
1789 */
1790 if (last_blkno >= vp->v_lastw) {
1791 /*
1792 * if we're extending the dirty region within the cluster
1793 * we need to update the cluster info... we check for blkno
1794 * equality because we may be extending the file with a
1795 * partial write.... this in turn changes our idea of how
1796 * much data to write out (v_ciosiz) for the last page
1797 */
1798 vp->v_lastw = last_blkno;
1799 newsize = io_size + ((start_blkno - vp->v_cstart) * PAGE_SIZE);
1800
1801 if (newsize > vp->v_ciosiz)
1802 vp->v_ciosiz = newsize;
1803 }
1804 can_delay = 1;
1805 goto finish_io;
1806 }
1807 if (start_blkno < (vp->v_cstart + vp->v_clen)) {
1808 /*
1809 * we have a write that starts in the middle of the current cluster
1810 * but extends beyond the cluster's limit
1811 * we'll clip the current cluster if we actually
1812 * overlap with the new write and then push it out
1813 * and start a new cluster with the current write
1814 */
1815 if (vp->v_lastw > start_blkno) {
1816 vp->v_lastw = start_blkno;
1817 vp->v_ciosiz = (vp->v_lastw - vp->v_cstart) * PAGE_SIZE;
1818 }
1819 }
1820 /*
1821 * we also get here for the case where the current write starts
1822 * beyond the limit of the existing cluster
1823 */
1824 must_push = 1;
1825 goto check_delay;
1826 }
1827 /*
1828 * the current write starts in front of the current cluster
1829 */
1830 if (last_blkno > vp->v_cstart) {
1831 /*
1832 * the current write extends into the existing cluster
1833 */
1834 if ((vp->v_lastw - start_blkno) > vp->v_clen) {
1835 /*
1836 * if we were to combine this write with the current cluster
1837 * we would exceed the cluster size limit....
1838 * clip the current cluster by moving the start position
1839 * to where the current write ends, and then push it
1840 */
1841 vp->v_ciosiz -= (last_blkno - vp->v_cstart) * PAGE_SIZE;
1842 vp->v_cstart = last_blkno;
1843
1844 /*
1845 * round up the io_size to the nearest page size
1846 * since we've coalesced with at least 1 pre-existing
1847 * page in the current cluster... this write may have ended in the
1848 * middle of the page which would cause io_size to give us an
1849 * inaccurate view of how much I/O we actually need to do
1850 */
1851 io_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1852
1853 must_push = 1;
1854 goto check_delay;
1855 }
1856 /*
1857 * we can coalesce the current write with the existing cluster
1858 * adjust the cluster info to reflect this
1859 */
1860 if (last_blkno > vp->v_lastw) {
1861 /*
1862          * the current write completely overlaps
1863 * the existing cluster
1864 */
1865 vp->v_lastw = last_blkno;
1866 vp->v_ciosiz = io_size;
1867 } else {
1868 vp->v_ciosiz += (vp->v_cstart - start_blkno) * PAGE_SIZE;
1869
1870 if (io_size > vp->v_ciosiz)
1871 vp->v_ciosiz = io_size;
1872 }
1873 vp->v_cstart = start_blkno;
1874 can_delay = 1;
1875 goto finish_io;
1876 }
1877 /*
1878 * this I/O range is entirely in front of the current cluster
1879 * so we need to push the current cluster out before beginning
1880 * a new one
1881 */
1882 must_push = 1;
1883 }
1884 check_delay:
1885 if (must_push)
1886 cluster_push(vp);
1887
1888 if (io_size_before_rounding < (MAX_UPL_TRANSFER * PAGE_SIZE) && !(flags & IO_SYNC)) {
1889 vp->v_clen = MAX_UPL_TRANSFER;
1890 vp->v_cstart = start_blkno;
1891 vp->v_lastw = last_blkno;
1892 vp->v_ciosiz = io_size;
1893
1894 can_delay = 1;
1895 }
1896 finish_io:
1897 if (can_delay) {
1898 ubc_upl_commit_range(upl, 0, upl_size,
1899 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1900 continue;
1901 }
1902 if (flags & IO_SYNC)
1903 io_flags = CL_COMMIT | CL_AGE;
1904 else
1905 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1906
1907 if (vp->v_flag & VNOCACHE_DATA)
1908 io_flags |= CL_DUMP;
1909
1910 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1911 vp->v_flag |= VTHROTTLED;
1912 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1913 }
1914 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size,
1915 io_flags, (struct buf *)0);
1916 }
1917 }
1918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1919 retval, 0, 0, 0, 0);
1920
1921 return (retval);
1922 }
1923
1924 cluster_read(vp, uio, filesize, devblocksize, flags)
1925 struct vnode *vp;
1926 struct uio *uio;
1927 off_t filesize;
1928 int devblocksize;
1929 int flags;
1930 {
1931 int prev_resid;
1932 int clip_size;
1933 off_t max_io_size;
1934 struct iovec *iov;
1935 vm_offset_t upl_offset;
1936 int upl_size;
1937 int pages_in_pl;
1938 upl_page_info_t *pl;
1939 int upl_flags;
1940 upl_t upl;
1941 int retval = 0;
1942
1943 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1944 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1945
1946 /*
1947  * We set a threshold of 4 pages to decide if the nocopy
1948 * read loop is worth the trouble...
1949 */
1950
1951 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1952 {
1953 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1954 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1955 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1956 return(retval);
1957 }
1958
1959 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
1960 {
1961 /* we know we have a resid, so this is safe */
1962 iov = uio->uio_iov;
1963 while (iov->iov_len == 0) {
1964 uio->uio_iov++;
1965 uio->uio_iovcnt--;
1966 iov = uio->uio_iov;
1967 }
1968
1969 /*
1970 * We check every vector target and if it is physically
1971 * contiguous space, we skip the sanity checks.
1972 */
1973
1974 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1975 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1976 pages_in_pl = 0;
1977 upl_flags = UPL_QUERY_OBJECT_TYPE;
1978 if((vm_map_get_upl(current_map(),
1979 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1980 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1981 {
1982 /*
1983 * the user app must have passed in an invalid address
1984 */
1985 return (EFAULT);
1986 }
1987
1988 if (upl_flags & UPL_PHYS_CONTIG)
1989 {
1990 retval = cluster_phys_read(vp, uio, filesize);
1991 }
1992 else if (uio->uio_resid < 4 * PAGE_SIZE)
1993 {
1994 /*
1995        * We set a threshold of 4 pages to decide if the nocopy
1996 * read loop is worth the trouble...
1997 */
1998 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1999 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2000 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2001 return(retval);
2002 }
2003 else if (uio->uio_offset & PAGE_MASK_64)
2004 {
2005        /* Bring the file offset up to a pagesize boundary with this read */
2006 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2007 if (uio->uio_resid < clip_size)
2008 clip_size = uio->uio_resid;
2009 /*
2010 * Fake the resid going into the cluster_read_x call
2011 * and restore it on the way out.
2012 */
2013 prev_resid = uio->uio_resid;
2014 uio->uio_resid = clip_size;
2015 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2016 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2017 }
2018 else if ((int)iov->iov_base & PAGE_MASK_64)
2019 {
2020 clip_size = iov->iov_len;
2021 prev_resid = uio->uio_resid;
2022 uio->uio_resid = clip_size;
2023 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2024 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2025 }
2026 else
2027 {
2028 /*
2029 * If we come in here, we know the offset into
2030 * the file is on a pagesize boundary
2031 */
2032
2033 max_io_size = filesize - uio->uio_offset;
2034 clip_size = uio->uio_resid;
2035 if (iov->iov_len < clip_size)
2036 clip_size = iov->iov_len;
2037 if (max_io_size < clip_size)
2038 clip_size = (int)max_io_size;
2039
2040 if (clip_size < PAGE_SIZE)
2041 {
2042 /*
2043 * Take care of the tail end of the read in this vector.
2044 */
2045 prev_resid = uio->uio_resid;
2046 uio->uio_resid = clip_size;
2047 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2048 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2049 }
2050 else
2051 {
2052 /* round clip_size down to a multiple of pagesize */
2053 clip_size = clip_size & ~(PAGE_MASK);
2054 prev_resid = uio->uio_resid;
2055 uio->uio_resid = clip_size;
2056 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
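                  /*
                   * if the nocopy read left part of the request unsatisfied
                   * (e.g. it ran into pages that are already resident in
                   * the cache), finish the remainder through the cached path
                   */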
2057 if ((retval==0) && uio->uio_resid)
2058 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2059 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2060 }
2061 } /* end else */
2062 } /* end while */
2063
2064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2065 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2066
2067 return(retval);
2068 }
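
/*
 * Hypothetical usage sketch (not part of this file): a filesystem's
 * VOP_READ entry point would typically hand the request off to
 * cluster_read once it knows the current file size and the device
 * block size, roughly
 *
 *	error = cluster_read(vp, uio, (off_t)file_size,
 *	                     dev_block_size, ap->a_ioflag);
 *
 * where file_size, dev_block_size and ap->a_ioflag stand in for the
 * calling filesystem's own state.
 */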
2069
2070 static
2071 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2072 struct vnode *vp;
2073 struct uio *uio;
2074 off_t filesize;
2075 int devblocksize;
2076 int flags;
2077 {
2078 upl_page_info_t *pl;
2079 upl_t upl;
2080 vm_offset_t upl_offset;
2081 int upl_size;
2082 off_t upl_f_offset;
2083 int start_offset;
2084 int start_pg;
2085 int last_pg;
2086 int uio_last;
2087 int pages_in_upl;
2088 off_t max_size;
2089 int io_size;
2090 vm_offset_t io_address;
2091 kern_return_t kret;
2092 int segflg;
2093 int error = 0;
2094 int retval = 0;
2095 int b_lblkno;
2096 int e_lblkno;
2097
2098 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2099
2100 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2101 /*
2102 * compute the size of the upl needed to encompass
2103 * the requested read... limit each call to cluster_io
2104 * to the maximum UPL size... cluster_io will clip if
2105 * this exceeds the maximum io_size for the device,
2106          * this exceeds the maximum io_size for the device...
2107          * make sure to account for
2108 */
2109 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2110 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2111 max_size = filesize - uio->uio_offset;
2112
2113 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2114 io_size = uio->uio_resid;
2115 else
2116 io_size = max_size;
2117 #ifdef ppc
2118 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2119 segflg = uio->uio_segflg;
2120
2121 uio->uio_segflg = UIO_PHYS_USERSPACE;
2122
2123 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2124 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2125
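                  /*
                   * try to satisfy the read directly from pages that are
                   * already resident in the cache... ubc_page_op hands back
                   * the physical address of a resident page (and marks it
                   * busy), the data is copied out with uiomove and the page
                   * is then released
                   */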
2126 while (io_size && retval == 0) {
2127 int xsize;
2128 vm_offset_t paddr;
2129
2130 if (ubc_page_op(vp,
2131 upl_f_offset,
2132 UPL_POP_SET | UPL_POP_BUSY,
2133 &paddr, 0) != KERN_SUCCESS)
2134 break;
2135
2136 xsize = PAGE_SIZE - start_offset;
2137
2138 if (xsize > io_size)
2139 xsize = io_size;
2140
2141 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2142
2143 ubc_page_op(vp, upl_f_offset,
2144 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2145
2146 io_size -= xsize;
2147 start_offset = (int)
2148 (uio->uio_offset & PAGE_MASK_64);
2149 upl_f_offset = uio->uio_offset - start_offset;
2150 }
2151 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2152 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2153
2154 uio->uio_segflg = segflg;
2155
2156 if (retval)
2157 break;
2158
2159 if (io_size == 0) {
2160 /*
2161                          * we're already finished with this read request...
2162 * let's see if we should do a read-ahead
2163 */
2164 e_lblkno = (int)
2165 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2166
2167 if (!(vp->v_flag & VRAOFF))
2168 /*
2169 * let's try to read ahead if we're in
2170 * a sequential access pattern
2171 */
2172 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2173 vp->v_lastr = e_lblkno;
2174
2175 break;
2176 }
2177 max_size = filesize - uio->uio_offset;
2178 }
2179 #endif
2180 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2181 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2182 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2183 pages_in_upl = upl_size / PAGE_SIZE;
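          /*
           * e.g. with 4K pages, a 0x200 byte start_offset combined with a
           * 0x1000 byte io_size rounds up to a 2 page (0x2000 byte) upl,
           * since the transfer straddles a page boundary
           */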
2184
2185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2186 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2187
2188 kret = ubc_create_upl(vp,
2189 upl_f_offset,
2190 upl_size,
2191 &upl,
2192 &pl,
2193 UPL_FLAGS_NONE);
2194 if (kret != KERN_SUCCESS)
2195 panic("cluster_read: failed to get pagelist");
2196
2197 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2198 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2199
2200 /*
2201 * scan from the beginning of the upl looking for the first
2202 * non-valid page.... this will become the first page in
2203 * the request we're going to make to 'cluster_io'... if all
2204 * of the pages are valid, we won't call through to 'cluster_io'
2205 */
2206 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2207 if (!upl_valid_page(pl, start_pg))
2208 break;
2209 }
2210
2211 /*
2212 * scan from the starting invalid page looking for a valid
2213          * page before the end of the upl is reached... if we
2214 * find one, then it will be the last page of the request to
2215 * 'cluster_io'
2216 */
2217 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2218 if (upl_valid_page(pl, last_pg))
2219 break;
2220 }
2221
2222 if (start_pg < last_pg) {
2223 /*
2224                  * we found a range of 'invalid' pages that must be filled...
2225                  * if the last page in this range is the last page of the file,
2226 * we may have to clip the size of it to keep from reading past
2227 * the end of the last physical block associated with the file
2228 */
2229 upl_offset = start_pg * PAGE_SIZE;
2230 io_size = (last_pg - start_pg) * PAGE_SIZE;
2231
2232 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2233 io_size = filesize - (upl_f_offset + upl_offset);
2234 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2235 }
2236 /*
2237 * issue a synchronous read to cluster_io
2238 */
2239
2240 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2241 io_size, CL_READ, (struct buf *)0);
2242 }
2243 if (error == 0) {
2244 /*
2245 * if the read completed successfully, or there was no I/O request
2246                  * issued, then map the upl into kernel address space and
2247 * move the data into user land.... we'll first add on any 'valid'
2248 * pages that were present in the upl when we acquired it.
2249 */
2250 u_int val_size;
2251 u_int size_of_prefetch;
2252
2253 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2254 if (!upl_valid_page(pl, uio_last))
2255 break;
2256 }
2257 /*
2258 * compute size to transfer this round, if uio->uio_resid is
2259 * still non-zero after this uiomove, we'll loop around and
2260 * set up for another I/O.
2261 */
2262 val_size = (uio_last * PAGE_SIZE) - start_offset;
2263
2264 if (max_size < val_size)
2265 val_size = max_size;
2266
2267 if (uio->uio_resid < val_size)
2268 val_size = uio->uio_resid;
2269
2270 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2271
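                  /*
                   * note that the assignment below is intended...
                   * size_of_prefetch doubles as the test for whether any of
                   * this request remains beyond what we're about to copy out
                   */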
2272 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2273 /*
2274 * if there's still I/O left to do for this request, then issue a
2275 * pre-fetch I/O... the I/O wait time will overlap
2276 * with the copying of the data
2277 */
2278 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2279 } else {
2280 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2281 /*
2282 * let's try to read ahead if we're in
2283 * a sequential access pattern
2284 */
2285 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2286 vp->v_lastr = e_lblkno;
2287 }
2288 #ifdef ppc
2289 if (uio->uio_segflg == UIO_USERSPACE) {
2290 int offset;
2291
2292 segflg = uio->uio_segflg;
2293
2294 uio->uio_segflg = UIO_PHYS_USERSPACE;
2295
2296
2297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2298 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2299
2300 offset = start_offset;
2301
2302 while (val_size && retval == 0) {
2303 int csize;
2304 int i;
2305 caddr_t paddr;
2306
2307 i = offset / PAGE_SIZE;
2308 csize = min(PAGE_SIZE - start_offset, val_size);
2309
2310 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2311
2312 retval = uiomove(paddr, csize, uio);
2313
2314 val_size -= csize;
2315 offset += csize;
2316 start_offset = offset & PAGE_MASK;
2317 }
2318 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2319 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2320
2321 uio->uio_segflg = segflg;
2322 } else
2323 #endif
2324 {
2325 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2326 panic("cluster_read: ubc_upl_map() failed\n");
2327
2328 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2329
2330 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2331 panic("cluster_read: ubc_upl_unmap() failed\n");
2332 }
2333 }
2334 if (start_pg < last_pg) {
2335 /*
2336 * compute the range of pages that we actually issued an I/O for
2337 * and either commit them as valid if the I/O succeeded
2338 * or abort them if the I/O failed
2339 */
2340 io_size = (last_pg - start_pg) * PAGE_SIZE;
2341
2342 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2343 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2344
2345 if (error || (vp->v_flag & VNOCACHE_DATA))
2346 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2347 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2348 else
2349 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2350 UPL_COMMIT_CLEAR_DIRTY
2351 | UPL_COMMIT_FREE_ON_EMPTY
2352 | UPL_COMMIT_INACTIVATE);
2353
2354 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2355 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2356 }
2357 if ((last_pg - start_pg) < pages_in_upl) {
2358 int cur_pg;
2359 int commit_flags;
2360
2361 /*
2362 * the set of pages that we issued an I/O for did not encompass
2363 * the entire upl... so just release these without modifying
2364                  * their state
2365 */
2366 if (error)
2367 ubc_upl_abort(upl, 0);
2368 else {
2369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2370 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2371
2372 if (start_pg) {
2373 /*
2374 * we found some already valid pages at the beginning of
2375                          * the upl... commit these back to the inactive list with
2376 * reference cleared
2377 */
2378 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2379 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2380 | UPL_COMMIT_INACTIVATE;
2381
2382 if (upl_dirty_page(pl, cur_pg))
2383 commit_flags |= UPL_COMMIT_SET_DIRTY;
2384
2385 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2386 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2387 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2388 else
2389 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2390 PAGE_SIZE, commit_flags);
2391 }
2392 }
2393 if (last_pg < uio_last) {
2394 /*
2395 * we found some already valid pages immediately after the
2396                          * pages we issued I/O for... commit these back to the
2397 * inactive list with reference cleared
2398 */
2399 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2400 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2401 | UPL_COMMIT_INACTIVATE;
2402
2403 if (upl_dirty_page(pl, cur_pg))
2404 commit_flags |= UPL_COMMIT_SET_DIRTY;
2405
2406 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2407 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2408 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2409 else
2410 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2411 PAGE_SIZE, commit_flags);
2412 }
2413 }
2414 if (uio_last < pages_in_upl) {
2415 /*
2416 * there were some invalid pages beyond the valid pages
2417                          * that we didn't issue an I/O for... just release them
2418 * unchanged
2419 */
2420 ubc_upl_abort(upl, 0);
2421 }
2422
2423 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2424 upl, -1, -1, 0, 0);
2425 }
2426 }
2427 if (retval == 0)
2428 retval = error;
2429 }
2430
2431 return (retval);
2432 }
2433
2434 static
2435 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2436 struct vnode *vp;
2437 struct uio *uio;
2438 off_t filesize;
2439 int devblocksize;
2440 int flags;
2441 {
2442 upl_t upl;
2443 upl_page_info_t *pl;
2444 off_t upl_f_offset;
2445 vm_offset_t upl_offset;
2446 off_t start_upl_f_offset;
2447 off_t max_io_size;
2448 int io_size;
2449 int upl_size;
2450 int upl_needed_size;
2451 int pages_in_pl;
2452 vm_offset_t paddr;
2453 int upl_flags;
2454 kern_return_t kret;
2455 int segflg;
2456 struct iovec *iov;
2457 int i;
2458 int force_data_sync;
2459 int error = 0;
2460 int retval = 0;
2461
2462 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2463 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2464
2465 /*
2466 * When we enter this routine, we know
2467 * -- the offset into the file is on a pagesize boundary
2468 * -- the resid is a page multiple
2469 * -- the resid will not exceed iov_len
2470 */
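  /*
   * (these invariants are established by cluster_read, which only
   * dispatches here after checking the file offset and user buffer
   * alignment and rounding the clipped resid down to a page multiple)
   */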
2471
2472 iov = uio->uio_iov;
2473 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2474
2475 max_io_size = filesize - uio->uio_offset;
2476
2477 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2478 io_size = max_io_size;
2479 else
2480 io_size = uio->uio_resid;
2481
2482 /*
2483 * We don't come into this routine unless
2484 * UIO_USERSPACE is set.
2485 */
2486 segflg = uio->uio_segflg;
2487
2488 uio->uio_segflg = UIO_PHYS_USERSPACE;
2489
2490 /*
2491 * First look for pages already in the cache
2492 * and move them to user space.
2493 */
2494 while (io_size && (retval == 0)) {
2495 upl_f_offset = uio->uio_offset;
2496
2497 /*
2498 * If this call fails, it means the page is not
2499 * in the page cache.
2500 */
2501 if (ubc_page_op(vp, upl_f_offset,
2502 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2503 break;
2504
2505 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2506
2507 ubc_page_op(vp, upl_f_offset,
2508 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2509
2510 io_size -= PAGE_SIZE;
2511 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2512 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2513 }
2514
2515 uio->uio_segflg = segflg;
2516
2517 if (retval)
2518 {
2519 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2520 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2521 return(retval);
2522 }
2523
2524 /* If we are already finished with this read, then return */
2525 if (io_size == 0)
2526 {
2527
2528 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2529 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2530 return(0);
2531 }
2532
2533 max_io_size = io_size;
2534 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2535 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2536
2537 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2538 upl_f_offset = start_upl_f_offset;
2539 io_size = 0;
2540
2541 while(io_size < max_io_size)
2542 {
2543
2544 if(ubc_page_op(vp, upl_f_offset,
2545 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2546 {
2547 ubc_page_op(vp, upl_f_offset,
2548 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2549 break;
2550 }
2551
2552 /*
2553 * Build up the io request parameters.
2554 */
2555
2556 io_size += PAGE_SIZE;
2557 upl_f_offset += PAGE_SIZE;
2558 }
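        /*
         * at this point the range starting at start_upl_f_offset and
         * covering io_size bytes contains only pages that are not already
         * resident in the cache... these are the pages we'll try to read
         * directly into the user's buffer
         */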
2559
2560 if (io_size == 0)
2561 return(retval);
2562
2563 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2564 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2565
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2567 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
2568
2569 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2570 {
2571 pages_in_pl = 0;
2572 upl_size = upl_needed_size;
2573 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2574
2575 kret = vm_map_get_upl(current_map(),
2576 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2577 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2578
2579 if (kret != KERN_SUCCESS)
2580 {
2581 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2582 (int)upl_offset, upl_size, io_size, kret, 0);
2583
2584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2585 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2586
2587 /* cluster_nocopy_read: failed to get pagelist */
2588 /* do not return kret here */
2589 return(retval);
2590 }
2591
2592 pages_in_pl = upl_size / PAGE_SIZE;
2593 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2594
2595 for(i=0; i < pages_in_pl; i++)
2596 {
2597 if (!upl_valid_page(pl, i))
2598 break;
2599 }
2600 if (i == pages_in_pl)
2601 break;
2602
2603 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2604 UPL_ABORT_FREE_ON_EMPTY);
2605 }
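        /*
         * the loop above retries vm_map_get_upl with increasing
         * force_data_sync values until every page backing the user's
         * buffer shows up valid in the returned page list... if three
         * attempts aren't enough, we bail out below and let the caller
         * satisfy the remainder through the cached path
         */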
2606
2607 if (force_data_sync >= 3)
2608 {
2609 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2610 (int)upl_offset, upl_size, io_size, kret, 0);
2611
2612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2613 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2614 return(retval);
2615 }
2616 /*
2617 * Consider the possibility that upl_size wasn't satisfied.
2618 */
2619 if (upl_size != upl_needed_size)
2620 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2621
2622 if (io_size == 0)
2623 {
2624 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2625 UPL_ABORT_FREE_ON_EMPTY);
2626 return(retval);
2627 }
2628
2629 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2630 (int)upl_offset, upl_size, io_size, kret, 0);
2631
2632 /*
2633 * issue a synchronous read to cluster_io
2634 */
2635
2636 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2637 upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2638
2639 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2640 io_size, CL_READ| CL_NOZERO, (struct buf *)0);
2641
2642 if (error == 0) {
2643 /*
2644 * The cluster_io read completed successfully,
2645 * update the uio structure and commit.
2646 */
2647
2648 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2649 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2650
2651 iov->iov_base += io_size;
2652 iov->iov_len -= io_size;
2653 uio->uio_resid -= io_size;
2654 uio->uio_offset += io_size;
2655 }
2656 else {
2657 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2658 UPL_ABORT_FREE_ON_EMPTY);
2659 }
2660
2661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2662 upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2663
2664 if (retval == 0)
2665 retval = error;
2666
2667 } /* end while */
2668
2669
2670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2671 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2672
2673 return (retval);
2674 }
2675
2676
2677 static
2678 cluster_phys_read(vp, uio, filesize)
2679 struct vnode *vp;
2680 struct uio *uio;
2681 off_t filesize;
2682 {
2683 upl_t upl;
2684 vm_offset_t upl_offset;
2685 off_t max_size;
2686 int io_size;
2687 int upl_size;
2688 int upl_needed_size;
2689 int pages_in_pl;
2690 int upl_flags;
2691 kern_return_t kret;
2692 struct iovec *iov;
2693 int error;
2694
2695 /*
2696 * When we enter this routine, we know
2697 * -- the resid will not exceed iov_len
2698 * -- the target address is physically contiguous
2699 */
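  /*
   * (cluster_read only dispatches here after vm_map_get_upl has reported
   * UPL_PHYS_CONTIG for the user's buffer)
   */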
2700
2701 iov = uio->uio_iov;
2702
2703 max_size = filesize - uio->uio_offset;
2704
2705 if (max_size < (off_t)((unsigned int)iov->iov_len))
2706 io_size = max_size;
2707 else
2708 io_size = iov->iov_len;
2709
2710 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2711 upl_needed_size = upl_offset + io_size;
2712
2713 pages_in_pl = 0;
2714 upl_size = upl_needed_size;
2715 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2716
2717 kret = vm_map_get_upl(current_map(),
2718 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2719 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2720
2721 if (kret != KERN_SUCCESS)
2722 {
2723 /* cluster_phys_read: failed to get pagelist */
2724 return(EINVAL);
2725 }
2726
2727 /*
2728 * Consider the possibility that upl_size wasn't satisfied.
2729 */
2730 if (upl_size < upl_needed_size)
2731 {
2732 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2733 return(EINVAL);
2734 }
2735
2736 /*
2737 * issue a synchronous read to cluster_io
2738 */
2739
2740 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2741 io_size, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2742
2743 if (error == 0)
2744 {
2745 /*
2746 * The cluster_io read completed successfully,
2747 * update the uio structure and commit.
2748 */
2749
2750 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2751
2752 iov->iov_base += io_size;
2753 iov->iov_len -= io_size;
2754 uio->uio_resid -= io_size;
2755 uio->uio_offset += io_size;
2756 }
2757 else
2758 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2759
2760 return (error);
2761 }
2762
2763 /*
2764  * generate advisory I/O's in the largest chunks possible...
2765 * the completed pages will be released into the VM cache
2766 */
2767 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2768 struct vnode *vp;
2769 off_t filesize;
2770 off_t f_offset;
2771 int resid;
2772 int devblocksize;
2773 {
2774 upl_page_info_t *pl;
2775 upl_t upl;
2776 vm_offset_t upl_offset;
2777 int upl_size;
2778 off_t upl_f_offset;
2779 int start_offset;
2780 int start_pg;
2781 int last_pg;
2782 int pages_in_upl;
2783 off_t max_size;
2784 int io_size;
2785 kern_return_t kret;
2786 int retval = 0;
2787
2788
2789 if (!UBCINFOEXISTS(vp))
2790 return(EINVAL);
2791
2792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2793 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2794
2795 while (resid && f_offset < filesize && retval == 0) {
2796 /*
2797 * compute the size of the upl needed to encompass
2798 * the requested read... limit each call to cluster_io
2799 * to the maximum UPL size... cluster_io will clip if
2800          * this exceeds the maximum io_size for the device...
2801          * make sure to account for
2802 * a starting offset that's not page aligned
2803 */
2804 start_offset = (int)(f_offset & PAGE_MASK_64);
2805 upl_f_offset = f_offset - (off_t)start_offset;
2806 max_size = filesize - f_offset;
2807
2808 if (resid < max_size)
2809 io_size = resid;
2810 else
2811 io_size = max_size;
2812
2813 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2814 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2815 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2816 pages_in_upl = upl_size / PAGE_SIZE;
2817
2818 kret = ubc_create_upl(vp,
2819 upl_f_offset,
2820 upl_size,
2821 &upl,
2822 &pl,
2823 UPL_FLAGS_NONE);
2824 if (kret != KERN_SUCCESS)
2825 panic("advisory_read: failed to get pagelist");
2826
2827
2828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2829 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2830
2831 /*
2832 * scan from the beginning of the upl looking for the first
2833 * non-valid page.... this will become the first page in
2834 * the request we're going to make to 'cluster_io'... if all
2835 * of the pages are valid, we won't call through to 'cluster_io'
2836 */
2837 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2838 if (!upl_valid_page(pl, start_pg))
2839 break;
2840 }
2841
2842 /*
2843 * scan from the starting invalid page looking for a valid
2844          * page before the end of the upl is reached... if we
2845 * find one, then it will be the last page of the request to
2846 * 'cluster_io'
2847 */
2848 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2849 if (upl_valid_page(pl, last_pg))
2850 break;
2851 }
2852
2853 if (start_pg < last_pg) {
2854 /*
2855                  * we found a range of 'invalid' pages that must be filled...
2856                  * if the last page in this range is the last page of the file,
2857 * we may have to clip the size of it to keep from reading past
2858 * the end of the last physical block associated with the file
2859 */
2860 upl_offset = start_pg * PAGE_SIZE;
2861 io_size = (last_pg - start_pg) * PAGE_SIZE;
2862
2863 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2864 io_size = filesize - (upl_f_offset + upl_offset);
2865 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2866 }
2867 /*
2868 * issue an asynchronous read to cluster_io
2869 */
2870 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
2871 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2872 }
2873 if (start_pg) {
2874 /*
2875 * start_pg of non-zero indicates we found some already valid pages
2876 * at the beginning of the upl.... we need to release these without
2877                  * modifying their state
2878 */
2879 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE,
2880 UPL_ABORT_FREE_ON_EMPTY);
2881
2882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 62)) | DBG_FUNC_NONE,
2883 upl, 0, start_pg * PAGE_SIZE, 0, 0);
2884 }
2885 if (last_pg < pages_in_upl) {
2886 /*
2887 * the set of pages that we issued an I/O for did not extend all the
2888                  * way to the end of the upl... so just release them without modifying
2889                  * their state
2890 */
2891 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
2892 UPL_ABORT_FREE_ON_EMPTY);
2893
2894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 63)) | DBG_FUNC_NONE,
2895 upl, last_pg * PAGE_SIZE,
2896 (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
2897 }
2898 io_size = (last_pg * PAGE_SIZE) - start_offset;
2899
2900 if (io_size > resid)
2901 io_size = resid;
2902 f_offset += io_size;
2903 resid -= io_size;
2904 }
2905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2906 (int)f_offset, resid, retval, 0, 0);
2907
2908 return(retval);
2909 }
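
/*
 * Hypothetical usage sketch (not part of this file): advisory_read is
 * meant for callers that want to pull file data into the VM cache
 * without copying it anywhere, e.g. a filesystem servicing a read-ahead
 * hint might do something like
 *
 *	advisory_read(vp, file_size, hint_offset, hint_length,
 *	              dev_block_size);
 *
 * where the arguments come from the caller's own bookkeeping.
 */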
2910
2911
2912 cluster_push(vp)
2913 struct vnode *vp;
2914 {
2915 upl_page_info_t *pl;
2916 upl_t upl;
2917 vm_offset_t upl_offset;
2918 int upl_size;
2919 off_t upl_f_offset;
2920 int pages_in_upl;
2921 int start_pg;
2922 int last_pg;
2923 int io_size;
2924 int io_flags;
2925 int size;
2926 kern_return_t kret;
2927
2928
2929 if (!UBCINFOEXISTS(vp))
2930 return(0);
2931
2932 if (vp->v_clen == 0 || (pages_in_upl = vp->v_lastw - vp->v_cstart) == 0)
2933 return (0);
2934 upl_size = pages_in_upl * PAGE_SIZE;
2935 upl_f_offset = ((off_t)vp->v_cstart) * PAGE_SIZE_64;
2936 size = vp->v_ciosiz;
2937 vp->v_clen = 0;
2938
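  /*
   * sanity check... v_ciosiz must account for all but at most one
   * partial page of the cluster we're about to push
   */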
2939 if (size > upl_size || (upl_size - size) > PAGE_SIZE)
2940 panic("cluster_push: v_ciosiz doesn't match size of cluster\n");
2941
2942 kret = ubc_create_upl(vp,
2943 upl_f_offset,
2944 upl_size,
2945 &upl,
2946 &pl,
2947 UPL_FLAGS_NONE);
2948 if (kret != KERN_SUCCESS)
2949 panic("cluster_push: failed to get pagelist");
2950
2951 last_pg = 0;
2952
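  /*
   * walk the upl looking for runs of valid dirty pages... runs of clean
   * (or invalid) pages are simply released, while each dirty run is
   * written out asynchronously through cluster_io
   */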
2953 while (size) {
2954
2955 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2956 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
2957 break;
2958 }
2959 if (start_pg > last_pg) {
2960 io_size = (start_pg - last_pg) * PAGE_SIZE;
2961
2962 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
2963 UPL_ABORT_FREE_ON_EMPTY);
2964
2965 if (io_size < size)
2966 size -= io_size;
2967 else
2968 break;
2969 }
2970 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2971 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
2972 break;
2973 }
2974 upl_offset = start_pg * PAGE_SIZE;
2975
2976 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
2977
2978 if (vp->v_flag & VNOCACHE_DATA)
2979 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
2980 else
2981 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2982
2983 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2984 vp->v_flag |= VTHROTTLED;
2985 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
2986 }
2987 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (struct buf *)0);
2988
2989 size -= io_size;
2990 }
2991 return(1);
2992 }