1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70 #include <mach/memory_object_types.h>
71
72 #include <sys/kdebug.h>
73
74
75 #define CL_READ 0x01
76 #define CL_ASYNC 0x02
77 #define CL_COMMIT 0x04
78 #define CL_NOMAP 0x08
79 #define CL_PAGEOUT 0x10
80 #define CL_AGE 0x20
81 #define CL_DUMP 0x40
82 #define CL_NOZERO 0x80
83 #define CL_PAGEIN 0x100
84
85 /*
86 * throttle the number of async writes that
87 * can be outstanding on a single vnode
88 * before we issue a synchronous write
89 */
90 #define ASYNC_THROTTLE 3
91
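/*
 * I/O completion handler for a cluster transaction.
 * all component buffers chained through b_trans_next must be complete
 * before any processing is done; the routine then accumulates the error
 * and residual counts, frees the component buffers, finishes off any
 * 'real' buffer waiting on the transaction, and commits or aborts the
 * associated upl range as appropriate.
 */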
92 static int
93 cluster_iodone(bp)
94 struct buf *bp;
95 {
96 int b_flags;
97 int error;
98 int total_size;
99 int total_resid;
100 int upl_offset;
101 upl_t upl;
102 struct buf *cbp;
103 struct buf *cbp_head;
104 struct buf *cbp_next;
105 struct buf *real_bp;
106 int commit_size;
107 int pg_offset;
108
109
110 cbp_head = (struct buf *)(bp->b_trans_head);
111
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
113 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
114
115 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
116 /*
117 * all I/O requests that are part of this transaction
118 * have to complete before we can process it
119 */
120 if ( !(cbp->b_flags & B_DONE)) {
121
122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
123 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
124
125 return 0;
126 }
127 }
128 error = 0;
129 total_size = 0;
130 total_resid = 0;
131
132 cbp = cbp_head;
133 upl_offset = cbp->b_uploffset;
134 upl = cbp->b_pagelist;
135 b_flags = cbp->b_flags;
136 real_bp = cbp->b_real_bp;
137
138 while (cbp) {
139 if (cbp->b_vectorcount > 1)
140 _FREE(cbp->b_vectorlist, M_SEGMENT);
141
142 if ((cbp->b_flags & B_ERROR) && error == 0)
143 error = cbp->b_error;
144
145 total_resid += cbp->b_resid;
146 total_size += cbp->b_bcount;
147
148 cbp_next = cbp->b_trans_next;
149
150 free_io_buf(cbp);
151
152 cbp = cbp_next;
153 }
154 if ((b_flags & B_NEED_IODONE) && real_bp) {
155 if (error) {
156 real_bp->b_flags |= B_ERROR;
157 real_bp->b_error = error;
158 }
159 real_bp->b_resid = total_resid;
160
161 biodone(real_bp);
162 }
163 if (error == 0 && total_resid)
164 error = EIO;
165
166 if (b_flags & B_COMMIT_UPL) {
167 pg_offset = upl_offset & PAGE_MASK;
168 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
169
170 if (error || (b_flags & B_NOCACHE)) {
171 int upl_abort_code;
172
173 if (b_flags & B_PAGEOUT)
174 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
175 else
176 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
177
178 kernel_upl_abort_range(upl, upl_offset - pg_offset, commit_size, upl_abort_code);
179
180 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
181 upl, upl_offset - pg_offset, commit_size,
182 0x80000000|upl_abort_code, 0);
183
184 } else {
185 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
186
187 if ( !(b_flags & B_PAGEOUT))
188 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
189 if (b_flags & B_AGE)
190 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
191
192 kernel_upl_commit_range(upl, upl_offset - pg_offset,
193 commit_size, upl_commit_flags,
194 UPL_GET_INTERNAL_PAGE_LIST(upl),
195 MAX_UPL_TRANSFER);
196
197 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
198 upl, upl_offset - pg_offset, commit_size,
199 upl_commit_flags, 0);
200 }
201 } else
202 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
203 upl, upl_offset, 0, error, 0);
204
205 return (error);
206 }
207
208
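/*
 * zero out 'size' bytes starting at 'upl_offset' within the given upl.
 * unless CL_NOMAP indicates the caller's buffer is already mapped,
 * the upl is temporarily mapped into the kernel map for the bzero.
 */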
209 static void
210 cluster_zero(upl, upl_offset, size, flags, bp)
211 upl_t upl;
212 vm_offset_t upl_offset;
213 int size;
214 int flags;
215 struct buf *bp;
216 {
217 vm_offset_t io_addr = 0;
218 kern_return_t kret;
219
220 if ( !(flags & CL_NOMAP)) {
221 kret = kernel_upl_map(kernel_map, upl, &io_addr);
222
223 if (kret != KERN_SUCCESS)
224 panic("cluster_zero: kernel_upl_map() failed with (%d)", kret);
225 if (io_addr == 0)
226 panic("cluster_zero: kernel_upl_map mapped 0");
227 } else
228 io_addr = (vm_offset_t)bp->b_data;
229 bzero((caddr_t)(io_addr + upl_offset), size);
230
231 if ( !(flags & CL_NOMAP)) {
232 kret = kernel_upl_unmap(kernel_map, upl);
233
234 if (kret != KERN_SUCCESS)
235 panic("cluster_zero: kernel_upl_unmap failed");
236 }
237 }
238
239
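/*
 * central I/O engine for the cluster layer.
 * breaks the request into device-contiguous chunks using VOP_CMAP,
 * zero-fills holes and the ragged tail of a read, builds a chain of
 * io bufs describing the upl pages, and issues them via VOP_STRATEGY.
 * the I/O is synchronous unless CL_ASYNC is set; CL_COMMIT causes the
 * upl range to be committed or aborted at completion time.
 */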
240 static int
241 cluster_io(vp, upl, upl_offset, f_offset, size, flags, real_bp)
242 struct vnode *vp;
243 upl_t upl;
244 vm_offset_t upl_offset;
245 off_t f_offset;
246 int size;
247 int flags;
248 struct buf *real_bp;
249 {
250 struct buf *cbp;
251 struct iovec *iovp;
252 int io_flags;
253 int error = 0;
254 int retval = 0;
255 struct buf *cbp_head = 0;
256 struct buf *cbp_tail = 0;
257 upl_page_info_t *pl;
258 int pg_count;
259 int pg_offset;
260
261 if (flags & CL_READ)
262 io_flags = (B_VECTORLIST | B_READ);
263 else
264 io_flags = (B_VECTORLIST | B_WRITEINPROG);
265
266 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
267
268 if (flags & CL_ASYNC)
269 io_flags |= (B_CALL | B_ASYNC);
270 if (flags & CL_AGE)
271 io_flags |= B_AGE;
272 if (flags & CL_DUMP)
273 io_flags |= B_NOCACHE;
274
275
276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
277 (int)f_offset, size, upl_offset, flags, 0);
278
279 if ((flags & CL_READ) && ((upl_offset + size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
280 /*
281 * then we are going to end up with a page that we can't completely
282 * fill (the file size wasn't a multiple of PAGE_SIZE and we're
283 * trying to read to the end of the file), so we'll go ahead
284 * and zero out the portion of the page we can't read in
285 * from the file
286 */
287 cluster_zero(upl, upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK), flags, real_bp);
288
289 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
290 upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK),
291 flags, real_bp, 0);
292 }
293 while (size) {
294 size_t io_size;
295 int vsize;
296 int i;
297 int pl_index;
298 int pg_resid;
299 int num_contig;
300 daddr_t lblkno;
301 daddr_t blkno;
302
303 if (size > MAXPHYSIO)
304 io_size = MAXPHYSIO;
305 else
306 io_size = size;
307
308 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
309 if (error == EOPNOTSUPP)
310 panic("VOP_CMAP Unimplemented");
311 break;
312 }
313
314 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
315 (int)f_offset, (int)blkno, io_size, 0, 0);
316
317 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
318 error = EINVAL;
319 break;
320 }
321 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
322 /*
323 * we have now figured out how much I/O we can do - this is in 'io_size'
324 * pl_index represents the first page in the 'upl' that the I/O will occur for
325 * pg_offset is the starting point in the first page for the I/O
326 * pg_count is the number of full and partial pages that 'io_size' encompasses
327 */
328 pl_index = upl_offset / PAGE_SIZE;
329 pg_offset = upl_offset & PAGE_MASK;
330 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
331
332 if ((flags & CL_READ) && (long)blkno == -1) {
333 /*
334 * if we're reading and blkno == -1, then we've got a
335 * 'hole' in the file that we need to deal with by zeroing
336 * out the affected area in the upl
337 */
338 cluster_zero(upl, upl_offset, io_size, flags, real_bp);
339
340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
341 upl_offset, io_size, flags, real_bp, 0);
342
343 pg_count = (io_size - pg_offset) / PAGE_SIZE;
344
345 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
346 pg_count++;
347
348 if (pg_count) {
349 if (pg_offset)
350 pg_resid = PAGE_SIZE - pg_offset;
351 else
352 pg_resid = 0;
353 if (flags & CL_COMMIT)
354 kernel_upl_commit_range(upl,
355 upl_offset + pg_resid,
356 pg_count * PAGE_SIZE,
357 UPL_COMMIT_CLEAR_DIRTY
358 | UPL_COMMIT_FREE_ON_EMPTY,
359 pl, MAX_UPL_TRANSFER);
360 }
361 upl_offset += io_size;
362 f_offset += io_size;
363 size -= io_size;
364
365 if (cbp_head && pg_count)
366 goto start_io;
367 continue;
368 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
369 real_bp->b_blkno = blkno;
370 }
371 if (pg_count > 1) {
372 /*
373 * we need to allocate space for the vector list
374 */
375 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
376 M_SEGMENT, M_NOWAIT);
377 if (iovp == (struct iovec *) 0) {
378 /*
379 * if the allocation fails, then throttle down to a single page
380 */
381 io_size = PAGE_SIZE - pg_offset;
382 pg_count = 1;
383 }
384 }
385 cbp = alloc_io_buf(vp);
386
387
388 if (pg_count == 1)
389 /*
390 * we use the io vector that's reserved in the buffer header
391 * this ensures we can always issue an I/O even in a low memory
392 * condition that prevents the _MALLOC from succeeding... this
393 * is necessary to prevent deadlocks with the pager
394 */
395 iovp = (struct iovec *)(&cbp->b_vects[0]);
396
397 cbp->b_vectorlist = (void *)iovp;
398 cbp->b_vectorcount = pg_count;
399
400 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
401 int psize;
402
403 psize = PAGE_SIZE - pg_offset;
404
405 if (psize > vsize)
406 psize = vsize;
407
408 iovp->iov_len = psize;
409 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
410
411 if (iovp->iov_base == (caddr_t) 0) {
412 if (pg_count > 1)
413 _FREE(cbp->b_vectorlist, M_SEGMENT);
414 free_io_buf(cbp);
415
416 error = EINVAL;
417 break;
418 }
419 iovp->iov_base += pg_offset;
420 pg_offset = 0;
421
422 if (flags & CL_PAGEOUT) {
423 int s;
424 struct buf *bp;
425
426 s = splbio();
427 if (bp = incore(vp, lblkno + i)) {
428 if (!ISSET(bp->b_flags, B_BUSY)) {
429 bremfree(bp);
430 SET(bp->b_flags, (B_BUSY | B_INVAL));
431 splx(s);
432 brelse(bp);
433 } else
434 panic("BUSY bp found in cluster_io");
435 }
436 splx(s);
437 }
438 vsize -= psize;
439 }
440 if (error)
441 break;
442
443 if (flags & CL_ASYNC)
444 cbp->b_iodone = (void *)cluster_iodone;
445 cbp->b_flags |= io_flags;
446
447 cbp->b_lblkno = lblkno;
448 cbp->b_blkno = blkno;
449 cbp->b_bcount = io_size;
450 cbp->b_pagelist = upl;
451 cbp->b_uploffset = upl_offset;
452 cbp->b_trans_next = (struct buf *)0;
453
454 if (flags & CL_READ)
455 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
456 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
457 else
458 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
459 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
460
461 if (cbp_head) {
462 cbp_tail->b_trans_next = cbp;
463 cbp_tail = cbp;
464 } else {
465 cbp_head = cbp;
466 cbp_tail = cbp;
467 }
468 (struct buf *)(cbp->b_trans_head) = cbp_head;
469
470 upl_offset += io_size;
471 f_offset += io_size;
472 size -= io_size;
473
474 if ( !(upl_offset & PAGE_MASK) || size == 0) {
475 /*
476 * if we have no more I/O to issue or
477 * the current I/O we've prepared fully
478 * completes the last page in this request
479 * or it's been completed via a zero-fill
480 * due to a 'hole' in the file
481 * then go ahead and issue the I/O
482 */
483 start_io:
484 if (flags & CL_COMMIT)
485 cbp_head->b_flags |= B_COMMIT_UPL;
486 if (flags & CL_PAGEOUT)
487 cbp_head->b_flags |= B_PAGEOUT;
488
489 if (real_bp) {
490 cbp_head->b_flags |= B_NEED_IODONE;
491 cbp_head->b_real_bp = real_bp;
492 }
493
494 for (cbp = cbp_head; cbp;) {
495 struct buf * cbp_next;
496
497 if (io_flags & B_WRITEINPROG)
498 cbp->b_vp->v_numoutput++;
499
500 cbp_next = cbp->b_trans_next;
501
502 (void) VOP_STRATEGY(cbp);
503 cbp = cbp_next;
504 }
505 if ( !(flags & CL_ASYNC)) {
506 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
507 biowait(cbp);
508
509 if (error = cluster_iodone(cbp_head)) {
510 retval = error;
511 error = 0;
512 }
513 }
514 cbp_head = (struct buf *)0;
515 cbp_tail = (struct buf *)0;
516 }
517 }
518 if (error) {
519 for (cbp = cbp_head; cbp;) {
520 struct buf * cbp_next;
521
522 if (cbp->b_vectorcount > 1)
523 _FREE(cbp->b_vectorlist, M_SEGMENT);
524 cbp_next = cbp->b_trans_next;
525 free_io_buf(cbp);
526 cbp = cbp_next;
527
528 }
529 pg_offset = upl_offset & PAGE_MASK;
530 pg_count = (size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
531
532 if (flags & CL_COMMIT) {
533 int upl_abort_code;
534
535 if (flags & CL_PAGEOUT)
536 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
537 else if (flags & CL_PAGEIN)
538 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
539 else
540 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
541
542 kernel_upl_abort_range(upl, upl_offset - pg_offset, pg_count * PAGE_SIZE, upl_abort_code);
543
544 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
545 upl, upl_offset - pg_offset, pg_count * PAGE_SIZE, error, 0);
546 }
547 if (real_bp) {
548 real_bp->b_flags |= B_ERROR;
549 real_bp->b_error = error;
550
551 biodone(real_bp);
552 }
553 if (retval == 0)
554 retval = error;
555 }
556 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
557 (int)f_offset, size, upl_offset, retval, 0);
558
559 return (retval);
560 }
561
562
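/*
 * issue an asynchronous read-ahead of up to 'size' bytes starting at
 * f_offset, clipped to the end of the file.  pages already valid in
 * the cache are released untouched; only the invalid range is read.
 * returns the count of leading pages that are either already valid or
 * have been issued for I/O (used by the caller to advance v_maxra),
 * or 0 if the first page is already resident or the offset is at or
 * beyond EOF.
 */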
563 static int
564 cluster_rd_prefetch(vp, object, f_offset, size, filesize, devblocksize)
565 struct vnode *vp;
566 void *object;
567 off_t f_offset;
568 u_int size;
569 off_t filesize;
570 int devblocksize;
571 {
572 upl_t upl;
573 upl_page_info_t *pl;
574 int pages_in_upl;
575 int start_pg;
576 int last_pg;
577 int last_valid;
578 int io_size;
579
580
581 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
582 (int)f_offset, size, (int)filesize, 0, 0);
583
584 if (f_offset >= filesize) {
585 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
586 (int)f_offset, 0, 0, 0, 0);
587 return(0);
588 }
589 if (memory_object_page_op(object, (vm_offset_t)f_offset, 0, 0, 0) == KERN_SUCCESS) {
590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
591 (int)f_offset, 0, 0, 0, 0);
592 return(0);
593 }
594 if (size > MAXPHYSIO)
595 size = MAXPHYSIO;
596 else
597 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
598
599 if ((off_t)size > (filesize - f_offset))
600 size = ((filesize - f_offset) + (devblocksize - 1)) & ~(devblocksize - 1);
601
602 pages_in_upl = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
603
604
605 vm_fault_list_request(object, (vm_object_offset_t)f_offset, pages_in_upl * PAGE_SIZE, &upl, NULL, 0,
606 UPL_CLEAN_IN_PLACE | UPL_NO_SYNC | UPL_SET_INTERNAL);
607 if (upl == (upl_t) 0)
608 return(0);
609
610 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
611
612 /*
613 * scan from the beginning of the upl looking for the first
614 * non-valid page.... this will become the first page in
615 * the request we're going to make to 'cluster_io'... if all
616 * of the pages are valid, we won't call through to 'cluster_io'
617 */
618 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
619 if (!upl_valid_page(pl, start_pg))
620 break;
621 }
622
623 /*
624 * scan from the starting invalid page looking for a valid
625 * page before the end of the upl is reached, if we
626 * find one, then it will be the last page of the request to
627 * 'cluster_io'
628 */
629 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
630 if (upl_valid_page(pl, last_pg))
631 break;
632 }
633
634 /*
635 * if we find any more valid pages at the tail of the upl
636 * then update maxra accordingly....
637 */
638 for (last_valid = last_pg; last_valid < pages_in_upl; last_valid++) {
639 if (!upl_valid_page(pl, last_valid))
640 break;
641 }
642 if (start_pg < last_pg) {
643 vm_offset_t upl_offset;
644
645 /*
646 * we found a range of 'invalid' pages that must be filled
647 * 'size' has already been clipped to the LEOF
648 * make sure it's at least a multiple of the device block size
649 */
650 upl_offset = start_pg * PAGE_SIZE;
651 io_size = (last_pg - start_pg) * PAGE_SIZE;
652
653 if ((upl_offset + io_size) > size) {
654 io_size = size - upl_offset;
655
656 KERNEL_DEBUG(0xd001000, upl_offset, size, io_size, 0, 0);
657 }
658 cluster_io(vp, upl, upl_offset, f_offset + upl_offset, io_size,
659 CL_READ | CL_COMMIT | CL_ASYNC | CL_AGE, (struct buf *)0);
660 }
661 if (start_pg) {
662 /*
663 * a non-zero start_pg indicates we found some already valid pages
664 * at the beginning of the upl.... we need to release these without
665 * modifying their state
666 */
667 kernel_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
668
669 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
670 upl, 0, start_pg * PAGE_SIZE, 0, 0);
671 }
672 if (last_pg < pages_in_upl) {
673 /*
674 * the set of pages that we issued an I/O for did not extend all the
675 * way to the end of the upl... so just release them without modifying
676 * their state
677 */
678 kernel_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
679 UPL_ABORT_FREE_ON_EMPTY);
680
681 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
682 upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
683 }
684
685 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
686 (int)f_offset + (last_valid * PAGE_SIZE), 0, 0, 0, 0);
687
688 return(last_valid);
689 }
690
691
692
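/*
 * sequential read-ahead heuristic.  if the current request extends a
 * sequential pattern (tracked via v_lastr and v_maxra), the read-ahead
 * window v_ralen is doubled, up to MAXPHYSIO worth of pages, and
 * cluster_rd_prefetch is called for the region just beyond the larger
 * of the current request and the previous read-ahead.  a non-sequential
 * access resets the window.
 */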
693 static void
694 cluster_rd_ahead(vp, object, b_lblkno, e_lblkno, filesize, devblocksize)
695 struct vnode *vp;
696 void *object;
697 daddr_t b_lblkno;
698 daddr_t e_lblkno;
699 off_t filesize;
700 int devblocksize;
701 {
702 daddr_t r_lblkno;
703 off_t f_offset;
704 int size_of_prefetch;
705
706
707 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
708 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
709
710 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
712 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
713 return;
714 }
715
716 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && b_lblkno != (vp->v_maxra + 1))) {
717 vp->v_ralen = 0;
718 vp->v_maxra = 0;
719
720 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
721 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
722
723 return;
724 }
725 vp->v_ralen = vp->v_ralen ? min(MAXPHYSIO/PAGE_SIZE, vp->v_ralen << 1) : 1;
726
727 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
728 vp->v_ralen = min(MAXPHYSIO/PAGE_SIZE, (e_lblkno + 1) - b_lblkno);
729
730 if (e_lblkno < vp->v_maxra) {
731 if ((vp->v_maxra - e_lblkno) > ((MAXPHYSIO/PAGE_SIZE) / 4)) {
732
733 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
734 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
735 return;
736 }
737 }
738 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
739 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
740
741 size_of_prefetch = cluster_rd_prefetch(vp, object, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
742
743 if (size_of_prefetch)
744 vp->v_maxra = r_lblkno + (size_of_prefetch - 1);
745
746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
747 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
748 }
749
750
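/*
 * push a range of dirty pages described by 'upl' out to the file.
 * the request is validated against the file size and page alignment,
 * clipped to EOF (rounded to a device block), and handed to cluster_io
 * with CL_PAGEOUT; the I/O is asynchronous unless UPL_IOSYNC is set,
 * and the upl is committed or aborted by the cluster code unless
 * UPL_NOCOMMIT is set.
 */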
751 int cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
752 struct vnode *vp;
753 upl_t upl;
754 vm_offset_t upl_offset;
755 off_t f_offset;
756 int size;
757 off_t filesize;
758 int devblocksize;
759 int flags;
760 {
761 int io_size;
762 int pg_size;
763 off_t max_size;
764 int local_flags = CL_PAGEOUT;
765
766 if ((flags & UPL_IOSYNC) == 0)
767 local_flags |= CL_ASYNC;
768 if ((flags & UPL_NOCOMMIT) == 0)
769 local_flags |= CL_COMMIT;
770
771 if (upl == (upl_t) 0)
772 panic("cluster_pageout: can't handle NULL upl yet\n");
773
774
775 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
776 (int)f_offset, size, (int)filesize, local_flags, 0);
777
778 /*
779 * If they didn't specify any I/O, then we are done...
780 * we can't issue an abort because we don't know how
781 * big the upl really is
782 */
783 if (size <= 0)
784 return (EINVAL);
785
786 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
787 if (local_flags & CL_COMMIT)
788 kernel_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
789 return (EROFS);
790 }
791 /*
792 * can't page-out to a negative offset
793 * or if we're starting beyond the EOF
794 * or if the file offset isn't page aligned
795 * or the size requested isn't a multiple of PAGE_SIZE
796 */
797 if (f_offset < 0 || f_offset >= filesize ||
798 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
799 if (local_flags & CL_COMMIT)
800 kernel_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
801 return (EINVAL);
802 }
803 max_size = filesize - f_offset;
804
805 if (size < max_size)
806 io_size = size;
807 else
808 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
809
810 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
811
812 if (size > pg_size) {
813 if (local_flags & CL_COMMIT)
814 kernel_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
815 UPL_ABORT_FREE_ON_EMPTY);
816 }
817
818 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
819 local_flags, (struct buf *)0));
820 }
821
822
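/*
 * bring a range of the file into the cache in response to a page fault
 * or explicit pagein.  the request is validated and clipped to EOF; if
 * the caller didn't supply a upl, one is created here.  the read is
 * issued through cluster_io with CL_PAGEIN, and on success a sequential
 * read-ahead is attempted unless UPL_NORDAHEAD or VRAOFF is set.
 */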
823 int cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
824 struct vnode *vp;
825 upl_t upl;
826 vm_offset_t upl_offset;
827 off_t f_offset;
828 int size;
829 off_t filesize;
830 int devblocksize;
831 int flags;
832 {
833 u_int io_size;
834 int pg_size;
835 off_t max_size;
836 int retval;
837 int local_flags = 0;
838 void *object = 0;
839
840
841 /*
842 * If they didn't ask for any data, then we are done...
843 * we can't issue an abort because we don't know how
844 * big the upl really is
845 */
846 if (size <= 0)
847 return (EINVAL);
848
849 if ((flags & UPL_NOCOMMIT) == 0)
850 local_flags = CL_COMMIT;
851
852 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
853 (int)f_offset, size, (int)filesize, local_flags, 0);
854
855 /*
856 * can't page-in from a negative offset
857 * or if we're starting beyond the EOF
858 * or if the file offset isn't page aligned
859 * or the size requested isn't a multiple of PAGE_SIZE
860 */
861 if (f_offset < 0 || f_offset >= filesize ||
862 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
863 if (local_flags & CL_COMMIT)
864 kernel_upl_abort_range(upl, upl_offset, size, UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
865 return (EINVAL);
866 }
867 max_size = filesize - f_offset;
868
869 if (size < max_size)
870 io_size = size;
871 else
872 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
873
874 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
875
876 if (upl == (upl_t) 0) {
877 object = ubc_getobject(vp, UBC_PAGINGOP|UBC_NOREACTIVATE);
878 if (object == (void *)NULL)
879 panic("cluster_pagein: ubc_getobject failed");
880
881 vm_fault_list_request(object, (vm_offset_t)f_offset, pg_size, &upl, NULL, 0,
882 UPL_CLEAN_IN_PLACE | UPL_NO_SYNC | UPL_SET_INTERNAL);
883 if (upl == (upl_t) 0)
884 return (EINVAL);
885
886 upl_offset = (vm_offset_t)0;
887 size = pg_size;
888 }
889 if (size > pg_size) {
890 if (local_flags & CL_COMMIT)
891 kernel_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
892 UPL_ABORT_FREE_ON_EMPTY);
893 }
894
895 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
896 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
897
898 if (retval == 0) {
899 int b_lblkno;
900 int e_lblkno;
901
902 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
903 e_lblkno = (int)
904 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
905
906 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
907 if (object == (void *)0) {
908 object = ubc_getobject(vp, UBC_PAGINGOP|UBC_NOREACTIVATE);
909 if (object == (void *)NULL)
910 panic("cluster_pagein: ubc_getobject failed");
911 }
912 /*
913 * we haven't read in the last page of the file yet
914 * so let's try to read ahead if we're in
915 * a sequential access pattern
916 */
917 cluster_rd_ahead(vp, object, b_lblkno, e_lblkno, filesize, devblocksize);
918 }
919 vp->v_lastr = e_lblkno;
920 }
921 return (retval);
922 }
923
924
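/*
 * issue the I/O described by a conventional buf header through the
 * cluster layer: the buffer's pagelist is passed to cluster_io as an
 * asynchronous request using the buffer's own mapping (CL_NOMAP).
 */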
925 int cluster_bp(bp)
926 struct buf *bp;
927 {
928 off_t f_offset;
929 int flags;
930
931 if (bp->b_pagelist == (upl_t) 0)
932 panic("cluster_bp: can't handle NULL upl yet\n");
933 if (bp->b_flags & B_READ)
934 flags = CL_ASYNC | CL_NOMAP | CL_READ;
935 else
936 flags = CL_ASYNC | CL_NOMAP;
937
938 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
939
940 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, flags, bp));
941 }
942
943
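/*
 * top level write entry point.  small, misaligned, or zero-fill
 * requests (and vnodes without VNOCACHE_DATA) go through the buffered
 * path, cluster_write_x.  large page-aligned user-space writes on
 * VNOCACHE_DATA vnodes are split into aligned chunks and pushed
 * through cluster_nocopy_write, with cluster_write_x handling any
 * unaligned head or tail portions.
 */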
944 int cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
945 struct vnode *vp;
946 struct uio *uio;
947 off_t oldEOF;
948 off_t newEOF;
949 off_t headOff;
950 off_t tailOff;
951 int devblocksize;
952 int flags;
953 {
954 void *object;
955 int prev_resid;
956 int clip_size;
957 off_t max_io_size;
958 struct iovec *iov;
959 int retval = 0;
960
961
962 object = ubc_getobject(vp, UBC_NOREACTIVATE);
963 if (object == (void *)NULL)
964 panic("cluster_write: ubc_getobject failed");
965
966 /*
967 * We set a threshold of 4 pages to decide if the nocopy
968 * write loop is worth the trouble...
969 */
970
971 if ((!uio) || (uio->uio_resid < 4 * PAGE_SIZE) ||
972 (flags & IO_TAILZEROFILL) || (flags & IO_HEADZEROFILL) ||
973 (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
974 {
975 retval = cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
976 return(retval);
977 }
978
979 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
980 {
981 /* we know we have a resid, so this is safe */
982 iov = uio->uio_iov;
983 while (iov->iov_len == 0) {
984 uio->uio_iov++;
985 uio->uio_iovcnt--;
986 iov = uio->uio_iov;
987 }
988
989 if (uio->uio_offset & PAGE_MASK_64)
990 {
991 /* Bring the file offset for this write up to a pagesize boundary */
992 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
993 if (uio->uio_resid < clip_size)
994 clip_size = uio->uio_resid;
995 /*
996 * Fake the resid going into the cluster_write_x call
997 * and restore it on the way out.
998 */
999 prev_resid = uio->uio_resid;
1000 uio->uio_resid = clip_size;
1001 retval = cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1002 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1003 }
1004 else if ((int)iov->iov_base & PAGE_MASK_64)
1005 {
1006 clip_size = iov->iov_len;
1007 prev_resid = uio->uio_resid;
1008 uio->uio_resid = clip_size;
1009 retval = cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1010 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1011 }
1012 else
1013 {
1014 /*
1015 * If we come in here, we know the offset into
1016 * the file is on a pagesize boundary
1017 */
1018
1019 max_io_size = newEOF - uio->uio_offset;
1020 clip_size = uio->uio_resid;
1021 if (iov->iov_len < clip_size)
1022 clip_size = iov->iov_len;
1023 if (max_io_size < clip_size)
1024 clip_size = max_io_size;
1025
1026 if (clip_size < PAGE_SIZE)
1027 {
1028 /*
1029 * Take care of tail end of write in this vector
1030 */
1031 prev_resid = uio->uio_resid;
1032 uio->uio_resid = clip_size;
1033 retval = cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1034 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1035 }
1036 else
1037 {
1038 /* round clip_size down to a multiple of pagesize */
1039 clip_size = clip_size & ~(PAGE_MASK);
1040 prev_resid = uio->uio_resid;
1041 uio->uio_resid = clip_size;
1042 retval = cluster_nocopy_write(object, vp, uio, newEOF, devblocksize, flags);
1043 if ((retval == 0) && uio->uio_resid)
1044 retval = cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1045 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1046 }
1047 } /* end else */
1048 } /* end while */
1049 return(retval);
1050 }
1051
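/*
 * write directly from the user's buffer, bypassing the buffer cache.
 * the user pages are wired via vm_map_get_upl, any overlapping pages
 * already in the cache are dumped, and a synchronous cluster_io is
 * issued.  if the pages can't be wired, the routine returns 0 with
 * uio_resid untouched so the caller can fall back to cluster_write_x.
 */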
1052 static int
1053 cluster_nocopy_write(object, vp, uio, newEOF, devblocksize, flags)
1054 void *object;
1055 struct vnode *vp;
1056 struct uio *uio;
1057 off_t newEOF;
1058 int devblocksize;
1059 int flags;
1060 {
1061 upl_t upl;
1062 upl_page_info_t *pl;
1063 off_t upl_f_offset;
1064 vm_offset_t upl_offset;
1065 off_t max_io_size;
1066 int io_size;
1067 int upl_size;
1068 int upl_needed_size;
1069 int pages_in_pl;
1070 int upl_flags;
1071 kern_return_t kret;
1072 struct iovec *iov;
1073 int i;
1074 int force_data_sync;
1075 int error = 0;
1076
1077 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1078 (int)uio->uio_offset, (int)uio->uio_resid,
1079 (int)newEOF, devblocksize, 0);
1080
1081 /*
1082 * When we enter this routine, we know
1083 * -- the offset into the file is on a pagesize boundary
1084 * -- the resid is a page multiple
1085 * -- the resid will not exceed iov_len
1086 */
1087
1088 iov = uio->uio_iov;
1089
1090 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1091
1092 io_size = uio->uio_resid;
1093 if (io_size > MAXPHYSIO)
1094 io_size = MAXPHYSIO;
1095
1096 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1097 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1098
1099 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1100 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
1101
1102 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1103 {
1104 pages_in_pl = 0;
1105 upl_size = upl_needed_size;
1106 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1107
1108 kret = vm_map_get_upl(current_map(),
1109 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1110 &upl_size, &upl, &pl, &pages_in_pl, &upl_flags, force_data_sync);
1111
1112 pages_in_pl = upl_size / PAGE_SIZE;
1113
1114 if (kret != KERN_SUCCESS)
1115 {
1116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1117 0, 0, 0, kret, 0);
1118
1119 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1120 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1121
1122 /* cluster_nocopy_write: failed to get pagelist */
1123 /* do not return kret here */
1124 return(0);
1125 }
1126
1127 for(i=0; i < pages_in_pl; i++)
1128 {
1129 if (!upl_valid_page(pl, i))
1130 break;
1131 }
1132
1133 if (i == pages_in_pl)
1134 break;
1135
1136 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1137 UPL_ABORT_FREE_ON_EMPTY);
1138 }
1139
1140 if (force_data_sync >= 3)
1141 {
1142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1143 0, 0, 0, kret, 0);
1144
1145 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1146 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1147 return(0);
1148 }
1149
1150 /*
1151 * Consider the possibility that upl_size wasn't satisfied.
1152 */
1153 if (upl_size != upl_needed_size)
1154 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1155
1156 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1157 (int)upl_offset, upl_size, iov->iov_base, io_size, 0);
1158
1159 if (io_size == 0)
1160 {
1161 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1162 UPL_ABORT_FREE_ON_EMPTY);
1163 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1164 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1165
1166 return(0);
1167 }
1168
1169 /*
1170 * Now look for pages already in the cache
1171 * and throw them away.
1172 */
1173
1174 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1175 max_io_size = io_size;
1176
1177 while (max_io_size) {
1178
1179 /*
1180 * Flag UPL_POP_DUMP says if the page is found
1181 * in the page cache it must be thrown away.
1182 */
1183 memory_object_page_op(object, (vm_offset_t)upl_f_offset,
1184 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1185 0, 0);
1186 max_io_size -= PAGE_SIZE;
1187 upl_f_offset += PAGE_SIZE;
1188 }
1189
1190 /*
1191 * issue a synchronous write to cluster_io
1192 */
1193
1194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1195 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1196
1197 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1198 io_size, 0, (struct buf *)0);
1199
1200 if (error == 0) {
1201 /*
1202 * The cluster_io write completed successfully,
1203 * update the uio structure and commit.
1204 */
1205
1206 kernel_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1207 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY,
1208 pl, MAX_UPL_TRANSFER);
1209
1210 iov->iov_base += io_size;
1211 iov->iov_len -= io_size;
1212 uio->uio_resid -= io_size;
1213 uio->uio_offset += io_size;
1214 }
1215 else {
1216 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1217 UPL_ABORT_FREE_ON_EMPTY);
1218 }
1219
1220 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1221 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1222
1223 } /* end while */
1224
1225
1226 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1227 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1228
1229 return (error);
1230 }
1231
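/*
 * buffered write path.  for each chunk (at most MAXPHYSIO) a upl is
 * built over the affected file range, partially valid first/last pages
 * are pre-read when necessary, any head/tail zero-fill regions are
 * cleared, and the user data is copied in with uiomove.  the dirty
 * pages are then either left in the vnode's current write cluster
 * (delayed write) or pushed out immediately via cluster_io.
 */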
1232 static int
1233 cluster_write_x(object, vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1234 void *object;
1235 struct vnode *vp;
1236 struct uio *uio;
1237 off_t oldEOF;
1238 off_t newEOF;
1239 off_t headOff;
1240 off_t tailOff;
1241 int devblocksize;
1242 int flags;
1243 {
1244 upl_page_info_t *pl;
1245 upl_t upl;
1246 vm_offset_t upl_offset;
1247 int upl_size;
1248 off_t upl_f_offset;
1249 int pages_in_upl;
1250 int start_offset;
1251 int xfer_resid;
1252 int io_size;
1253 int io_size_before_rounding;
1254 int io_flags;
1255 vm_offset_t io_address;
1256 int io_offset;
1257 int bytes_to_zero;
1258 int bytes_to_move;
1259 kern_return_t kret;
1260 int retval = 0;
1261 int uio_resid;
1262 long long total_size;
1263 long long zero_cnt;
1264 off_t zero_off;
1265 long long zero_cnt1;
1266 off_t zero_off1;
1267 daddr_t start_blkno;
1268 daddr_t last_blkno;
1269
1270 if (uio) {
1271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1272 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1273
1274 uio_resid = uio->uio_resid;
1275 } else {
1276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1277 0, 0, (int)oldEOF, (int)newEOF, 0);
1278
1279 uio_resid = 0;
1280 }
1281 zero_cnt = 0;
1282 zero_cnt1 = 0;
1283
1284 if (flags & IO_HEADZEROFILL) {
1285 /*
1286 * some filesystems (HFS is one) don't support unallocated holes within a file...
1287 * so we zero fill the intervening space between the old EOF and the offset
1288 * where the next chunk of real data begins.... ftruncate will also use this
1289 * routine to zero fill to the new EOF when growing a file... in this case, the
1290 * uio structure will not be provided
1291 */
1292 if (uio) {
1293 if (headOff < uio->uio_offset) {
1294 zero_cnt = uio->uio_offset - headOff;
1295 zero_off = headOff;
1296 }
1297 } else if (headOff < newEOF) {
1298 zero_cnt = newEOF - headOff;
1299 zero_off = headOff;
1300 }
1301 }
1302 if (flags & IO_TAILZEROFILL) {
1303 if (uio) {
1304 zero_off1 = uio->uio_offset + uio->uio_resid;
1305
1306 if (zero_off1 < tailOff)
1307 zero_cnt1 = tailOff - zero_off1;
1308 }
1309 }
1310 if (zero_cnt == 0 && uio == (struct uio *) 0)
1311 {
1312 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1313 retval, 0, 0, 0, 0);
1314 return (0);
1315 }
1316
1317 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1318 /*
1319 * for this iteration of the loop, figure out where our starting point is
1320 */
1321 if (zero_cnt) {
1322 start_offset = (int)(zero_off & PAGE_MASK_64);
1323 upl_f_offset = zero_off - start_offset;
1324 } else if (uio_resid) {
1325 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1326 upl_f_offset = uio->uio_offset - start_offset;
1327 } else {
1328 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1329 upl_f_offset = zero_off1 - start_offset;
1330 }
1331 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1332 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1333
1334 if (total_size > (long long)MAXPHYSIO)
1335 total_size = MAXPHYSIO;
1336
1337 /*
1338 * compute the size of the upl needed to encompass
1339 * the requested write... limit each call to cluster_io
1340 * to at most MAXPHYSIO, make sure to account for
1341 * a starting offset that's not page aligned
1342 */
1343 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1344
1345 if (upl_size > MAXPHYSIO)
1346 upl_size = MAXPHYSIO;
1347
1348 pages_in_upl = upl_size / PAGE_SIZE;
1349 io_size = upl_size - start_offset;
1350
1351 if ((long long)io_size > total_size)
1352 io_size = total_size;
1353
1354 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1355 last_blkno = start_blkno + pages_in_upl;
1356
1357 kret = vm_fault_list_request(object,
1358 (vm_object_offset_t)upl_f_offset, upl_size, &upl, NULL, 0,
1359 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL));
1360
1361 if (kret != KERN_SUCCESS)
1362 panic("cluster_write: failed to get pagelist");
1363
1364 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1367 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1368
1369
1370 if (start_offset && !upl_valid_page(pl, 0)) {
1371 int read_size;
1372
1373 /*
1374 * we're starting in the middle of the first page of the upl
1375 * and the page isn't currently valid, so we're going to have
1376 * to read it in first... this is a synchronous operation
1377 */
1378 read_size = PAGE_SIZE;
1379
1380 if ((upl_f_offset + read_size) > newEOF) {
1381 read_size = newEOF - upl_f_offset;
1382 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1383 }
1384 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
1385 CL_READ, (struct buf *)0);
1386 if (retval) {
1387 /*
1388 * we had an error during the read which causes us to abort
1389 * the current cluster_write request... before we do, we need
1390 * to release the rest of the pages in the upl without modifying
1391 * their state and mark the failed page in error
1392 */
1393 kernel_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1394 kernel_upl_abort(upl, 0);
1395
1396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1397 upl, 0, 0, retval, 0);
1398 break;
1399 }
1400 }
1401 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1402 /*
1403 * the last offset we're writing to in this upl does not end on a page
1404 * boundary... if it's not beyond the old EOF, then we'll also need to
1405 * pre-read this page in if it isn't already valid
1406 */
1407 upl_offset = upl_size - PAGE_SIZE;
1408
1409 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1410 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1411 int read_size;
1412
1413 read_size = PAGE_SIZE;
1414
1415 if ((upl_f_offset + upl_offset + read_size) > newEOF) {
1416 read_size = newEOF - (upl_f_offset + upl_offset);
1417 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1418 }
1419 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
1420 CL_READ, (struct buf *)0);
1421 if (retval) {
1422 /*
1423 * we had an error during the read which causes us to abort
1424 * the current cluster_write request... before we do, we need
1425 * to release the rest of the pages in the upl without modifying
1426 * their state and mark the failed page in error
1427 */
1428 kernel_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1429 kernel_upl_abort(upl, 0);
1430
1431 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1432 upl, 0, 0, retval, 0);
1433 break;
1434 }
1435 }
1436 }
1437 if ((kret = kernel_upl_map(kernel_map, upl, &io_address)) != KERN_SUCCESS)
1438 panic("cluster_write: kernel_upl_map failed\n");
1439 xfer_resid = io_size;
1440 io_offset = start_offset;
1441
1442 while (zero_cnt && xfer_resid) {
1443
1444 if (zero_cnt < (long long)xfer_resid)
1445 bytes_to_zero = zero_cnt;
1446 else
1447 bytes_to_zero = xfer_resid;
1448
1449 if ( !(flags & IO_NOZEROVALID)) {
1450 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1451
1452 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1453 (int)upl_f_offset + io_offset, bytes_to_zero,
1454 (int)zero_cnt, xfer_resid, 0);
1455 } else {
1456 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1457
1458 if ( !upl_valid_page(pl, (int)(zero_off / PAGE_SIZE_64))) {
1459 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1460
1461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1462 (int)upl_f_offset + io_offset, bytes_to_zero,
1463 (int)zero_cnt, xfer_resid, 0);
1464 }
1465 }
1466 xfer_resid -= bytes_to_zero;
1467 zero_cnt -= bytes_to_zero;
1468 zero_off += bytes_to_zero;
1469 io_offset += bytes_to_zero;
1470 }
1471 if (xfer_resid && uio_resid) {
1472 bytes_to_move = min(uio_resid, xfer_resid);
1473
1474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1475 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1476
1477 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1478
1479 if (retval) {
1480 if ((kret = kernel_upl_unmap(kernel_map, upl)) != KERN_SUCCESS)
1481 panic("cluster_write: kernel_upl_unmap failed\n");
1482 kernel_upl_abort(upl, UPL_ABORT_DUMP_PAGES);
1483
1484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1485 upl, 0, 0, retval, 0);
1486 } else {
1487 uio_resid -= bytes_to_move;
1488 xfer_resid -= bytes_to_move;
1489 io_offset += bytes_to_move;
1490 }
1491 }
1492 while (xfer_resid && zero_cnt1 && retval == 0) {
1493
1494 if (zero_cnt1 < (long long)xfer_resid)
1495 bytes_to_zero = zero_cnt1;
1496 else
1497 bytes_to_zero = xfer_resid;
1498
1499 if ( !(flags & IO_NOZEROVALID)) {
1500 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1501
1502 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1503 (int)upl_f_offset + io_offset,
1504 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1505 } else {
1506 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1507 if ( !upl_valid_page(pl, (int)(zero_off1 / PAGE_SIZE_64))) {
1508 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1509
1510 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1511 (int)upl_f_offset + io_offset,
1512 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1513 }
1514 }
1515 xfer_resid -= bytes_to_zero;
1516 zero_cnt1 -= bytes_to_zero;
1517 zero_off1 += bytes_to_zero;
1518 io_offset += bytes_to_zero;
1519 }
1520
1521 if (retval == 0) {
1522 int must_push;
1523 int can_delay;
1524
1525 io_size += start_offset;
1526
1527 if ((upl_f_offset + io_size) == newEOF && io_size < upl_size) {
1528 /*
1529 * if we're extending the file with this write
1530 * we'll zero fill the rest of the page so that
1531 * if the file gets extended again in such a way as to leave a
1532 * hole starting at this EOF, we'll have zero's in the correct spot
1533 */
1534 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1535
1536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1537 (int)upl_f_offset + io_size,
1538 upl_size - io_size, 0, 0, 0);
1539 }
1540 if ((kret = kernel_upl_unmap(kernel_map, upl)) != KERN_SUCCESS)
1541 panic("cluster_write: kernel_upl_unmap failed\n");
1542
1543 io_size_before_rounding = io_size;
1544
1545 if (io_size & (devblocksize - 1))
1546 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1547
1548 must_push = 0;
1549 can_delay = 0;
1550
1551 if (vp->v_clen) {
1552 int newsize;
1553
1554 /*
1555 * we have an existing cluster... see if this write will extend it nicely
1556 */
1557 if (start_blkno >= vp->v_cstart) {
1558 if (last_blkno <= (vp->v_cstart + vp->v_clen)) {
1559 /*
1560 * we have a write that fits entirely
1561 * within the existing cluster limits
1562 */
1563 if (last_blkno >= vp->v_lastw) {
1564 /*
1565 * if we're extending the dirty region within the cluster
1566 * we need to update the cluster info... we check for blkno
1567 * equality because we may be extending the file with a
1568 * partial write.... this in turn changes our idea of how
1569 * much data to write out (v_ciosiz) for the last page
1570 */
1571 vp->v_lastw = last_blkno;
1572 newsize = io_size + ((start_blkno - vp->v_cstart) * PAGE_SIZE);
1573
1574 if (newsize > vp->v_ciosiz)
1575 vp->v_ciosiz = newsize;
1576 }
1577 can_delay = 1;
1578 goto finish_io;
1579 }
1580 if (start_blkno < (vp->v_cstart + vp->v_clen)) {
1581 /*
1582 * we have a write that starts in the middle of the current cluster
1583 * but extends beyond the cluster's limit
1584 * we'll clip the current cluster if we actually
1585 * overlap with the new write and then push it out
1586 * and start a new cluster with the current write
1587 */
1588 if (vp->v_lastw > start_blkno) {
1589 vp->v_lastw = start_blkno;
1590 vp->v_ciosiz = (vp->v_lastw - vp->v_cstart) * PAGE_SIZE;
1591 }
1592 }
1593 /*
1594 * we also get here for the case where the current write starts
1595 * beyond the limit of the existing cluster
1596 */
1597 must_push = 1;
1598 goto check_delay;
1599 }
1600 /*
1601 * the current write starts in front of the current cluster
1602 */
1603 if (last_blkno > vp->v_cstart) {
1604 /*
1605 * the current write extends into the existing cluster
1606 */
1607 if ((vp->v_lastw - start_blkno) > vp->v_clen) {
1608 /*
1609 * if we were to combine this write with the current cluster
1610 * we would exceed the cluster size limit....
1611 * clip the current cluster by moving the start position
1612 * to where the current write ends, and then push it
1613 */
1614 vp->v_ciosiz -= (last_blkno - vp->v_cstart) * PAGE_SIZE;
1615 vp->v_cstart = last_blkno;
1616
1617 /*
1618 * round up the io_size to the nearest page size
1619 * since we've coalesced with at least 1 pre-existing
1620 * page in the current cluster... this write may have ended in the
1621 * middle of the page which would cause io_size to give us an
1622 * inaccurate view of how much I/O we actually need to do
1623 */
1624 io_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1625
1626 must_push = 1;
1627 goto check_delay;
1628 }
1629 /*
1630 * we can coalesce the current write with the existing cluster
1631 * adjust the cluster info to reflect this
1632 */
1633 if (last_blkno > vp->v_lastw) {
1634 /*
1635 * the current write completely overlaps
1636 * the existing cluster
1637 */
1638 vp->v_lastw = last_blkno;
1639 vp->v_ciosiz = io_size;
1640 } else {
1641 vp->v_ciosiz += (vp->v_cstart - start_blkno) * PAGE_SIZE;
1642
1643 if (io_size > vp->v_ciosiz)
1644 vp->v_ciosiz = io_size;
1645 }
1646 vp->v_cstart = start_blkno;
1647 can_delay = 1;
1648 goto finish_io;
1649 }
1650 /*
1651 * this I/O range is entirely in front of the current cluster
1652 * so we need to push the current cluster out before beginning
1653 * a new one
1654 */
1655 must_push = 1;
1656 }
1657 check_delay:
1658 if (must_push)
1659 cluster_push(vp);
1660
1661 if (io_size_before_rounding < MAXPHYSIO && !(flags & IO_SYNC)) {
1662 vp->v_clen = MAXPHYSIO / PAGE_SIZE;
1663 vp->v_cstart = start_blkno;
1664 vp->v_lastw = last_blkno;
1665 vp->v_ciosiz = io_size;
1666
1667 can_delay = 1;
1668 }
1669 finish_io:
1670 if (can_delay) {
1671 kernel_upl_commit_range(upl, 0, upl_size,
1672 UPL_COMMIT_SET_DIRTY
1673 | UPL_COMMIT_FREE_ON_EMPTY,
1674 pl, MAX_UPL_TRANSFER);
1675 continue;
1676 }
1677
1678 if ((flags & IO_SYNC) || (vp->v_numoutput > ASYNC_THROTTLE))
1679 io_flags = CL_COMMIT | CL_AGE;
1680 else
1681 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1682
1683 if (vp->v_flag & VNOCACHE_DATA)
1684 io_flags |= CL_DUMP;
1685
1686 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size,
1687 io_flags, (struct buf *)0);
1688 }
1689 }
1690 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1691 retval, 0, 0, 0, 0);
1692
1693 return (retval);
1694 }
1695
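/*
 * top level read entry point, the mirror image of cluster_write.
 * small or misaligned requests, and vnodes without VNOCACHE_DATA,
 * use the buffered path, cluster_read_x.  large page-aligned
 * user-space reads on VNOCACHE_DATA vnodes are split into aligned
 * chunks and handed to cluster_nocopy_read, with cluster_read_x
 * picking up any unaligned head or tail portions.
 */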
1696 int cluster_read(vp, uio, filesize, devblocksize, flags)
1697 struct vnode *vp;
1698 struct uio *uio;
1699 off_t filesize;
1700 int devblocksize;
1701 int flags;
1702 {
1703 void *object;
1704 int prev_resid;
1705 int clip_size;
1706 off_t max_io_size;
1707 struct iovec *iov;
1708 int retval = 0;
1709
1710 object = ubc_getobject(vp, UBC_NOREACTIVATE);
1711 if (object == (void *)NULL)
1712 panic("cluster_read: ubc_getobject failed");
1713
1714 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1715 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1716
1717 /*
1718 * We set a threshold of 4 pages to decide if the nocopy
1719 * read loop is worth the trouble...
1720 */
1721
1722 if ((!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1723 || (uio->uio_resid < 4 * PAGE_SIZE))
1724 {
1725 retval = cluster_read_x(object, vp, uio, filesize, devblocksize, flags);
1726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1727 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1728 return(retval);
1729
1730 }
1731
1732 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
1733 {
1734 /* we know we have a resid, so this is safe */
1735 iov = uio->uio_iov;
1736 while (iov->iov_len == 0) {
1737 uio->uio_iov++;
1738 uio->uio_iovcnt--;
1739 iov = uio->uio_iov;
1740 }
1741
1742 if (uio->uio_offset & PAGE_MASK_64)
1743 {
1744 /* Bring the file offset for this read up to a pagesize boundary */
1745 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
1746 if (uio->uio_resid < clip_size)
1747 clip_size = uio->uio_resid;
1748 /*
1749 * Fake the resid going into the cluster_read_x call
1750 * and restore it on the way out.
1751 */
1752 prev_resid = uio->uio_resid;
1753 uio->uio_resid = clip_size;
1754 retval = cluster_read_x(object, vp, uio, filesize, devblocksize, flags);
1755 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1756 }
1757 else if ((int)iov->iov_base & PAGE_MASK_64)
1758 {
1759 clip_size = iov->iov_len;
1760 prev_resid = uio->uio_resid;
1761 uio->uio_resid = clip_size;
1762 retval = cluster_read_x(object, vp, uio, filesize, devblocksize, flags);
1763 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1764 }
1765 else
1766 {
1767 /*
1768 * If we come in here, we know the offset into
1769 * the file is on a pagesize boundary
1770 */
1771
1772 max_io_size = filesize - uio->uio_offset;
1773 clip_size = uio->uio_resid;
1774 if (iov->iov_len < clip_size)
1775 clip_size = iov->iov_len;
1776 if (max_io_size < clip_size)
1777 clip_size = (int)max_io_size;
1778
1779 if (clip_size < PAGE_SIZE)
1780 {
1781 /*
1782 * Take care of the tail end of the read in this vector.
1783 */
1784 prev_resid = uio->uio_resid;
1785 uio->uio_resid = clip_size;
1786 retval = cluster_read_x(object,vp, uio, filesize, devblocksize, flags);
1787 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1788 }
1789 else
1790 {
1791 /* round clip_size down to a multiple of pagesize */
1792 clip_size = clip_size & ~(PAGE_MASK);
1793 prev_resid = uio->uio_resid;
1794 uio->uio_resid = clip_size;
1795 retval = cluster_nocopy_read(object, vp, uio, filesize, devblocksize, flags);
1796 if ((retval==0) && uio->uio_resid)
1797 retval = cluster_read_x(object,vp, uio, filesize, devblocksize, flags);
1798 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1799 }
1800 } /* end else */
1801 } /* end while */
1802
1803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1804 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1805
1806 return(retval);
1807 }
1808
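/*
 * buffered read path.  for each chunk a upl is built over the file
 * range, the invalid pages are read in synchronously via cluster_io,
 * the upl is mapped into the kernel and the data copied out to the
 * user with uiomove, and a sequential read-ahead is started when the
 * access pattern warrants it.
 */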
1809 static int
1810 cluster_read_x(object, vp, uio, filesize, devblocksize, flags)
1811 void *object;
1812 struct vnode *vp;
1813 struct uio *uio;
1814 off_t filesize;
1815 int devblocksize;
1816 int flags;
1817 {
1818 upl_page_info_t *pl;
1819 upl_t upl;
1820 vm_offset_t upl_offset;
1821 int upl_size;
1822 off_t upl_f_offset;
1823 int start_offset;
1824 int start_pg;
1825 int last_pg;
1826 int uio_last;
1827 int pages_in_upl;
1828 off_t max_size;
1829 int io_size;
1830 vm_offset_t io_address;
1831 kern_return_t kret;
1832 int segflg;
1833 int error = 0;
1834 int retval = 0;
1835 int b_lblkno;
1836 int e_lblkno;
1837
1838 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
1839
1840 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
1841 /*
1842 * compute the size of the upl needed to encompass
1843 * the requested read... limit each call to cluster_io
1844 * to at most MAXPHYSIO, make sure to account for
1845 * a starting offset that's not page aligned
1846 */
1847 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1848 upl_f_offset = uio->uio_offset - (off_t)start_offset;
1849 max_size = filesize - uio->uio_offset;
1850
1851 if (uio->uio_resid < max_size)
1852 io_size = uio->uio_resid;
1853 else
1854 io_size = max_size;
1855 #ifdef ppc
1856 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
1857 segflg = uio->uio_segflg;
1858
1859 uio->uio_segflg = UIO_PHYS_USERSPACE;
1860
1861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
1862 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
1863
1864 while (io_size && retval == 0) {
1865 int xsize;
1866 vm_offset_t paddr;
1867
1868 if (memory_object_page_op(object, (vm_offset_t)upl_f_offset, UPL_POP_SET | UPL_POP_BUSY,
1869 &paddr, 0) != KERN_SUCCESS)
1870 break;
1871
1872 xsize = PAGE_SIZE - start_offset;
1873
1874 if (xsize > io_size)
1875 xsize = io_size;
1876
1877 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
1878
1879 memory_object_page_op(object, (vm_offset_t)upl_f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
1880
1881 io_size -= xsize;
1882 start_offset = (int)
1883 (uio->uio_offset & PAGE_MASK_64);
1884 upl_f_offset = uio->uio_offset - start_offset;
1885 }
1886 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
1887 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
1888
1889 uio->uio_segflg = segflg;
1890
1891 if (retval)
1892 break;
1893
1894 if (io_size == 0) {
1895 /*
1896 * we're already finished with this read request
1897 * let's see if we should do a read-ahead
1898 */
1899 e_lblkno = (int)
1900 ((uio->uio_offset - 1) / PAGE_SIZE_64);
1901
1902 if (!(vp->v_flag & VRAOFF))
1903 /*
1904 * let's try to read ahead if we're in
1905 * a sequential access pattern
1906 */
1907 cluster_rd_ahead(vp, object, b_lblkno, e_lblkno, filesize, devblocksize);
1908 vp->v_lastr = e_lblkno;
1909
1910 break;
1911 }
1912 max_size = filesize - uio->uio_offset;
1913 }
1914 #endif
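          /*
           * normal path: build a upl large enough to cover the request, page
           * aligned at both ends and capped at MAXPHYSIO, then read in whatever
           * portion of it isn't already valid in the cache...
           * e.g. (assuming a 4K page) a 1K read starting 512 bytes into a page
           * gives start_offset = 512 and an upl_size that rounds up to 4096
           */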
1915 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1916 if (upl_size > MAXPHYSIO)
1917 upl_size = MAXPHYSIO;
1918 pages_in_upl = upl_size / PAGE_SIZE;
1919
1920 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
1921 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1922
1923 kret = vm_fault_list_request(object,
1924 (vm_object_offset_t)upl_f_offset, upl_size, &upl, NULL, 0,
1925 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL));
1926
1927 if (kret != KERN_SUCCESS)
1928 panic("cluster_read: failed to get pagelist");
1929
1930 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1931
1932
1933 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
1934 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1935
1936 /*
1937 * scan from the beginning of the upl looking for the first
1938 * non-valid page.... this will become the first page in
1939 * the request we're going to make to 'cluster_io'... if all
1940 * of the pages are valid, we won't call through to 'cluster_io'
1941 */
1942 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
1943 if (!upl_valid_page(pl, start_pg))
1944 break;
1945 }
1946
1947 /*
1948 * scan from the starting invalid page looking for a valid
1949 * page before the end of the upl is reached, if we
1950 * find one, then it will be the last page of the request to
1951 * 'cluster_io'
1952 */
1953 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
1954 if (upl_valid_page(pl, last_pg))
1955 break;
1956 }
1957
1958 if (start_pg < last_pg) {
1959 /*
1960                          * we found a range of 'invalid' pages that must be filled...
1961                          * if the last page in this range is the last page of the file,
1962 * we may have to clip the size of it to keep from reading past
1963 * the end of the last physical block associated with the file
1964 */
1965 upl_offset = start_pg * PAGE_SIZE;
1966 io_size = (last_pg - start_pg) * PAGE_SIZE;
1967
1968 if ((upl_f_offset + upl_offset + io_size) > filesize) {
1969 io_size = filesize - (upl_f_offset + upl_offset);
1970 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1971 }
1972 /*
1973 * issue a synchronous read to cluster_io
1974 */
1975
1976 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
1977 io_size, CL_READ, (struct buf *)0);
1978 }
1979 if (error == 0) {
1980 /*
1981 * if the read completed successfully, or there was no I/O request
1982                    * issued, then map the upl into kernel address space and
1983 * move the data into user land.... we'll first add on any 'valid'
1984 * pages that were present in the upl when we acquired it.
1985 */
1986 u_int val_size;
1987 u_int size_of_prefetch;
1988
1989 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
1990 if (!upl_valid_page(pl, uio_last))
1991 break;
1992 }
1993 /*
1994 * compute size to transfer this round, if uio->uio_resid is
1995 * still non-zero after this uiomove, we'll loop around and
1996 * set up for another I/O.
1997 */
1998 val_size = (uio_last * PAGE_SIZE) - start_offset;
1999
2000 if (max_size < val_size)
2001 val_size = max_size;
2002
2003 if (uio->uio_resid < val_size)
2004 val_size = uio->uio_resid;
2005
2006 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2007
2008 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2009 /*
2010 * if there's still I/O left to do for this request, then issue a
2011 * pre-fetch I/O... the I/O wait time will overlap
2012 * with the copying of the data
2013 */
2014 cluster_rd_prefetch(vp, object, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2015 } else {
2016 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2017 /*
2018 * let's try to read ahead if we're in
2019 * a sequential access pattern
2020 */
2021 cluster_rd_ahead(vp, object, b_lblkno, e_lblkno, filesize, devblocksize);
2022 vp->v_lastr = e_lblkno;
2023 }
2024 #ifdef ppc
2025 if (uio->uio_segflg == UIO_USERSPACE) {
2026 int offset;
2027
2028 segflg = uio->uio_segflg;
2029
2030 uio->uio_segflg = UIO_PHYS_USERSPACE;
2031
2032
2033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2034 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2035
2036 offset = start_offset;
2037
2038 while (val_size && retval == 0) {
2039 int csize;
2040 int i;
2041 caddr_t paddr;
2042
2043 i = offset / PAGE_SIZE;
2044 csize = min(PAGE_SIZE - start_offset, val_size);
2045
2046 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2047
2048 retval = uiomove(paddr, csize, uio);
2049
2050 val_size -= csize;
2051 offset += csize;
2052 start_offset = offset & PAGE_MASK;
2053 }
2054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2055 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2056
2057 uio->uio_segflg = segflg;
2058 } else
2059 #endif
2060 {
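                          /*
                           * no physical-copy path here... map the upl into the
                           * kernel's address space so uiomove can copy the data out
                           * to the user's buffer, then tear the mapping down again
                           */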
2061 if ((kret = kernel_upl_map(kernel_map, upl, &io_address)) != KERN_SUCCESS)
2062 panic("cluster_read: kernel_upl_map failed\n");
2063
2064 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2065
2066 if ((kret = kernel_upl_unmap(kernel_map, upl)) != KERN_SUCCESS)
2067 panic("cluster_read: kernel_upl_unmap failed\n");
2068 }
2069 }
2070 if (start_pg < last_pg) {
2071 /*
2072 * compute the range of pages that we actually issued an I/O for
2073 * and either commit them as valid if the I/O succeeded
2074 * or abort them if the I/O failed
2075 */
2076 io_size = (last_pg - start_pg) * PAGE_SIZE;
2077
2078 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2079 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2080
2081 if (error || (vp->v_flag & VNOCACHE_DATA))
2082 kernel_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2083 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2084 else
2085 kernel_upl_commit_range(upl,
2086 start_pg * PAGE_SIZE, io_size,
2087 UPL_COMMIT_CLEAR_DIRTY
2088 | UPL_COMMIT_FREE_ON_EMPTY
2089 | UPL_COMMIT_INACTIVATE,
2090 pl, MAX_UPL_TRANSFER);
2091
2092 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2093 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2094 }
2095 if ((last_pg - start_pg) < pages_in_upl) {
2096 int cur_pg;
2097 int commit_flags;
2098
2099 /*
2100 * the set of pages that we issued an I/O for did not encompass
2101 * the entire upl... so just release these without modifying
2102                  * their state
2103 */
2104 if (error)
2105 kernel_upl_abort(upl, 0);
2106 else {
2107 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2108 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2109
2110 if (start_pg) {
2111 /*
2112 * we found some already valid pages at the beginning of the upl
2113 * commit these back to the inactive list with reference cleared
2114 */
2115 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2116 commit_flags = UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE;
2117
2118 if (upl_dirty_page(pl, cur_pg))
2119 commit_flags |= UPL_COMMIT_SET_DIRTY;
2120
2121 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2122 kernel_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2123 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2124 else
2125 kernel_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2126 PAGE_SIZE, commit_flags, pl, MAX_UPL_TRANSFER);
2127 }
2128 }
2129 if (last_pg < uio_last) {
2130 /*
2131 * we found some already valid pages immediately after the pages we issued
2132 * I/O for, commit these back to the inactive list with reference cleared
2133 */
2134 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2135 commit_flags = UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE;
2136
2137 if (upl_dirty_page(pl, cur_pg))
2138 commit_flags |= UPL_COMMIT_SET_DIRTY;
2139
2140 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2141 kernel_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2142 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2143 else
2144 kernel_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2145 PAGE_SIZE, commit_flags, pl, MAX_UPL_TRANSFER);
2146 }
2147 }
2148 if (uio_last < pages_in_upl) {
2149 /*
2150 * there were some invalid pages beyond the valid pages that we didn't
2151 * issue an I/O for, just release them unchanged
2152 */
2153 kernel_upl_abort(upl, 0);
2154 }
2155
2156 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2157 upl, -1, -1, 0, 0);
2158 }
2159 }
2160 if (retval == 0)
2161 retval = error;
2162 }
2163
2164 return (retval);
2165 }
2166
2167 static int
2168 cluster_nocopy_read(object, vp, uio, filesize, devblocksize, flags)
2169 void *object;
2170 struct vnode *vp;
2171 struct uio *uio;
2172 off_t filesize;
2173 int devblocksize;
2174 int flags;
2175 {
2176 upl_t upl;
2177 upl_page_info_t *pl;
2178 off_t upl_f_offset;
2179 vm_offset_t upl_offset;
2180 off_t start_upl_f_offset;
2181 off_t max_io_size;
2182 int io_size;
2183 int upl_size;
2184 int upl_needed_size;
2185 int pages_in_pl;
2186 vm_offset_t paddr;
2187 int upl_flags;
2188 kern_return_t kret;
2189 int segflg;
2190 struct iovec *iov;
2191 int i;
2192 int force_data_sync;
2193 int error = 0;
2194 int retval = 0;
2195
2196 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2197 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2198
2199 /*
2200 * When we enter this routine, we know
2201 * -- the offset into the file is on a pagesize boundary
2202 * -- the resid is a page multiple
2203 * -- the resid will not exceed iov_len
2204 */
2205
2206 iov = uio->uio_iov;
2207 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2208
2209 io_size = uio->uio_resid;
2210
2211 /*
2212 * We don't come into this routine unless
2213 * UIO_USERSPACE is set.
2214 */
2215 segflg = uio->uio_segflg;
2216
2217 uio->uio_segflg = UIO_PHYS_USERSPACE;
2218
2219 /*
2220 * First look for pages already in the cache
2221 * and move them to user space.
2222 */
2223 while (io_size && retval == 0) {
2224
2225 upl_f_offset = uio->uio_offset;
2226
2227 /*
2228 * If this call fails, it means the page is not
2229 * in the page cache.
2230 */
2231 if (memory_object_page_op(object, (vm_offset_t)upl_f_offset,
2232 UPL_POP_SET | UPL_POP_BUSY,
2233 &paddr, 0) != KERN_SUCCESS)
2234 break;
2235
2236 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2237
2238 memory_object_page_op(object, (vm_offset_t)upl_f_offset,
2239 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2240
2241 io_size -= PAGE_SIZE;
2242 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2243 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2244 }
2245
2246 uio->uio_segflg = segflg;
2247
2248 if (retval)
2249 {
2250 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2251 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2252 return(retval);
2253 }
2254
2255 /* If we are already finished with this read, then return */
2256 if (io_size == 0)
2257 {
2258
2259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2260 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2261 return(0);
2262 }
2263
2264 max_io_size = io_size;
2265 if (max_io_size > MAXPHYSIO)
2266 max_io_size = MAXPHYSIO;
2267
2268 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2269 upl_f_offset = start_upl_f_offset;
2270 io_size = 0;
2271
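             /*
              * count how many consecutive pages starting at this offset are
              * missing from the cache; that run defines the extent of the
              * direct read we'll issue (hitting a cached page ends the run,
              * and it will be picked up on the next pass through the loop)
              */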
2272 while(io_size < max_io_size)
2273 {
2274
2275 if(memory_object_page_op(object, (vm_offset_t)upl_f_offset,
2276 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2277 {
2278 memory_object_page_op(object, (vm_offset_t)upl_f_offset,
2279 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2280 break;
2281 }
2282
2283 /*
2284 * Build up the io request parameters.
2285 */
2286
2287 io_size += PAGE_SIZE;
2288 upl_f_offset += PAGE_SIZE;
2289 }
2290
2291 if (io_size == 0)
2292 return(retval);
2293
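             /*
              * note where within its first page the user's buffer starts, and
              * how large a upl is needed to cover that buffer for this I/O
              */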
2294 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2295 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2296
2297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2298 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
2299
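             /*
              * try up to three times to wire the user's buffer into a upl whose
              * pages are all valid, passing a larger force_data_sync value to
              * vm_map_get_upl on each retry
              */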
2300 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2301 {
2302 pages_in_pl = 0;
2303 upl_size = upl_needed_size;
2304 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2305
2306 kret = vm_map_get_upl(current_map(),
2307 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2308 &upl_size, &upl, &pl, &pages_in_pl, &upl_flags, force_data_sync);
2309
2310 pages_in_pl = upl_size / PAGE_SIZE;
2311
2312 if (kret != KERN_SUCCESS)
2313 {
2314 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2315 (int)upl_offset, upl_size, io_size, kret, 0);
2316
2317 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2318 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2319
2320 /* cluster_nocopy_read: failed to get pagelist */
2321 /* do not return kret here */
2322 return(retval);
2323 }
2324
2325 for(i=0; i < pages_in_pl; i++)
2326 {
2327 if (!upl_valid_page(pl, i))
2328 break;
2329 }
2330 if (i == pages_in_pl)
2331 break;
2332
2333 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2334 UPL_ABORT_FREE_ON_EMPTY);
2335 }
2336
2337 if (force_data_sync >= 3)
2338 {
2339 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2340 (int)upl_offset, upl_size, io_size, kret, 0);
2341
2342 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2343 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2344 return(retval);
2345 }
2346 /*
2347 * Consider the possibility that upl_size wasn't satisfied.
2348 */
2349 if (upl_size != upl_needed_size)
2350 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2351
2352 if (io_size == 0)
2353 {
2354 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2355 UPL_ABORT_FREE_ON_EMPTY);
2356 return(retval);
2357 }
2358
2359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2360 (int)upl_offset, upl_size, io_size, kret, 0);
2361
2362 /*
2363 * issue a synchronous read to cluster_io
2364 */
2365
2366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2367 upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2368
2369 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2370 io_size, CL_READ| CL_NOZERO, (struct buf *)0);
2371
2372 if (error == 0) {
2373 /*
2374 * The cluster_io read completed successfully,
2375 * update the uio structure and commit.
2376 */
2377
2378 kernel_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2379 UPL_COMMIT_SET_DIRTY
2380 | UPL_COMMIT_FREE_ON_EMPTY,
2381 pl, MAX_UPL_TRANSFER);
2382
2383 iov->iov_base += io_size;
2384 iov->iov_len -= io_size;
2385 uio->uio_resid -= io_size;
2386 uio->uio_offset += io_size;
2387 }
2388 else {
2389 kernel_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2390 UPL_ABORT_FREE_ON_EMPTY);
2391 }
2392
2393 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2394 upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2395
2396 if (retval == 0)
2397 retval = error;
2398
2399 } /* end while */
2400
2401
2402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2403 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2404
2405 return (retval);
2406 }
2407
2408
2409
2410 /*
2411  * generate advisory I/Os in the largest chunks possible;
2412  * the completed pages will be released into the VM cache
2413 */
2414 int advisory_read(vp, filesize, f_offset, resid, devblocksize)
2415 struct vnode *vp;
2416 off_t filesize;
2417 off_t f_offset;
2418 int resid;
2419 int devblocksize;
2420 {
2421 void *object;
2422 upl_page_info_t *pl;
2423 upl_t upl;
2424 vm_offset_t upl_offset;
2425 int upl_size;
2426 off_t upl_f_offset;
2427 int start_offset;
2428 int start_pg;
2429 int last_pg;
2430 int pages_in_upl;
2431 off_t max_size;
2432 int io_size;
2433 kern_return_t kret;
2434 int retval = 0;
2435
2436
2437 if (!UBCINFOEXISTS(vp))
2438 return(EINVAL);
2439
2440 object = ubc_getobject(vp, UBC_NOREACTIVATE);
2441 if (object == (void *)NULL)
2442 panic("advisory_read: ubc_getobject failed");
2443
2444 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2445 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2446
2447 while (resid && f_offset < filesize && retval == 0) {
2448 /*
2449 * compute the size of the upl needed to encompass
2450 * the requested read... limit each call to cluster_io
2451 * to at most MAXPHYSIO, make sure to account for
2452 * a starting offset that's not page aligned
2453 */
2454 start_offset = (int)(f_offset & PAGE_MASK_64);
2455 upl_f_offset = f_offset - (off_t)start_offset;
2456 max_size = filesize - f_offset;
2457
2458 if (resid < max_size)
2459 io_size = resid;
2460 else
2461 io_size = max_size;
2462
2463 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2464 if (upl_size > MAXPHYSIO)
2465 upl_size = MAXPHYSIO;
2466 pages_in_upl = upl_size / PAGE_SIZE;
2467
2468 kret = vm_fault_list_request(object,
2469 (vm_object_offset_t)upl_f_offset, upl_size, &upl, NULL, 0,
2470 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL));
2471
2472 if (kret != KERN_SUCCESS)
2473 panic("advisory_read: failed to get pagelist");
2474
2475 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2476
2477
2478 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2479 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2480
2481 /*
2482 * scan from the beginning of the upl looking for the first
2483 * non-valid page.... this will become the first page in
2484 * the request we're going to make to 'cluster_io'... if all
2485 * of the pages are valid, we won't call through to 'cluster_io'
2486 */
2487 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2488 if (!upl_valid_page(pl, start_pg))
2489 break;
2490 }
2491
2492 /*
2493 * scan from the starting invalid page looking for a valid
2494 * page before the end of the upl is reached, if we
2495 * find one, then it will be the last page of the request to
2496 * 'cluster_io'
2497 */
2498 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2499 if (upl_valid_page(pl, last_pg))
2500 break;
2501 }
2502
2503 if (start_pg < last_pg) {
2504 /*
2505                          * we found a range of 'invalid' pages that must be filled...
2506                          * if the last page in this range is the last page of the file,
2507 * we may have to clip the size of it to keep from reading past
2508 * the end of the last physical block associated with the file
2509 */
2510 upl_offset = start_pg * PAGE_SIZE;
2511 io_size = (last_pg - start_pg) * PAGE_SIZE;
2512
2513 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2514 io_size = filesize - (upl_f_offset + upl_offset);
2515 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2516 }
2517 /*
2518 * issue an asynchronous read to cluster_io
2519 */
2520 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
2521 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2522 }
2523 if (start_pg) {
2524 /*
2525                  * a non-zero start_pg indicates we found some already valid pages
2526                  * at the beginning of the upl.... we need to release these without
2527                  * modifying their state
2528 */
2529 kernel_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2530
2531 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 62)) | DBG_FUNC_NONE,
2532 upl, 0, start_pg * PAGE_SIZE, 0, 0);
2533 }
2534 if (last_pg < pages_in_upl) {
2535 /*
2536 * the set of pages that we issued an I/O for did not extend all the
2537 * way to the end of the upl... so just release them without modifying
2538                  * their state
2539 */
2540 kernel_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
2541 UPL_ABORT_FREE_ON_EMPTY);
2542
2543 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 63)) | DBG_FUNC_NONE,
2544 upl, last_pg * PAGE_SIZE,
2545 (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
2546 }
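            /*
             * advance the request past the pages examined so far (up through
             * last_pg); anything beyond that point in this upl will be
             * reconsidered on the next pass through the loop
             */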
2547 io_size = (last_pg * PAGE_SIZE) - start_offset;
2548
2549 if (io_size > resid)
2550 io_size = resid;
2551 f_offset += io_size;
2552 resid -= io_size;
2553 }
2554 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2555 (int)f_offset, resid, retval, 0, 0);
2556
2557 return(retval);
2558 }
2559
2560
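/*
 * push out the delayed-write cluster that has been accumulated on this vnode
 * (described by v_cstart, v_lastw and v_ciosiz): build a upl over that range,
 * hand each run of valid dirty pages to cluster_io, and release the rest
 */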
2561 int cluster_push(vp)
2562 struct vnode *vp;
2563 {
2564 void *object;
2565 upl_page_info_t *pl;
2566 upl_t upl;
2567 vm_offset_t upl_offset;
2568 int upl_size;
2569 off_t upl_f_offset;
2570 int pages_in_upl;
2571 int start_pg;
2572 int last_pg;
2573 int io_size;
2574 int io_flags;
2575 int size;
2576 kern_return_t kret;
2577
2578
2579 if (!UBCINFOEXISTS(vp))
2580 return(0);
2581
2582 if (vp->v_clen == 0 || (pages_in_upl = vp->v_lastw - vp->v_cstart) == 0)
2583 return (0);
2584 upl_size = pages_in_upl * PAGE_SIZE;
2585 upl_f_offset = ((off_t)vp->v_cstart) * PAGE_SIZE_64;
2586 size = vp->v_ciosiz;
2587 vp->v_clen = 0;
2588
2589 if (size > upl_size || (upl_size - size) > PAGE_SIZE)
2590 panic("cluster_push: v_ciosiz doesn't match size of cluster\n");
2591
2592 object = ubc_getobject(vp, UBC_NOREACTIVATE);
2593 if (object == (void *)NULL)
2594 panic("cluster_push: ubc_getobject failed");
2595
2596 kret = vm_fault_list_request(object,
2597 (vm_object_offset_t)upl_f_offset, upl_size, &upl, NULL, 0,
2598 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL));
2599 if (kret != KERN_SUCCESS)
2600 panic("cluster_push: failed to get pagelist");
2601
2602 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2603
2604 last_pg = 0;
2605
2606 while (size) {
2607
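                  /*
                   * skip over (and release) any leading pages in the upl that
                   * aren't both valid and dirty; only valid dirty runs are written
                   */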
2608 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2609 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
2610 break;
2611 }
2612 if (start_pg > last_pg) {
2613 io_size = (start_pg - last_pg) * PAGE_SIZE;
2614
2615 kernel_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size, UPL_ABORT_FREE_ON_EMPTY);
2616
2617 if (io_size < size)
2618 size -= io_size;
2619 else
2620 break;
2621 }
2622 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2623 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
2624 break;
2625 }
2626 upl_offset = start_pg * PAGE_SIZE;
2627
2628 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
2629
2630 if (vp->v_numoutput > ASYNC_THROTTLE)
2631 io_flags = CL_COMMIT | CL_AGE;
2632 else
2633 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2634
2635 if (vp->v_flag & VNOCACHE_DATA)
2636 io_flags |= CL_DUMP;
2637
2638 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (struct buf *)0);
2639
2640 size -= io_size;
2641 }
2642 return(1);
2643 }