bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/buf_internal.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/vnode_internal.h>
  69 #include <sys/trace.h>
  70 #include <sys/malloc.h>
  71 #include <sys/time.h>
  72 #include <sys/kernel.h>
  73 #include <sys/resourcevar.h>
  74 #include <sys/uio_internal.h>
  75 #include <libkern/libkern.h>
  76 #include <machine/machine_routines.h>
  77
  78 #include <sys/ubc_internal.h>
  79
  80 #include <mach/mach_types.h>
  81 #include <mach/memory_object_types.h>
  82 #include <mach/vm_map.h>
  83 #include <mach/upl.h>
  84
  85 #include <vm/vm_kern.h>
  86 #include <vm/vm_map.h>
  87 #include <vm/vm_pageout.h>
  88
  89 #include <sys/kdebug.h>
  90
  91
  92 #define CL_READ      0x01
  93 #define CL_ASYNC     0x02
  94 #define CL_COMMIT    0x04
  95 #define CL_PAGEOUT   0x10
  96 #define CL_AGE       0x20
  97 #define CL_DUMP      0x40
  98 #define CL_NOZERO    0x80
  99 #define CL_PAGEIN    0x100
 100 #define CL_DEV_MEMORY 0x200
 101 #define CL_PRESERVE   0x400
 102 #define CL_THROTTLE   0x800
 103 #define CL_KEEPCACHED 0x1000
 104
 105
 106 struct clios {
 107         u_int  io_completed;       /* amount of io that has currently completed */
 108         u_int  io_issued;          /* amount of io that was successfully issued */
 109         int    io_error;           /* error code of first error encountered */
 110         int    io_wanted;          /* someone is sleeping waiting for a change in state */
 111 };
 112
 113 static lck_grp_t        *cl_mtx_grp;
 114 static lck_attr_t       *cl_mtx_attr;
 115 static lck_grp_attr_t   *cl_mtx_grp_attr;
 116 static lck_mtx_t        *cl_mtxp;
 117
 118
 119 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 120                       int flags, buf_t real_bp, struct clios *iostate);
 121 static int cluster_iodone(buf_t bp, void *dummy);
 122 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
 123 static int cluster_hard_throttle_on(vnode_t vp);
 124
 125 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
 126 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
 127                            off_t headOff, off_t tailOff, int flags);
 128 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
 129 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
 130 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
 131 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
 132 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
 133
 134 static void     cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
 135
 136 static int      cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
 137 static void     cluster_push_EOF(vnode_t vp, off_t EOF);
 138
 139 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
 140
 141 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
 142 static void     sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
 143 static void     sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
 144
 145 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
 146 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 147 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 148
 149 int     is_file_clean(vnode_t, off_t);
 150
 151 /*
 152  * throttle the number of async writes that
 153  * can be outstanding on a single vnode
 154  * before we issue a synchronous write
 155  */
 156 #define HARD_THROTTLE_MAXCNT    0
 157 #define HARD_THROTTLE_MAXSIZE   (64 * 1024)
 158
 159 int hard_throttle_on_root = 0;
 160 struct timeval priority_IO_timestamp_for_root;
 161
 162
 163 void
 164 cluster_init(void) {
 165         /*
 166          * allocate lock group attribute and group
 167          */
 168         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 169         //lck_grp_attr_setstat(cl_mtx_grp_attr);
 170         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 171
 172         /*
 173          * allocate the lock attribute
 174          */
 175         cl_mtx_attr = lck_attr_alloc_init();
 176         //lck_attr_setdebug(clf_mtx_attr);
 177
 178         /*
 179          * allocate and initialize mutex's used to protect updates and waits
 180          * on the cluster_io context
 181          */
 182         cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 183
 184         if (cl_mtxp == NULL)
 185                 panic("cluster_init: failed to allocate cl_mtxp");
 186 }
 187
 188
 189
 190 #define CLW_ALLOCATE            0x01
 191 #define CLW_RETURNLOCKED        0x02
 192 /*
 193  * if the read ahead context doesn't yet exist,
 194  * allocate and initialize it...
 195  * the vnode lock serializes multiple callers
 196  * during the actual assignment... first one
 197  * to grab the lock wins... the other callers
 198  * will release the now unnecessary storage
 199  *
 200  * once the context is present, try to grab (but don't block on)
 201  * the lock associated with it... if someone
 202  * else currently owns it, than the read
 203  * will run without read-ahead.  this allows
 204  * multiple readers to run in parallel and
 205  * since there's only 1 read ahead context,
 206  * there's no real loss in only allowing 1
 207  * reader to have read-ahead enabled.
 208  */
 209 static struct cl_readahead *
 210 cluster_get_rap(vnode_t vp)
 211 {
 212         struct ubc_info         *ubc;
 213         struct cl_readahead     *rap;
 214
 215         ubc = vp->v_ubcinfo;
 216
 217         if ((rap = ubc->cl_rahead) == NULL) {
 218                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 219
 220                 bzero(rap, sizeof *rap);
 221                 rap->cl_lastr = -1;
 222                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 223
 224                 vnode_lock(vp);
 225
 226                 if (ubc->cl_rahead == NULL)
 227                         ubc->cl_rahead = rap;
 228                 else {
 229                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 230                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 231                                 rap = ubc->cl_rahead;
 232                 }
 233                 vnode_unlock(vp);
 234         }
 235         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 236                 return(rap);
 237
 238         return ((struct cl_readahead *)NULL);
 239 }
 240
 241
 242 /*
 243  * if the write behind context doesn't yet exist,
 244  * and CLW_ALLOCATE is specified, allocate and initialize it...
 245  * the vnode lock serializes multiple callers
 246  * during the actual assignment... first one
 247  * to grab the lock wins... the other callers
 248  * will release the now unnecessary storage
 249  *
 250  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 251  * the lock associated with the write behind context before
 252  * returning
 253  */
 254
 255 static struct cl_writebehind *
 256 cluster_get_wbp(vnode_t vp, int flags)
 257 {
 258         struct ubc_info *ubc;
 259         struct cl_writebehind *wbp;
 260
 261         ubc = vp->v_ubcinfo;
 262
 263         if ((wbp = ubc->cl_wbehind) == NULL) {
 264
 265                 if ( !(flags & CLW_ALLOCATE))
 266                         return ((struct cl_writebehind *)NULL);
 267
 268                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 269
 270                 bzero(wbp, sizeof *wbp);
 271                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 272
 273                 vnode_lock(vp);
 274
 275                 if (ubc->cl_wbehind == NULL)
 276                         ubc->cl_wbehind = wbp;
 277                 else {
 278                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 279                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 280                                 wbp = ubc->cl_wbehind;
 281                 }
 282                 vnode_unlock(vp);
 283         }
 284         if (flags & CLW_RETURNLOCKED)
 285                 lck_mtx_lock(&wbp->cl_lockw);
 286
 287         return (wbp);
 288 }
 289
 290
 291 static int
 292 cluster_hard_throttle_on(vnode_t vp)
 293 {
 294         static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
 295
 296         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 297                 struct timeval elapsed;
 298
 299                 if (hard_throttle_on_root)
 300                         return(1);
 301
 302                 microuptime(&elapsed);
 303                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 304
 305                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 306                         return(1);
 307         }
 308         return(0);
 309 }
 310
 311
 312 static int
 313 cluster_iodone(buf_t bp, __unused void *dummy)
 314 {
 315         int     b_flags;
 316         int     error;
 317         int     total_size;
 318         int     total_resid;
 319         int     upl_offset;
 320         int     zero_offset;
 321         upl_t   upl;
 322         buf_t   cbp;
 323         buf_t   cbp_head;
 324         buf_t   cbp_next;
 325         buf_t   real_bp;
 326         struct  clios *iostate;
 327         int     commit_size;
 328         int     pg_offset;
 329
 330         cbp_head = (buf_t)(bp->b_trans_head);
 331
 332         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 333                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 334
 335         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 336                 /*
 337                  * all I/O requests that are part of this transaction
 338                  * have to complete before we can process it
 339                  */
 340                 if ( !(cbp->b_flags & B_DONE)) {
 341
 342                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 343                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
 344
 345                         return 0;
 346                 }
 347         }
 348         error       = 0;
 349         total_size  = 0;
 350         total_resid = 0;
 351
 352         cbp        = cbp_head;
 353         upl_offset = cbp->b_uploffset;
 354         upl        = cbp->b_upl;
 355         b_flags    = cbp->b_flags;
 356         real_bp    = cbp->b_real_bp;
 357         zero_offset= cbp->b_validend;
 358         iostate    = (struct clios *)cbp->b_iostate;
 359
 360         if (real_bp)
 361                 real_bp->b_dev = cbp->b_dev;
 362
 363         while (cbp) {
 364                 if ((cbp->b_flags & B_ERROR) && error == 0)
 365                         error = cbp->b_error;
 366
 367                 total_resid += cbp->b_resid;
 368                 total_size  += cbp->b_bcount;
 369
 370                 cbp_next = cbp->b_trans_next;
 371
 372                 free_io_buf(cbp);
 373
 374                 cbp = cbp_next;
 375         }
 376         if (zero_offset)
 377                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 378
 379         if (iostate) {
 380                 int need_wakeup = 0;
 381
 382                 /*
 383                  * someone has issued multiple I/Os asynchrounsly
 384                  * and is waiting for them to complete (streaming)
 385                  */
 386                 lck_mtx_lock(cl_mtxp);
 387
 388                 if (error && iostate->io_error == 0)
 389                         iostate->io_error = error;
 390
 391                 iostate->io_completed += total_size;
 392
 393                 if (iostate->io_wanted) {
 394                         /*
 395                          * someone is waiting for the state of
 396                          * this io stream to change
 397                          */
 398                         iostate->io_wanted = 0;
 399                         need_wakeup = 1;
 400                 }
 401                 lck_mtx_unlock(cl_mtxp);
 402
 403                 if (need_wakeup)
 404                         wakeup((caddr_t)&iostate->io_wanted);
 405         }
 406         if ((b_flags & B_NEED_IODONE) && real_bp) {
 407                 if (error) {
 408                         real_bp->b_flags |= B_ERROR;
 409                         real_bp->b_error = error;
 410                 }
 411                 real_bp->b_resid = total_resid;
 412
 413                 buf_biodone(real_bp);
 414         }
 415         if (error == 0 && total_resid)
 416                 error = EIO;
 417
 418         if (b_flags & B_COMMIT_UPL) {
 419                 pg_offset   = upl_offset & PAGE_MASK;
 420                 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 421
 422                 if (error || (b_flags & B_NOCACHE)) {
 423                         int upl_abort_code;
 424                         int page_in  = 0;
 425                         int page_out = 0;
 426
 427                         if (b_flags & B_PAGEIO) {
 428                                 if (b_flags & B_READ)
 429                                         page_in  = 1;
 430                                 else
 431                                         page_out = 1;
 432                         }
 433                         if (b_flags & B_CACHE)          /* leave pages in the cache unchanged on error */
 434                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 435                         else if (page_out && (error != ENXIO)) /* transient error */
 436                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 437                         else if (page_in)
 438                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 439                         else
 440                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 441
 442                         ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
 443                                                     upl_abort_code);
 444
 445                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 446                                      (int)upl, upl_offset - pg_offset, commit_size,
 447                                      0x80000000|upl_abort_code, 0);
 448
 449                 } else {
 450                         int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 451
 452                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
 453                                 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
 454
 455                         if (b_flags & B_AGE)
 456                                 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 457
 458                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
 459                                         upl_commit_flags);
 460
 461                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 462                                      (int)upl, upl_offset - pg_offset, commit_size,
 463                                      upl_commit_flags, 0);
 464                 }
 465         } else {
 466                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 467                              (int)upl, upl_offset, 0, error, 0);
 468         }
 469
 470         return (error);
 471 }
 472
 473
 474 void
 475 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
 476 {
 477         upl_page_info_t *pl;
 478
 479         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 480                      upl_offset, size, (int)bp, 0, 0);
 481
 482         if (bp == NULL || bp->b_datap == 0) {
 483
 484                 pl = ubc_upl_pageinfo(upl);
 485
 486                 while (size) {
 487                         int           page_offset;
 488                         int           page_index;
 489                         addr64_t      zero_addr;
 490                         int           zero_cnt;
 491
 492                         page_index  = upl_offset / PAGE_SIZE;
 493                         page_offset = upl_offset & PAGE_MASK;
 494
 495                         zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
 496                         zero_cnt  = min(PAGE_SIZE - page_offset, size);
 497
 498                         bzero_phys(zero_addr, zero_cnt);
 499
 500                         size       -= zero_cnt;
 501                         upl_offset += zero_cnt;
 502                 }
 503         } else
 504                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 505
 506         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 507                      upl_offset, size, 0, 0, 0);
 508 }
 509
 510
 511 static int
 512 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 513            int flags, buf_t real_bp, struct clios *iostate)
 514 {
 515         buf_t   cbp;
 516         u_int   size;
 517         u_int   io_size;
 518         int     io_flags;
 519         int     bmap_flags;
 520         int     error = 0;
 521         int     retval = 0;
 522         buf_t   cbp_head = NULL;
 523         buf_t   cbp_tail = NULL;
 524         int     trans_count = 0;
 525         u_int   pg_count;
 526         int     pg_offset;
 527         u_int   max_iosize;
 528         u_int   max_vectors;
 529         int     priv;
 530         int     zero_offset = 0;
 531         int     async_throttle = 0;
 532         mount_t mp;
 533
 534         mp = vp->v_mount;
 535
 536         if (mp->mnt_devblocksize > 1) {
 537                 /*
 538                  * round the requested size up so that this I/O ends on a
 539                  * page boundary in case this is a 'write'... if the filesystem
 540                  * has blocks allocated to back the page beyond the EOF, we want to
 541                  * make sure to write out the zero's that are sitting beyond the EOF
 542                  * so that in case the filesystem doesn't explicitly zero this area
 543                  * if a hole is created via a lseek/write beyond the current EOF,
 544                  * it will return zeros when it's read back from the disk.  If the
 545                  * physical allocation doesn't extend for the whole page, we'll
 546                  * only write/read from the disk up to the end of this allocation
 547                  * via the extent info returned from the VNOP_BLOCKMAP call.
 548                  */
 549                 pg_offset = upl_offset & PAGE_MASK;
 550
 551                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
 552         } else {
 553                 /*
 554                  * anyone advertising a blocksize of 1 byte probably
 555                  * can't deal with us rounding up the request size
 556                  * AFP is one such filesystem/device
 557                  */
 558                 size = non_rounded_size;
 559         }
 560         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
 561                      (int)f_offset, size, upl_offset, flags, 0);
 562
 563         if (flags & CL_READ) {
 564                 io_flags = (B_READ);
 565                 bmap_flags = VNODE_READ;
 566
 567                 max_iosize  = mp->mnt_maxreadcnt;
 568                 max_vectors = mp->mnt_segreadcnt;
 569         } else {
 570                 io_flags = 0;
 571                 bmap_flags = VNODE_WRITE;
 572
 573                 max_iosize  = mp->mnt_maxwritecnt;
 574                 max_vectors = mp->mnt_segwritecnt;
 575         }
 576         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
 577
 578         /*
 579          * make sure the maximum iosize is a
 580          * multiple of the page size
 581          */
 582         max_iosize  &= ~PAGE_MASK;
 583
 584         if (flags & CL_THROTTLE) {
 585                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 586                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 587                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 588                         async_throttle = HARD_THROTTLE_MAXCNT;
 589                 } else
 590                         async_throttle = VNODE_ASYNC_THROTTLE;
 591         }
 592         if (flags & CL_AGE)
 593                 io_flags |= B_AGE;
 594         if (flags & CL_DUMP)
 595                 io_flags |= B_NOCACHE;
 596         if (flags & (CL_PAGEIN | CL_PAGEOUT))
 597                 io_flags |= B_PAGEIO;
 598         if (flags & CL_COMMIT)
 599                 io_flags |= B_COMMIT_UPL;
 600         if (flags & CL_PRESERVE)
 601                 io_flags |= B_PHYS;
 602         if (flags & CL_KEEPCACHED)
 603                 io_flags |= B_CACHE;
 604
 605         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 606                 /*
 607                  * then we are going to end up
 608                  * with a page that we can't complete (the file size wasn't a multiple
 609                  * of PAGE_SIZE and we're trying to read to the end of the file
 610                  * so we'll go ahead and zero out the portion of the page we can't
 611                  * read in from the file
 612                  */
 613                 zero_offset = upl_offset + non_rounded_size;
 614         }
 615         while (size) {
 616                 int     pg_resid;
 617                 daddr64_t blkno;
 618                 daddr64_t lblkno;
 619
 620                 if (size > max_iosize)
 621                         io_size = max_iosize;
 622                 else
 623                         io_size = size;
 624
 625                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
 626                         break;
 627                 }
 628                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
 629                         real_bp->b_blkno = blkno;
 630
 631                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 632                              (int)f_offset, (int)blkno, io_size, zero_offset, 0);
 633
 634                 if (io_size == 0) {
 635                         /*
 636                          * vnop_blockmap didn't return an error... however, it did
 637                          * return an extent size of 0 which means we can't
 638                          * make forward progress on this I/O... a hole in the
 639                          * file would be returned as a blkno of -1 with a non-zero io_size
 640                          * a real extent is returned with a blkno != -1 and a non-zero io_size
 641                          */
 642                         error = EINVAL;
 643                         break;
 644                 }
 645                 if ( !(flags & CL_READ) && blkno == -1) {
 646                         off_t e_offset;
 647
 648                         /*
 649                          * we're writing into a 'hole'
 650                          */
 651                         if (flags & CL_PAGEOUT) {
 652                                 /*
 653                                  * if we got here via cluster_pageout
 654                                  * then just error the request and return
 655                                  * the 'hole' should already have been covered
 656                                  */
 657                                 error = EINVAL;
 658                                 break;
 659                         }
 660                         if ( !(flags & CL_COMMIT)) {
 661                                 /*
 662                                  * currently writes always request the commit to happen
 663                                  * as part of the io completion... however, if the CL_COMMIT
 664                                  * flag isn't specified, than we can't issue the abort_range
 665                                  * since the call site is going to abort or commit the same upl..
 666                                  * in this case we can only return an error
 667                                  */
 668                                 error = EINVAL;
 669                                 break;
 670                         }
 671                         /*
 672                          * we can get here if the cluster code happens to
 673                          * pick up a page that was dirtied via mmap vs
 674                          * a 'write' and the page targets a 'hole'...
 675                          * i.e. the writes to the cluster were sparse
 676                          * and the file was being written for the first time
 677                          *
 678                          * we can also get here if the filesystem supports
 679                          * 'holes' that are less than PAGE_SIZE.... because
 680                          * we can't know if the range in the page that covers
 681                          * the 'hole' has been dirtied via an mmap or not,
 682                          * we have to assume the worst and try to push the
 683                          * entire page to storage.
 684                          *
 685                          * Try paging out the page individually before
 686                          * giving up entirely and dumping it (the pageout
 687                          * path will insure that the zero extent accounting
 688                          * has been taken care of before we get back into cluster_io)
 689                          */
 690                         ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
 691
 692                         e_offset = round_page_64(f_offset + 1);
 693
 694                         if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
 695                                 error = EINVAL;
 696                                 break;
 697                         }
 698                         io_size = e_offset - f_offset;
 699
 700                         f_offset   += io_size;
 701                         upl_offset += io_size;
 702
 703                         if (size >= io_size)
 704                                 size -= io_size;
 705                         else
 706                                 size = 0;
 707                         /*
 708                          * keep track of how much of the original request
 709                          * that we've actually completed... non_rounded_size
 710                          * may go negative due to us rounding the request
 711                          * to a page size multiple (i.e.  size > non_rounded_size)
 712                          */
 713                         non_rounded_size -= io_size;
 714
 715                         if (non_rounded_size <= 0) {
 716                                 /*
 717                                  * we've transferred all of the data in the original
 718                                  * request, but we were unable to complete the tail
 719                                  * of the last page because the file didn't have
 720                                  * an allocation to back that portion... this is ok.
 721                                  */
 722                                 size = 0;
 723                         }
 724                         continue;
 725                 }
 726                 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
 727                 /*
 728                  * we have now figured out how much I/O we can do - this is in 'io_size'
 729                  * pg_offset is the starting point in the first page for the I/O
 730                  * pg_count is the number of full and partial pages that 'io_size' encompasses
 731                  */
 732                 pg_offset = upl_offset & PAGE_MASK;
 733
 734                 if (flags & CL_DEV_MEMORY) {
 735                         /*
 736                          * currently, can't deal with reading 'holes' in file
 737                          */
 738                         if (blkno == -1) {
 739                                 error = EINVAL;
 740                                 break;
 741                         }
 742                         /*
 743                          * treat physical requests as one 'giant' page
 744                          */
 745                         pg_count = 1;
 746                 } else
 747                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
 748
 749                 if ((flags & CL_READ) && blkno == -1) {
 750                         int bytes_to_zero;
 751
 752                         /*
 753                          * if we're reading and blkno == -1, then we've got a
 754                          * 'hole' in the file that we need to deal with by zeroing
 755                          * out the affected area in the upl
 756                          */
 757                         if (zero_offset && io_size == size) {
 758                                 /*
 759                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 760                                  * than 'zero_offset' will be non-zero
 761                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
 762                                  * (indicated by the io_size finishing off the I/O request for this UPL)
 763                                  * than we're not going to issue an I/O for the
 764                                  * last page in this upl... we need to zero both the hole and the tail
 765                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
 766                                  */
 767                                 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
 768
 769                                 zero_offset = 0;
 770                         } else
 771                                 bytes_to_zero = io_size;
 772
 773                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
 774
 775                         if (cbp_head)
 776                                 /*
 777                                  * if there is a current I/O chain pending
 778                                  * then the first page of the group we just zero'd
 779                                  * will be handled by the I/O completion if the zero
 780                                  * fill started in the middle of the page
 781                                  */
 782                                 pg_count = (io_size - pg_offset) / PAGE_SIZE;
 783                         else {
 784                                 /*
 785                                  * no pending I/O to pick up that first page
 786                                  * so, we have to make sure it gets committed
 787                                  * here.
 788                                  * set the pg_offset to 0 so that the upl_commit_range
 789                                  * starts with this page
 790                                  */
 791                                 pg_count = (io_size + pg_offset) / PAGE_SIZE;
 792                                 pg_offset = 0;
 793                         }
 794                         if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
 795                                 /*
 796                                  * if we're done with the request for this UPL
 797                                  * then we have to make sure to commit the last page
 798                                  * even if we only partially zero-filled it
 799                                  */
 800                                 pg_count++;
 801
 802                         if (pg_count) {
 803                                 if (pg_offset)
 804                                         pg_resid = PAGE_SIZE - pg_offset;
 805                                 else
 806                                         pg_resid = 0;
 807
 808                                 if (flags & CL_COMMIT)
 809                                         ubc_upl_commit_range(upl,
 810                                                         (upl_offset + pg_resid) & ~PAGE_MASK,
 811                                                         pg_count * PAGE_SIZE,
 812                                                         UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
 813                         }
 814                         upl_offset += io_size;
 815                         f_offset   += io_size;
 816                         size       -= io_size;
 817                         /*
 818                          * keep track of how much of the original request
 819                          * that we've actually completed... non_rounded_size
 820                          * may go negative due to us rounding the request
 821                          * to a page size multiple (i.e.  size > non_rounded_size)
 822                          */
 823                         non_rounded_size -= io_size;
 824
 825                         if (non_rounded_size <= 0) {
 826                                 /*
 827                                  * we've transferred all of the data in the original
 828                                  * request, but we were unable to complete the tail
 829                                  * of the last page because the file didn't have
 830                                  * an allocation to back that portion... this is ok.
 831                                  */
 832                                 size = 0;
 833                         }
 834                         if (cbp_head && pg_count)
 835                                 goto start_io;
 836                         continue;
 837
 838                 }
 839                 if (pg_count > max_vectors) {
 840                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
 841                                 io_size = PAGE_SIZE - pg_offset;
 842                                 pg_count = 1;
 843                         } else {
 844                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
 845                                 pg_count = max_vectors;
 846                         }
 847                 }
 848
 849                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
 850                         /*
 851                          * if we're not targeting a virtual device i.e. a disk image
 852                          * it's safe to dip into the reserve pool since real devices
 853                          * can complete this I/O request without requiring additional
 854                          * bufs from the alloc_io_buf pool
 855                          */
 856                         priv = 1;
 857                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
 858                         /*
 859                          * Throttle the speculative IO
 860                          */
 861                         priv = 0;
 862                 else
 863                         priv = 1;
 864
 865                 cbp = alloc_io_buf(vp, priv);
 866
 867                 if (flags & CL_PAGEOUT) {
 868                         u_int i;
 869
 870                         for (i = 0; i < pg_count; i++) {
 871                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
 872                                         panic("BUSY bp found in cluster_io");
 873                         }
 874                 }
 875                 if (flags & CL_ASYNC) {
 876                         if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
 877                                 panic("buf_setcallback failed\n");
 878                 }
 879                 cbp->b_flags |= io_flags;
 880
 881                 cbp->b_lblkno = lblkno;
 882                 cbp->b_blkno  = blkno;
 883                 cbp->b_bcount = io_size;
 884
 885                 if (buf_setupl(cbp, upl, upl_offset))
 886                         panic("buf_setupl failed\n");
 887
 888                 cbp->b_trans_next = (buf_t)NULL;
 889
 890                 if ((cbp->b_iostate = (void *)iostate))
 891                         /*
 892                          * caller wants to track the state of this
 893                          * io... bump the amount issued against this stream
 894                          */
 895                         iostate->io_issued += io_size;
 896
 897                 if (flags & CL_READ) {
 898                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
 899                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 900                 }
 901                 else {
 902                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
 903                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 904                 }
 905
 906                 if (cbp_head) {
 907                         cbp_tail->b_trans_next = cbp;
 908                         cbp_tail = cbp;
 909                 } else {
 910                         cbp_head = cbp;
 911                         cbp_tail = cbp;
 912                 }
 913                 (buf_t)(cbp->b_trans_head) = cbp_head;
 914                 trans_count++;
 915
 916                 upl_offset += io_size;
 917                 f_offset   += io_size;
 918                 size       -= io_size;
 919                 /*
 920                  * keep track of how much of the original request
 921                  * that we've actually completed... non_rounded_size
 922                  * may go negative due to us rounding the request
 923                  * to a page size multiple (i.e.  size > non_rounded_size)
 924                  */
 925                 non_rounded_size -= io_size;
 926
 927                 if (non_rounded_size <= 0) {
 928                         /*
 929                          * we've transferred all of the data in the original
 930                          * request, but we were unable to complete the tail
 931                          * of the last page because the file didn't have
 932                          * an allocation to back that portion... this is ok.
 933                          */
 934                         size = 0;
 935                 }
 936                 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
 937                         /*
 938                          * if we have no more I/O to issue or
 939                          * the current I/O we've prepared fully
 940                          * completes the last page in this request
 941                          * and it's either an ASYNC request or
 942                          * we've already accumulated more than 8 I/O's into
 943                          * this transaction and it's not an I/O directed to
 944                          * special DEVICE memory
 945                          * then go ahead and issue the I/O
 946                          */
 947 start_io:
 948                         if (real_bp) {
 949                                 cbp_head->b_flags |= B_NEED_IODONE;
 950                                 cbp_head->b_real_bp = real_bp;
 951                         } else
 952                                 cbp_head->b_real_bp = (buf_t)NULL;
 953
 954                         if (size == 0) {
 955                                 /*
 956                                  * we're about to issue the last I/O for this upl
 957                                  * if this was a read to the eof and the eof doesn't
 958                                  * finish on a page boundary, than we need to zero-fill
 959                                  * the rest of the page....
 960                                  */
 961                                 cbp_head->b_validend = zero_offset;
 962                         } else
 963                                 cbp_head->b_validend = 0;
 964
 965                         if (flags & CL_THROTTLE)
 966                                 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
 967
 968                         for (cbp = cbp_head; cbp;) {
 969                                 buf_t   cbp_next;
 970
 971                                 if ( !(io_flags & B_READ))
 972                                         vnode_startwrite(vp);
 973
 974                                 cbp_next = cbp->b_trans_next;
 975
 976                                 (void) VNOP_STRATEGY(cbp);
 977                                 cbp = cbp_next;
 978                         }
 979                         if ( !(flags & CL_ASYNC)) {
 980                                 int dummy;
 981
 982                                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 983                                         buf_biowait(cbp);
 984
 985                                 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
 986                                         if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
 987                                                 error = 0;      /* drop the error */
 988                                         else {
 989                                                 if (retval == 0)
 990                                                         retval = error;
 991                                                 error = 0;
 992                                         }
 993                                 }
 994                         }
 995                         cbp_head = (buf_t)NULL;
 996                         cbp_tail = (buf_t)NULL;
 997
 998                         trans_count = 0;
 999                 }
1000         }
1001         if (error) {
1002                 int abort_size;
1003
1004                 io_size = 0;
1005
1006                 for (cbp = cbp_head; cbp;) {
1007                         buf_t   cbp_next;
1008
1009                         upl_offset -= cbp->b_bcount;
1010                         size       += cbp->b_bcount;
1011                         io_size    += cbp->b_bcount;
1012
1013                         cbp_next = cbp->b_trans_next;
1014                         free_io_buf(cbp);
1015                         cbp = cbp_next;
1016                 }
1017                 if (iostate) {
1018                         int need_wakeup = 0;
1019
1020                         /*
1021                          * update the error condition for this stream
1022                          * since we never really issued the io
1023                          * just go ahead and adjust it back
1024                          */
1025                         lck_mtx_lock(cl_mtxp);
1026
1027                         if (iostate->io_error == 0)
1028                                 iostate->io_error = error;
1029                         iostate->io_issued -= io_size;
1030
1031                         if (iostate->io_wanted) {
1032                                 /*
1033                                  * someone is waiting for the state of
1034                                  * this io stream to change
1035                                  */
1036                                 iostate->io_wanted = 0;
1037                                 need_wakeup = 0;
1038                         }
1039                         lck_mtx_unlock(cl_mtxp);
1040
1041                         if (need_wakeup)
1042                                 wakeup((caddr_t)&iostate->io_wanted);
1043                 }
1044                 pg_offset  = upl_offset & PAGE_MASK;
1045                 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1046
1047                 if (flags & CL_COMMIT) {
1048                         int upl_abort_code;
1049
1050                         if (flags & CL_PRESERVE) {
1051                                 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1052                                                      UPL_COMMIT_FREE_ON_EMPTY);
1053                         } else {
1054                                 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1055                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1056                                 else if (flags & CL_PAGEIN)
1057                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1058                                 else
1059                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1060
1061                                 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1062                                                 upl_abort_code);
1063                         }
1064                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1065                                      (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1066                 }
1067                 if (real_bp) {
1068                         real_bp->b_flags |= B_ERROR;
1069                         real_bp->b_error  = error;
1070
1071                         buf_biodone(real_bp);
1072                 }
1073                 if (retval == 0)
1074                         retval = error;
1075         }
1076         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1077                      (int)f_offset, size, upl_offset, retval, 0);
1078
1079         return (retval);
1080 }
1081
1082
1083 static int
1084 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1085 {
1086         int           pages_in_prefetch;
1087
1088         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1089                      (int)f_offset, size, (int)filesize, 0, 0);
1090
1091         if (f_offset >= filesize) {
1092                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1093                              (int)f_offset, 0, 0, 0, 0);
1094                 return(0);
1095         }
1096         if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1097                 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1098         else
1099                 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1100
1101         if ((off_t)size > (filesize - f_offset))
1102                 size = filesize - f_offset;
1103         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1104
1105         advisory_read(vp, filesize, f_offset, size);
1106
1107         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1108                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1109
1110         return (pages_in_prefetch);
1111 }
1112
1113
1114
1115 static void
1116 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1117 {
1118         daddr64_t       r_addr;
1119         off_t           f_offset;
1120         int             size_of_prefetch;
1121
1122
1123         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1124                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1125
1126         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1127                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1128                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1129                 return;
1130         }
1131         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1132                                    (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1133                 rap->cl_ralen = 0;
1134                 rap->cl_maxra = 0;
1135
1136                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1137                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1138
1139                 return;
1140         }
1141         if (extent->e_addr < rap->cl_maxra) {
1142                 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1143
1144                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1145                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1146                         return;
1147                 }
1148         }
1149         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1150         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1151
1152         size_of_prefetch = 0;
1153
1154         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1155
1156         if (size_of_prefetch) {
1157                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1158                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1159                 return;
1160         }
1161         if (f_offset < filesize) {
1162                 daddr64_t read_size;
1163
1164                 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1165
1166                 read_size = (extent->e_addr + 1) - extent->b_addr;
1167
1168                 if (read_size > rap->cl_ralen) {
1169                         if (read_size > MAX_UPL_TRANSFER)
1170                                 rap->cl_ralen = MAX_UPL_TRANSFER;
1171                         else
1172                                 rap->cl_ralen = read_size;
1173                 }
1174                 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1175
1176                 if (size_of_prefetch)
1177                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1178         }
1179         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1180                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1181 }
1182
1183 int
1184 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1185                 int size, off_t filesize, int flags)
1186 {
1187         int           io_size;
1188         int           rounded_size;
1189         off_t         max_size;
1190         int           local_flags;
1191         struct cl_writebehind *wbp;
1192
1193         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1194                 /*
1195                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1196                  * then we don't want to enforce this throttle... if we do, we can
1197                  * potentially deadlock since we're stalling the pageout thread at a time
1198                  * when the disk image might need additional memory (which won't be available
1199                  * if the pageout thread can't run)... instead we'll just depend on the throttle
1200                  * that the pageout thread now has in place to deal with external files
1201                  */
1202                 local_flags = CL_PAGEOUT;
1203         else
1204                 local_flags = CL_PAGEOUT | CL_THROTTLE;
1205
1206         if ((flags & UPL_IOSYNC) == 0)
1207                 local_flags |= CL_ASYNC;
1208         if ((flags & UPL_NOCOMMIT) == 0)
1209                 local_flags |= CL_COMMIT;
1210         if ((flags & UPL_KEEPCACHED))
1211                 local_flags |= CL_KEEPCACHED;
1212
1213
1214         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1215                      (int)f_offset, size, (int)filesize, local_flags, 0);
1216
1217         /*
1218          * If they didn't specify any I/O, then we are done...
1219          * we can't issue an abort because we don't know how
1220          * big the upl really is
1221          */
1222         if (size <= 0)
1223                 return (EINVAL);
1224
1225         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1226                 if (local_flags & CL_COMMIT)
1227                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1228                 return (EROFS);
1229         }
1230         /*
1231          * can't page-in from a negative offset
1232          * or if we're starting beyond the EOF
1233          * or if the file offset isn't page aligned
1234          * or the size requested isn't a multiple of PAGE_SIZE
1235          */
1236         if (f_offset < 0 || f_offset >= filesize ||
1237            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1238                 if (local_flags & CL_COMMIT)
1239                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1240                 return (EINVAL);
1241         }
1242         max_size = filesize - f_offset;
1243
1244         if (size < max_size)
1245                 io_size = size;
1246         else
1247                 io_size = max_size;
1248
1249         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1250
1251         if (size > rounded_size) {
1252                 if (local_flags & CL_COMMIT)
1253                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1254                                         UPL_ABORT_FREE_ON_EMPTY);
1255         }
1256         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1257                 wbp->cl_hasbeenpaged = 1;
1258
1259         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1260                            local_flags, (buf_t)NULL, (struct clios *)NULL));
1261 }
1262
1263 int
1264 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1265                int size, off_t filesize, int flags)
1266 {
1267         u_int         io_size;
1268         int           rounded_size;
1269         off_t         max_size;
1270         int           retval;
1271         int           local_flags = 0;
1272
1273         if (upl == NULL || size < 0)
1274                 panic("cluster_pagein: NULL upl passed in");
1275
1276         if ((flags & UPL_IOSYNC) == 0)
1277                 local_flags |= CL_ASYNC;
1278         if ((flags & UPL_NOCOMMIT) == 0)
1279                 local_flags |= CL_COMMIT;
1280
1281
1282         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1283                      (int)f_offset, size, (int)filesize, local_flags, 0);
1284
1285         /*
1286          * can't page-in from a negative offset
1287          * or if we're starting beyond the EOF
1288          * or if the file offset isn't page aligned
1289          * or the size requested isn't a multiple of PAGE_SIZE
1290          */
1291         if (f_offset < 0 || f_offset >= filesize ||
1292            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1293                 if (local_flags & CL_COMMIT)
1294                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1295                 return (EINVAL);
1296         }
1297         max_size = filesize - f_offset;
1298
1299         if (size < max_size)
1300                 io_size = size;
1301         else
1302                 io_size = max_size;
1303
1304         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1305
1306         if (size > rounded_size && (local_flags & CL_COMMIT))
1307                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1308                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1309
1310         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1311                            local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1312
1313         if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1314                 struct cl_readahead *rap;
1315
1316                 rap = cluster_get_rap(vp);
1317
1318                 if (rap != NULL) {
1319                         struct cl_extent extent;
1320
1321                         extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1322                         extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1323
1324                         if (rounded_size == PAGE_SIZE) {
1325                                 /*
1326                                  * we haven't read the last page in of the file yet
1327                                  * so let's try to read ahead if we're in
1328                                  * a sequential access pattern
1329                                  */
1330                                 cluster_rd_ahead(vp, &extent, filesize, rap);
1331                         }
1332                         rap->cl_lastr = extent.e_addr;
1333
1334                         lck_mtx_unlock(&rap->cl_lockr);
1335                 }
1336         }
1337         return (retval);
1338 }
1339
1340 int
1341 cluster_bp(buf_t bp)
1342 {
1343         off_t  f_offset;
1344         int    flags;
1345
1346         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1347                      (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1348
1349         if (bp->b_flags & B_READ)
1350                 flags = CL_ASYNC | CL_READ;
1351         else
1352                 flags = CL_ASYNC;
1353
1354         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1355
1356         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1357 }
1358
1359 int
1360 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1361 {
1362         int           prev_resid;
1363         u_int         clip_size;
1364         off_t         max_io_size;
1365         int           upl_size;
1366         int           upl_flags;
1367         upl_t         upl;
1368         int           retval = 0;
1369         int           flags;
1370
1371         flags = xflags;
1372
1373         if (vp->v_flag & VNOCACHE_DATA)
1374                 flags |= IO_NOCACHE;
1375
1376         if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1377                 /*
1378                  * go do a write through the cache if one of the following is true....
1379                  *   NOCACHE is not true
1380                  *   there is no uio structure or it doesn't target USERSPACE
1381                  */
1382                 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1383         }
1384
1385 #if LP64_DEBUG
1386         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1387                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1388         }
1389 #endif /* LP64_DEBUG */
1390
1391         while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1392                 user_size_t     iov_len;
1393                 user_addr_t     iov_base;
1394
1395                 /*
1396                  * we know we have a resid, so this is safe
1397                  * skip over any emtpy vectors
1398                  */
1399                 uio_update(uio, (user_size_t)0);
1400
1401                 iov_len  = uio_curriovlen(uio);
1402                 iov_base = uio_curriovbase(uio);
1403
1404                 upl_size  = PAGE_SIZE;
1405                 upl_flags = UPL_QUERY_OBJECT_TYPE;
1406
1407                 // LP64todo - fix this!
1408                 if ((vm_map_get_upl(current_map(),
1409                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1410                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1411                         /*
1412                          * the user app must have passed in an invalid address
1413                          */
1414                         return (EFAULT);
1415                 }
1416
1417                 /*
1418                  * We check every vector target but if it is physically
1419                  * contiguous space, we skip the sanity checks.
1420                  */
1421                 if (upl_flags & UPL_PHYS_CONTIG) {
1422                         int zflags;
1423
1424                         zflags = flags & ~IO_TAILZEROFILL;
1425                         zflags |= IO_HEADZEROFILL;
1426
1427                         if (flags & IO_HEADZEROFILL) {
1428                                 /*
1429                                  * in case we have additional vectors, we don't want to do this again
1430                                  */
1431                                 flags &= ~IO_HEADZEROFILL;
1432
1433                                 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1434                                         return(retval);
1435                         }
1436                         retval = cluster_phys_write(vp, uio, newEOF);
1437
1438                         if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1439                                 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1440                         }
1441                 }
1442                 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1443                         /*
1444                          * we're here because we're don't have a physically contiguous target buffer
1445                          * go do a write through the cache if one of the following is true....
1446                          *   the total xfer size is less than a page...
1447                          *   we're being asked to ZEROFILL either the head or the tail of the I/O...
1448                          */
1449                         return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1450                 }
1451                 // LP64todo - fix this!
1452                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1453                         if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1454                                 /*
1455                                  * Bring the file offset write up to a pagesize boundary
1456                                  * this will also bring the base address to a page boundary
1457                                  * since they both are currently on the same offset within a page
1458                                  * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1459                                  * so the computed clip_size must always be less than the current uio_resid
1460                                  */
1461                                 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1462
1463                                 /*
1464                                  * Fake the resid going into the cluster_write_x call
1465                                  * and restore it on the way out.
1466                                  */
1467                                 // LP64todo - fix this
1468                                 prev_resid = uio_resid(uio);
1469                                 uio_setresid(uio, clip_size);
1470
1471                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1472
1473                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1474                         } else {
1475                                 /*
1476                                  * can't get both the file offset and the buffer offset aligned to a page boundary
1477                                  * so fire an I/O through the cache for this entire vector
1478                                  */
1479                                 // LP64todo - fix this
1480                                 clip_size = iov_len;
1481                                 // LP64todo - fix this
1482                                 prev_resid = uio_resid(uio);
1483                                 uio_setresid(uio, clip_size);
1484
1485                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1486
1487                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1488                         }
1489                 } else {
1490                         /*
1491                          * If we come in here, we know the offset into
1492                          * the file is on a pagesize boundary and the
1493                          * target buffer address is also on a page boundary
1494                          */
1495                         max_io_size = newEOF - uio->uio_offset;
1496                         // LP64todo - fix this
1497                         clip_size = uio_resid(uio);
1498                         if (iov_len < clip_size)
1499                                 // LP64todo - fix this!
1500                                 clip_size = iov_len;
1501                         if (max_io_size < clip_size)
1502                                 clip_size = max_io_size;
1503
1504                         if (clip_size < PAGE_SIZE) {
1505                                 /*
1506                                  * Take care of tail end of write in this vector
1507                                  */
1508                                 // LP64todo - fix this
1509                                 prev_resid = uio_resid(uio);
1510                                 uio_setresid(uio, clip_size);
1511
1512                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1513
1514                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1515                         } else {
1516                                 /* round clip_size down to a multiple of pagesize */
1517                                 clip_size = clip_size & ~(PAGE_MASK);
1518                                 // LP64todo - fix this
1519                                 prev_resid = uio_resid(uio);
1520                                 uio_setresid(uio, clip_size);
1521
1522                                 retval = cluster_nocopy_write(vp, uio, newEOF);
1523
1524                                 if ((retval == 0) && uio_resid(uio))
1525                                         retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1526
1527                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1528                         }
1529                 } /* end else */
1530         } /* end while */
1531
1532         return(retval);
1533 }
1534
1535
1536 static int
1537 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1538 {
1539         upl_t            upl;
1540         upl_page_info_t  *pl;
1541         vm_offset_t      upl_offset;
1542         int              io_size;
1543         int              io_flag;
1544         int              upl_size;
1545         int              upl_needed_size;
1546         int              pages_in_pl;
1547         int              upl_flags;
1548         kern_return_t    kret;
1549         int              i;
1550         int              force_data_sync;
1551         int              error  = 0;
1552         struct clios     iostate;
1553         struct cl_writebehind *wbp;
1554
1555
1556         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1557                      (int)uio->uio_offset, (int)uio_resid(uio),
1558                      (int)newEOF, 0, 0);
1559
1560         /*
1561          * When we enter this routine, we know
1562          *  -- the offset into the file is on a pagesize boundary
1563          *  -- the resid is a page multiple
1564          *  -- the resid will not exceed iov_len
1565          */
1566
1567         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1568
1569                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1570
1571                 lck_mtx_unlock(&wbp->cl_lockw);
1572         }
1573         iostate.io_completed = 0;
1574         iostate.io_issued = 0;
1575         iostate.io_error = 0;
1576         iostate.io_wanted = 0;
1577
1578         while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1579                 user_addr_t     iov_base;
1580
1581                 io_size = uio_resid(uio);
1582
1583                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1584                         io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1585
1586                 iov_base = uio_curriovbase(uio);
1587
1588                 // LP64todo - fix this!
1589                 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1590
1591                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1592
1593                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1594                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1595
1596                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1597                         pages_in_pl = 0;
1598                         upl_size = upl_needed_size;
1599                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1600                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1601
1602                         // LP64todo - fix this!
1603                         kret = vm_map_get_upl(current_map(),
1604                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1605                                               &upl_size,
1606                                               &upl,
1607                                               NULL,
1608                                               &pages_in_pl,
1609                                               &upl_flags,
1610                                               force_data_sync);
1611
1612                         if (kret != KERN_SUCCESS) {
1613                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1614                                              0, 0, 0, kret, 0);
1615                                 /*
1616                                  * cluster_nocopy_write: failed to get pagelist
1617                                  *
1618                                  * we may have already spun some portion of this request
1619                                  * off as async requests... we need to wait for the I/O
1620                                  * to complete before returning
1621                                  */
1622                                 goto wait_for_writes;
1623                         }
1624                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1625                         pages_in_pl = upl_size / PAGE_SIZE;
1626
1627                         for (i = 0; i < pages_in_pl; i++) {
1628                                 if (!upl_valid_page(pl, i))
1629                                         break;
1630                         }
1631                         if (i == pages_in_pl)
1632                                 break;
1633
1634                         /*
1635                          * didn't get all the pages back that we
1636                          * needed... release this upl and try again
1637                          */
1638                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1639                                             UPL_ABORT_FREE_ON_EMPTY);
1640                 }
1641                 if (force_data_sync >= 3) {
1642                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1643                                      i, pages_in_pl, upl_size, kret, 0);
1644                         /*
1645                          * for some reason, we couldn't acquire a hold on all
1646                          * the pages needed in the user's address space
1647                          *
1648                          * we may have already spun some portion of this request
1649                          * off as async requests... we need to wait for the I/O
1650                          * to complete before returning
1651                          */
1652                         goto wait_for_writes;
1653                 }
1654
1655                 /*
1656                  * Consider the possibility that upl_size wasn't satisfied.
1657                  */
1658                 if (upl_size != upl_needed_size)
1659                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1660
1661                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1662                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1663
1664                 if (io_size == 0) {
1665                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1666                                             UPL_ABORT_FREE_ON_EMPTY);
1667                         /*
1668                          * we may have already spun some portion of this request
1669                          * off as async requests... we need to wait for the I/O
1670                          * to complete before returning
1671                          */
1672                         goto wait_for_writes;
1673                 }
1674                 /*
1675                  * Now look for pages already in the cache
1676                  * and throw them away.
1677                  * uio->uio_offset is page aligned within the file
1678                  * io_size is a multiple of PAGE_SIZE
1679                  */
1680                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1681
1682                 /*
1683                  * we want push out these writes asynchronously so that we can overlap
1684                  * the preparation of the next I/O
1685                  * if there are already too many outstanding writes
1686                  * wait until some complete before issuing the next
1687                  */
1688                 lck_mtx_lock(cl_mtxp);
1689
1690                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1691                         iostate.io_wanted = 1;
1692                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1693                 }
1694                 lck_mtx_unlock(cl_mtxp);
1695
1696                 if (iostate.io_error) {
1697                         /*
1698                          * one of the earlier writes we issued ran into a hard error
1699                          * don't issue any more writes, cleanup the UPL
1700                          * that was just created but not used, then
1701                          * go wait for all writes that are part of this stream
1702                          * to complete before returning the error to the caller
1703                          */
1704                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1705                                             UPL_ABORT_FREE_ON_EMPTY);
1706
1707                         goto wait_for_writes;
1708                 }
1709                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1710
1711                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1712                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1713
1714                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1715                                    io_size, io_flag, (buf_t)NULL, &iostate);
1716
1717                 uio_update(uio, (user_size_t)io_size);
1718
1719                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1720                              (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1721
1722         } /* end while */
1723
1724 wait_for_writes:
1725         /*
1726          * make sure all async writes issued as part of this stream
1727          * have completed before we return
1728          */
1729         lck_mtx_lock(cl_mtxp);
1730
1731         while (iostate.io_issued != iostate.io_completed) {
1732                 iostate.io_wanted = 1;
1733                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1734         }
1735         lck_mtx_unlock(cl_mtxp);
1736
1737         if (iostate.io_error)
1738                 error = iostate.io_error;
1739
1740         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1741                      (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1742
1743         return (error);
1744 }
1745
1746
1747 static int
1748 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1749 {
1750         upl_page_info_t *pl;
1751         addr64_t         src_paddr;
1752         upl_t            upl;
1753         vm_offset_t      upl_offset;
1754         int              tail_size;
1755         int              io_size;
1756         int              upl_size;
1757         int              upl_needed_size;
1758         int              pages_in_pl;
1759         int              upl_flags;
1760         kern_return_t    kret;
1761         int              error  = 0;
1762         user_addr_t      iov_base;
1763         int              devblocksize;
1764         struct cl_writebehind *wbp;
1765
1766         devblocksize = vp->v_mount->mnt_devblocksize;
1767         /*
1768          * When we enter this routine, we know
1769          *  -- the resid will not exceed iov_len
1770          *  -- the vector target address is physcially contiguous
1771          */
1772         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1773
1774                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1775
1776                 lck_mtx_unlock(&wbp->cl_lockw);
1777         }
1778 #if LP64_DEBUG
1779         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1780                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1781         }
1782 #endif /* LP64_DEBUG */
1783
1784         // LP64todo - fix this!
1785         io_size = (int)uio_curriovlen(uio);
1786         iov_base = uio_curriovbase(uio);
1787
1788         upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1789         upl_needed_size = upl_offset + io_size;
1790
1791         pages_in_pl = 0;
1792         upl_size = upl_needed_size;
1793         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1794                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1795
1796         // LP64todo - fix this!
1797         kret = vm_map_get_upl(current_map(),
1798                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1799                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1800
1801         if (kret != KERN_SUCCESS) {
1802                 /*
1803                  * cluster_phys_write: failed to get pagelist
1804                  * note: return kret here
1805                  */
1806               return(EINVAL);
1807         }
1808         /*
1809          * Consider the possibility that upl_size wasn't satisfied.
1810          * This is a failure in the physical memory case.
1811          */
1812         if (upl_size < upl_needed_size) {
1813                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1814                 return(EINVAL);
1815         }
1816         pl = ubc_upl_pageinfo(upl);
1817
1818         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1819
1820         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1821                 int   head_size;
1822
1823                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1824
1825                 if (head_size > io_size)
1826                         head_size = io_size;
1827
1828                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1829
1830                 if (error) {
1831                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1832
1833                         return(EINVAL);
1834                 }
1835                 upl_offset += head_size;
1836                 src_paddr  += head_size;
1837                 io_size    -= head_size;
1838         }
1839         tail_size = io_size & (devblocksize - 1);
1840         io_size  -= tail_size;
1841
1842         if (io_size) {
1843                 /*
1844                  * issue a synchronous write to cluster_io
1845                  */
1846                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1847                                    io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1848         }
1849         if (error == 0) {
1850                 /*
1851                  * The cluster_io write completed successfully,
1852                  * update the uio structure
1853                  */
1854                 uio_update(uio, (user_size_t)io_size);
1855
1856                 src_paddr += io_size;
1857
1858                 if (tail_size)
1859                         error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1860         }
1861         /*
1862          * just release our hold on the physically contiguous
1863          * region without changing any state
1864          */
1865         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1866
1867         return (error);
1868 }
1869
1870
1871 static int
1872 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1873 {
1874         upl_page_info_t *pl;
1875         upl_t            upl;
1876         vm_offset_t      upl_offset = 0;
1877         int              upl_size;
1878         off_t            upl_f_offset;
1879         int              pages_in_upl;
1880         int              start_offset;
1881         int              xfer_resid;
1882         int              io_size;
1883         int              io_offset;
1884         int              bytes_to_zero;
1885         int              bytes_to_move;
1886         kern_return_t    kret;
1887         int              retval = 0;
1888         int              io_resid;
1889         long long        total_size;
1890         long long        zero_cnt;
1891         off_t            zero_off;
1892         long long        zero_cnt1;
1893         off_t            zero_off1;
1894         struct cl_extent cl;
1895         int              intersection;
1896         struct cl_writebehind *wbp;
1897
1898         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1899         {
1900                 if (wbp->cl_hasbeenpaged) {
1901                         /*
1902                          * this vnode had pages cleaned to it by
1903                          * the pager which indicates that either
1904                          * it's not very 'hot', or the system is
1905                          * being overwhelmed by a lot of dirty
1906                          * data being delayed in the VM cache...
1907                          * in either event, we'll push our remaining
1908                          * delayed data at this point...  this will
1909                          * be more efficient than paging out 1 page at
1910                          * a time, and will also act as a throttle
1911                          * by delaying this client from writing any
1912                          * more data until all his delayed data has
1913                          * at least been queued to the uderlying driver.
1914                          */
1915                         if (wbp->cl_number || wbp->cl_scmap)
1916                                 cluster_push_EOF(vp, newEOF);
1917
1918                         wbp->cl_hasbeenpaged = 0;
1919                 }
1920         }
1921         if (uio) {
1922                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1923                              (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1924
1925                 // LP64todo - fix this
1926                 io_resid = uio_resid(uio);
1927         } else {
1928                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1929                              0, 0, (int)oldEOF, (int)newEOF, 0);
1930
1931                 io_resid = 0;
1932         }
1933         zero_cnt  = 0;
1934         zero_cnt1 = 0;
1935         zero_off  = 0;
1936         zero_off1 = 0;
1937
1938         if (flags & IO_HEADZEROFILL) {
1939                 /*
1940                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1941                  * so we zero fill the intervening space between the old EOF and the offset
1942                  * where the next chunk of real data begins.... ftruncate will also use this
1943                  * routine to zero fill to the new EOF when growing a file... in this case, the
1944                  * uio structure will not be provided
1945                  */
1946                 if (uio) {
1947                         if (headOff < uio->uio_offset) {
1948                                 zero_cnt = uio->uio_offset - headOff;
1949                                 zero_off = headOff;
1950                         }
1951                 } else if (headOff < newEOF) {
1952                         zero_cnt = newEOF - headOff;
1953                         zero_off = headOff;
1954                 }
1955         }
1956         if (flags & IO_TAILZEROFILL) {
1957                 if (uio) {
1958                         // LP64todo - fix this
1959                         zero_off1 = uio->uio_offset + uio_resid(uio);
1960
1961                         if (zero_off1 < tailOff)
1962                                 zero_cnt1 = tailOff - zero_off1;
1963                 }
1964         }
1965         if (zero_cnt == 0 && uio == (struct uio *) 0) {
1966                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1967                              retval, 0, 0, 0, 0);
1968                 return (0);
1969         }
1970
1971         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1972                 /*
1973                  * for this iteration of the loop, figure out where our starting point is
1974                  */
1975                 if (zero_cnt) {
1976                         start_offset = (int)(zero_off & PAGE_MASK_64);
1977                         upl_f_offset = zero_off - start_offset;
1978                 } else if (io_resid) {
1979                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1980                         upl_f_offset = uio->uio_offset - start_offset;
1981                 } else {
1982                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1983                         upl_f_offset = zero_off1 - start_offset;
1984                 }
1985                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1986                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1987
1988                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1989                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1990
1991                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1992
1993                 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1994                         /*
1995                          * assumption... total_size <= io_resid
1996                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1997                          */
1998                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1999                                 total_size -= start_offset;
2000                         xfer_resid = total_size;
2001
2002                         retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2003
2004                         if (retval)
2005                                 break;
2006
2007                         io_resid   -= (total_size - xfer_resid);
2008                         total_size   = xfer_resid;
2009                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2010                         upl_f_offset = uio->uio_offset - start_offset;
2011
2012                         if (total_size == 0) {
2013                                 if (start_offset) {
2014                                         /*
2015                                          * the write did not finish on a page boundary
2016                                          * which will leave upl_f_offset pointing to the
2017                                          * beginning of the last page written instead of
2018                                          * the page beyond it... bump it in this case
2019                                          * so that the cluster code records the last page
2020                                          * written as dirty
2021                                          */
2022                                         upl_f_offset += PAGE_SIZE_64;
2023                                 }
2024                                 upl_size = 0;
2025
2026                                 goto check_cluster;
2027                         }
2028                 }
2029                 /*
2030                  * compute the size of the upl needed to encompass
2031                  * the requested write... limit each call to cluster_io
2032                  * to the maximum UPL size... cluster_io will clip if
2033                  * this exceeds the maximum io_size for the device,
2034                  * make sure to account for
2035                  * a starting offset that's not page aligned
2036                  */
2037                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2038
2039                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2040                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2041
2042                 pages_in_upl = upl_size / PAGE_SIZE;
2043                 io_size      = upl_size - start_offset;
2044
2045                 if ((long long)io_size > total_size)
2046                         io_size = total_size;
2047
2048                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2049
2050
2051                 /*
2052                  * Gather the pages from the buffer cache.
2053                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2054                  * that we intend to modify these pages.
2055                  */
2056                 kret = ubc_create_upl(vp,
2057                                       upl_f_offset,
2058                                       upl_size,
2059                                       &upl,
2060                                       &pl,
2061                                       UPL_SET_LITE | UPL_WILL_MODIFY);
2062                 if (kret != KERN_SUCCESS)
2063                         panic("cluster_write: failed to get pagelist");
2064
2065                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2066                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2067
2068                 if (start_offset && !upl_valid_page(pl, 0)) {
2069                         int   read_size;
2070
2071                         /*
2072                          * we're starting in the middle of the first page of the upl
2073                          * and the page isn't currently valid, so we're going to have
2074                          * to read it in first... this is a synchronous operation
2075                          */
2076                         read_size = PAGE_SIZE;
2077
2078                         if ((upl_f_offset + read_size) > newEOF)
2079                                 read_size = newEOF - upl_f_offset;
2080
2081                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2082                                             CL_READ, (buf_t)NULL, (struct clios *)NULL);
2083                         if (retval) {
2084                                 /*
2085                                  * we had an error during the read which causes us to abort
2086                                  * the current cluster_write request... before we do, we need
2087                                  * to release the rest of the pages in the upl without modifying
2088                                  * there state and mark the failed page in error
2089                                  */
2090                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2091
2092                                 if (upl_size > PAGE_SIZE)
2093                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2094
2095                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2096                                              (int)upl, 0, 0, retval, 0);
2097                                 break;
2098                         }
2099                 }
2100                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2101                         /*
2102                          * the last offset we're writing to in this upl does not end on a page
2103                          * boundary... if it's not beyond the old EOF, then we'll also need to
2104                          * pre-read this page in if it isn't already valid
2105                          */
2106                         upl_offset = upl_size - PAGE_SIZE;
2107
2108                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2109                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2110                                 int   read_size;
2111
2112                                 read_size = PAGE_SIZE;
2113
2114                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2115                                         read_size = newEOF - (upl_f_offset + upl_offset);
2116
2117                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2118                                                     CL_READ, (buf_t)NULL, (struct clios *)NULL);
2119                                 if (retval) {
2120                                         /*
2121                                          * we had an error during the read which causes us to abort
2122                                          * the current cluster_write request... before we do, we
2123                                          * need to release the rest of the pages in the upl without
2124                                          * modifying there state and mark the failed page in error
2125                                          */
2126                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2127
2128                                         if (upl_size > PAGE_SIZE)
2129                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2130
2131                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2132                                                      (int)upl, 0, 0, retval, 0);
2133                                         break;
2134                                 }
2135                         }
2136                 }
2137                 xfer_resid = io_size;
2138                 io_offset = start_offset;
2139
2140                 while (zero_cnt && xfer_resid) {
2141
2142                         if (zero_cnt < (long long)xfer_resid)
2143                                 bytes_to_zero = zero_cnt;
2144                         else
2145                                 bytes_to_zero = xfer_resid;
2146
2147                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2148                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2149                         } else {
2150                                 int zero_pg_index;
2151
2152                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2153                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2154
2155                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2156                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2157
2158                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2159                                            !upl_dirty_page(pl, zero_pg_index)) {
2160                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2161                                 }
2162                         }
2163                         xfer_resid -= bytes_to_zero;
2164                         zero_cnt   -= bytes_to_zero;
2165                         zero_off   += bytes_to_zero;
2166                         io_offset  += bytes_to_zero;
2167                 }
2168                 if (xfer_resid && io_resid) {
2169                         bytes_to_move = min(io_resid, xfer_resid);
2170
2171                         retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2172
2173                         if (retval) {
2174
2175                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2176
2177                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2178                                              (int)upl, 0, 0, retval, 0);
2179                         } else {
2180                                 io_resid  -= bytes_to_move;
2181                                 xfer_resid -= bytes_to_move;
2182                                 io_offset  += bytes_to_move;
2183                         }
2184                 }
2185                 while (xfer_resid && zero_cnt1 && retval == 0) {
2186
2187                         if (zero_cnt1 < (long long)xfer_resid)
2188                                 bytes_to_zero = zero_cnt1;
2189                         else
2190                                 bytes_to_zero = xfer_resid;
2191
2192                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2193                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2194                         } else {
2195                                 int zero_pg_index;
2196
2197                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2198                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2199
2200                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2201                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2202                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2203                                            !upl_dirty_page(pl, zero_pg_index)) {
2204                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2205                                 }
2206                         }
2207                         xfer_resid -= bytes_to_zero;
2208                         zero_cnt1  -= bytes_to_zero;
2209                         zero_off1  += bytes_to_zero;
2210                         io_offset  += bytes_to_zero;
2211                 }
2212
2213                 if (retval == 0) {
2214                         int cl_index;
2215                         int can_delay;
2216
2217                         io_size += start_offset;
2218
2219                         if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2220                                 /*
2221                                  * if we're extending the file with this write
2222                                  * we'll zero fill the rest of the page so that
2223                                  * if the file gets extended again in such a way as to leave a
2224                                  * hole starting at this EOF, we'll have zero's in the correct spot
2225                                  */
2226                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2227                         }
2228                         if (flags & IO_SYNC)
2229                                 /*
2230                                  * if the IO_SYNC flag is set than we need to
2231                                  * bypass any clusters and immediately issue
2232                                  * the I/O
2233                                  */
2234                                 goto issue_io;
2235 check_cluster:
2236                         /*
2237                          * take the lock to protect our accesses
2238                          * of the writebehind and sparse cluster state
2239                          */
2240                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2241
2242                         /*
2243                          * calculate the last logical block number
2244                          * that this delayed I/O encompassed
2245                          */
2246                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2247
2248                         if (wbp->cl_scmap) {
2249
2250                                 if ( !(flags & IO_NOCACHE)) {
2251                                         /*
2252                                          * we've fallen into the sparse
2253                                          * cluster method of delaying dirty pages
2254                                          * first, we need to release the upl if we hold one
2255                                          * since pages in it may be present in the sparse cluster map
2256                                          * and may span 2 separate buckets there... if they do and
2257                                          * we happen to have to flush a bucket to make room and it intersects
2258                                          * this upl, a deadlock may result on page BUSY
2259                                          */
2260                                         if (upl_size)
2261                                                 ubc_upl_commit_range(upl, 0, upl_size,
2262                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2263
2264                                         sparse_cluster_add(wbp, vp, &cl, newEOF);
2265
2266                                         lck_mtx_unlock(&wbp->cl_lockw);
2267
2268                                         continue;
2269                                 }
2270                                 /*
2271                                  * must have done cached writes that fell into
2272                                  * the sparse cluster mechanism... we've switched
2273                                  * to uncached writes on the file, so go ahead
2274                                  * and push whatever's in the sparse map
2275                                  * and switch back to normal clustering
2276                                  *
2277                                  * see the comment above concerning a possible deadlock...
2278                                  */
2279                                 if (upl_size) {
2280                                         ubc_upl_commit_range(upl, 0, upl_size,
2281                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2282                                         /*
2283                                          * setting upl_size to 0 keeps us from committing a
2284                                          * second time in the start_new_cluster path
2285                                          */
2286                                         upl_size = 0;
2287                                 }
2288                                 sparse_cluster_push(wbp, vp, newEOF, 1);
2289
2290                                 wbp->cl_number = 0;
2291                                 /*
2292                                  * no clusters of either type present at this point
2293                                  * so just go directly to start_new_cluster since
2294                                  * we know we need to delay this I/O since we've
2295                                  * already released the pages back into the cache
2296                                  * to avoid the deadlock with sparse_cluster_push
2297                                  */
2298                                 goto start_new_cluster;
2299                         }
2300                         upl_offset = 0;
2301
2302                         if (wbp->cl_number == 0)
2303                                 /*
2304                                  * no clusters currently present
2305                                  */
2306                                 goto start_new_cluster;
2307
2308                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2309                                 /*
2310                                  * check each cluster that we currently hold
2311                                  * try to merge some or all of this write into
2312                                  * one or more of the existing clusters... if
2313                                  * any portion of the write remains, start a
2314                                  * new cluster
2315                                  */
2316                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2317                                         /*
2318                                          * the current write starts at or after the current cluster
2319                                          */
2320                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2321                                                 /*
2322                                                  * we have a write that fits entirely
2323                                                  * within the existing cluster limits
2324                                                  */
2325                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2326                                                         /*
2327                                                          * update our idea of where the cluster ends
2328                                                          */
2329                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2330                                                 break;
2331                                         }
2332                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2333                                                 /*
2334                                                  * we have a write that starts in the middle of the current cluster
2335                                                  * but extends beyond the cluster's limit... we know this because
2336                                                  * of the previous checks
2337                                                  * we'll extend the current cluster to the max
2338                                                  * and update the b_addr for the current write to reflect that
2339                                                  * the head of it was absorbed into this cluster...
2340                                                  * note that we'll always have a leftover tail in this case since
2341                                                  * full absorbtion would have occurred in the clause above
2342                                                  */
2343                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2344
2345                                                 if (upl_size) {
2346                                                         daddr64_t start_pg_in_upl;
2347
2348                                                         start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2349
2350                                                         if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2351                                                                 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2352
2353                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2354                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2355                                                                 upl_f_offset += intersection;
2356                                                                 upl_offset   += intersection;
2357                                                                 upl_size     -= intersection;
2358                                                         }
2359                                                 }
2360                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2361                                         }
2362                                         /*
2363                                          * we come here for the case where the current write starts
2364                                          * beyond the limit of the existing cluster or we have a leftover
2365                                          * tail after a partial absorbtion
2366                                          *
2367                                          * in either case, we'll check the remaining clusters before
2368                                          * starting a new one
2369                                          */
2370                                 } else {
2371                                         /*
2372                                          * the current write starts in front of the cluster we're currently considering
2373                                          */
2374                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2375                                                 /*
2376                                                  * we can just merge the new request into
2377                                                  * this cluster and leave it in the cache
2378                                                  * since the resulting cluster is still
2379                                                  * less than the maximum allowable size
2380                                                  */
2381                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2382
2383                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2384                                                         /*
2385                                                          * the current write completely
2386                                                          * envelops the existing cluster and since
2387                                                          * each write is limited to at most MAX_UPL_TRANSFER bytes
2388                                                          * we can just use the start and last blocknos of the write
2389                                                          * to generate the cluster limits
2390                                                          */
2391                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2392                                                 }
2393                                                 break;
2394                                         }
2395
2396                                         /*
2397                                          * if we were to combine this write with the current cluster
2398                                          * we would exceed the cluster size limit.... so,
2399                                          * let's see if there's any overlap of the new I/O with
2400                                          * the cluster we're currently considering... in fact, we'll
2401                                          * stretch the cluster out to it's full limit and see if we
2402                                          * get an intersection with the current write
2403                                          *
2404                                          */
2405                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2406                                                 /*
2407                                                  * the current write extends into the proposed cluster
2408                                                  * clip the length of the current write after first combining it's
2409                                                  * tail with the newly shaped cluster
2410                                                  */
2411                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2412
2413                                                 if (upl_size) {
2414                                                         intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2415
2416                                                         if (intersection > upl_size)
2417                                                                 /*
2418                                                                  * because the current write may consist of a number of pages found in the cache
2419                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2420                                                                  * the size of the UPL that is also part of this write
2421                                                                  */
2422                                                                 intersection = upl_size;
2423
2424                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2425                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2426                                                         upl_size -= intersection;
2427                                                 }
2428                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2429                                         }
2430                                         /*
2431                                          * if we get here, there was no way to merge
2432                                          * any portion of this write with this cluster
2433                                          * or we could only merge part of it which
2434                                          * will leave a tail...
2435                                          * we'll check the remaining clusters before starting a new one
2436                                          */
2437                                 }
2438                         }
2439                         if (cl_index < wbp->cl_number)
2440                                 /*
2441                                  * we found an existing cluster(s) that we
2442                                  * could entirely merge this I/O into
2443                                  */
2444                                 goto delay_io;
2445
2446                         if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2447                                 /*
2448                                  * we didn't find an existing cluster to
2449                                  * merge into, but there's room to start
2450                                  * a new one
2451                                  */
2452                                 goto start_new_cluster;
2453
2454                         /*
2455                          * no exisitng cluster to merge with and no
2456                          * room to start a new one... we'll try
2457                          * pushing one of the existing ones... if none of
2458                          * them are able to be pushed, we'll switch
2459                          * to the sparse cluster mechanism
2460                          * cluster_try_push updates cl_number to the
2461                          * number of remaining clusters... and
2462                          * returns the number of currently unused clusters
2463                          */
2464                         int ret_cluster_try_push = 0;
2465                         /* if writes are not deferred, call cluster push immediately */
2466                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2467                                 if (flags & IO_NOCACHE)
2468                                         can_delay = 0;
2469                                 else
2470                                         can_delay = 1;
2471
2472                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2473                         }
2474
2475                         /* execute following regardless writes are deferred or not */
2476                         if (ret_cluster_try_push == 0) {
2477                                 /*
2478                                  * no more room in the normal cluster mechanism
2479                                  * so let's switch to the more expansive but expensive
2480                                  * sparse mechanism....
2481                                  * first, we need to release the upl if we hold one
2482                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2483                                  * and may span 2 separate buckets there... if they do and
2484                                  * we happen to have to flush a bucket to make room and it intersects
2485                                  * this upl, a deadlock may result on page BUSY
2486                                  */
2487                                 if (upl_size)
2488                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2489                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2490
2491                                 sparse_cluster_switch(wbp, vp, newEOF);
2492                                 sparse_cluster_add(wbp, vp, &cl, newEOF);
2493
2494                                 lck_mtx_unlock(&wbp->cl_lockw);
2495
2496                                 continue;
2497                         }
2498                         /*
2499                          * we pushed one cluster successfully, so we must be sequentially writing this file
2500                          * otherwise, we would have failed and fallen into the sparse cluster support
2501                          * so let's take the opportunity to push out additional clusters as long as we
2502                          * remain below the throttle... this will give us better I/O locality if we're
2503                          * in a copy loop (i.e.  we won't jump back and forth between the read and write points
2504                          * however, we don't want to push so much out that the write throttle kicks in and
2505                          * hangs this thread up until some of the I/O completes...
2506                          */
2507                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2508                                 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2509                                         cluster_try_push(wbp, vp, newEOF, 0, 0);
2510                         }
2511
2512 start_new_cluster:
2513                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2514                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2515
2516                         if (flags & IO_NOCACHE)
2517                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2518                         else
2519                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2520                         wbp->cl_number++;
2521 delay_io:
2522                         if (upl_size)
2523                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2524                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2525
2526                         lck_mtx_unlock(&wbp->cl_lockw);
2527
2528                         continue;
2529 issue_io:
2530                         /*
2531                          * we don't hold the vnode lock at this point
2532                          *
2533                          * because we had to ask for a UPL that provides currenty non-present pages, the
2534                          * UPL has been automatically set to clear the dirty flags (both software and hardware)
2535                          * upon committing it... this is not the behavior we want since it's possible for
2536                          * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2537                          * in order to maintain some semblance of coherency with mapped writes
2538                          * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2539                          * so that we correctly deal with a change in state of the hardware modify bit...
2540                          * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2541                          * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2542                          * responsible for generating the correct sized I/O(s)
2543                          */
2544                         ubc_upl_commit_range(upl, 0, upl_size,
2545                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2546
2547                         cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2548
2549                         retval = cluster_push_x(vp, &cl, newEOF, flags);
2550                 }
2551         }
2552         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2553                      retval, 0, io_resid, 0, 0);
2554
2555         return (retval);
2556 }
2557
2558 int
2559 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2560 {
2561         int           prev_resid;
2562         u_int         clip_size;
2563         off_t         max_io_size;
2564         int           upl_size;
2565         int           upl_flags;
2566         upl_t         upl;
2567         int           retval = 0;
2568         int           flags;
2569
2570         flags = xflags;
2571
2572         if (vp->v_flag & VNOCACHE_DATA)
2573                 flags |= IO_NOCACHE;
2574         if (vp->v_flag & VRAOFF)
2575                 flags |= IO_RAOFF;
2576
2577         if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2578                 /*
2579                  * go do a read through the cache if one of the following is true....
2580                  *   NOCACHE is not true
2581                  *   the uio request doesn't target USERSPACE
2582                  */
2583                 return (cluster_read_x(vp, uio, filesize, flags));
2584         }
2585
2586 #if LP64_DEBUG
2587         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2588                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2589         }
2590 #endif /* LP64_DEBUG */
2591
2592         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2593                 user_size_t     iov_len;
2594                 user_addr_t     iov_base;
2595
2596                 /*
2597                  * we know we have a resid, so this is safe
2598                  * skip over any emtpy vectors
2599                  */
2600                 uio_update(uio, (user_size_t)0);
2601
2602                 iov_len  = uio_curriovlen(uio);
2603                 iov_base = uio_curriovbase(uio);
2604
2605                 upl_size  = PAGE_SIZE;
2606                 upl_flags = UPL_QUERY_OBJECT_TYPE;
2607
2608                 // LP64todo - fix this!
2609                 if ((vm_map_get_upl(current_map(),
2610                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2611                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2612                         /*
2613                          * the user app must have passed in an invalid address
2614                          */
2615                         return (EFAULT);
2616                 }
2617
2618                 /*
2619                  * We check every vector target but if it is physically
2620                  * contiguous space, we skip the sanity checks.
2621                  */
2622                 if (upl_flags & UPL_PHYS_CONTIG) {
2623                         retval = cluster_phys_read(vp, uio, filesize);
2624                 }
2625                 else if (uio_resid(uio) < PAGE_SIZE) {
2626                         /*
2627                          * we're here because we're don't have a physically contiguous target buffer
2628                          * go do a read through the cache if
2629                          *   the total xfer size is less than a page...
2630                          */
2631                         return (cluster_read_x(vp, uio, filesize, flags));
2632                 }
2633                 // LP64todo - fix this!
2634                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2635                        if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2636                                /*
2637                                 * Bring the file offset read up to a pagesize boundary
2638                                 * this will also bring the base address to a page boundary
2639                                 * since they both are currently on the same offset within a page
2640                                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2641                                 * so the computed clip_size must always be less than the current uio_resid
2642                                 */
2643                                clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2644
2645                                /*
2646                                 * Fake the resid going into the cluster_read_x call
2647                                 * and restore it on the way out.
2648                                 */
2649                                prev_resid = uio_resid(uio);
2650                                // LP64todo - fix this
2651                                uio_setresid(uio, clip_size);
2652
2653                                retval = cluster_read_x(vp, uio, filesize, flags);
2654
2655                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2656                        } else {
2657                                /*
2658                                 * can't get both the file offset and the buffer offset aligned to a page boundary
2659                                 * so fire an I/O through the cache for this entire vector
2660                                 */
2661                                // LP64todo - fix this!
2662                                clip_size = iov_len;
2663                                prev_resid = uio_resid(uio);
2664                                uio_setresid(uio, clip_size);
2665
2666                                retval = cluster_read_x(vp, uio, filesize, flags);
2667
2668                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2669                        }
2670                 } else {
2671                         /*
2672                          * If we come in here, we know the offset into
2673                          * the file is on a pagesize boundary
2674                          */
2675                         max_io_size = filesize - uio->uio_offset;
2676                         // LP64todo - fix this
2677                         clip_size = uio_resid(uio);
2678                         if (iov_len < clip_size)
2679                                 clip_size = iov_len;
2680                         if (max_io_size < clip_size)
2681                                 clip_size = (int)max_io_size;
2682
2683                         if (clip_size < PAGE_SIZE) {
2684                                 /*
2685                                  * Take care of the tail end of the read in this vector.
2686                                  */
2687                                 // LP64todo - fix this
2688                                 prev_resid = uio_resid(uio);
2689                                 uio_setresid(uio, clip_size);
2690
2691                                 retval = cluster_read_x(vp, uio, filesize, flags);
2692
2693                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2694                         } else {
2695                                 /* round clip_size down to a multiple of pagesize */
2696                                 clip_size = clip_size & ~(PAGE_MASK);
2697                                 // LP64todo - fix this
2698                                 prev_resid = uio_resid(uio);
2699                                 uio_setresid(uio, clip_size);
2700
2701                                 retval = cluster_nocopy_read(vp, uio, filesize);
2702
2703                                 if ((retval==0) && uio_resid(uio))
2704                                         retval = cluster_read_x(vp, uio, filesize, flags);
2705
2706                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2707                         }
2708                 } /* end else */
2709         } /* end while */
2710
2711         return(retval);
2712 }
2713
2714 static int
2715 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2716 {
2717         upl_page_info_t *pl;
2718         upl_t            upl;
2719         vm_offset_t      upl_offset;
2720         int              upl_size;
2721         off_t            upl_f_offset;
2722         int              start_offset;
2723         int              start_pg;
2724         int              last_pg;
2725         int              uio_last = 0;
2726         int              pages_in_upl;
2727         off_t            max_size;
2728         off_t            last_ioread_offset;
2729         off_t            last_request_offset;
2730         u_int            size_of_prefetch;
2731         u_int            io_size;
2732         kern_return_t    kret;
2733         int              error  = 0;
2734         int              retval = 0;
2735         u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2736         u_int            rd_ahead_enabled = 1;
2737         u_int            prefetch_enabled = 1;
2738         struct cl_readahead *   rap;
2739         struct clios            iostate;
2740         struct cl_extent        extent;
2741
2742         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2743                      (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2744
2745         // LP64todo - fix this
2746         last_request_offset = uio->uio_offset + uio_resid(uio);
2747
2748         if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2749                 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2750                 rd_ahead_enabled = 0;
2751                 rap = NULL;
2752         } else {
2753                 if (cluster_hard_throttle_on(vp)) {
2754                         rd_ahead_enabled = 0;
2755                         prefetch_enabled = 0;
2756
2757                         max_rd_size = HARD_THROTTLE_MAXSIZE;
2758                 }
2759                 if ((rap = cluster_get_rap(vp)) == NULL)
2760                         rd_ahead_enabled = 0;
2761         }
2762         if (last_request_offset > filesize)
2763                 last_request_offset = filesize;
2764         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2765         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2766
2767         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2768                 /*
2769                  * determine if we already have a read-ahead in the pipe courtesy of the
2770                  * last read systemcall that was issued...
2771                  * if so, pick up it's extent to determine where we should start
2772                  * with respect to any read-ahead that might be necessary to
2773                  * garner all the data needed to complete this read systemcall
2774                  */
2775                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2776
2777                 if (last_ioread_offset < uio->uio_offset)
2778                         last_ioread_offset = (off_t)0;
2779                 else if (last_ioread_offset > last_request_offset)
2780                         last_ioread_offset = last_request_offset;
2781         } else
2782                 last_ioread_offset = (off_t)0;
2783
2784         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2785                 /*
2786                  * compute the size of the upl needed to encompass
2787                  * the requested read... limit each call to cluster_io
2788                  * to the maximum UPL size... cluster_io will clip if
2789                  * this exceeds the maximum io_size for the device,
2790                  * make sure to account for
2791                  * a starting offset that's not page aligned
2792                  */
2793                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2794                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2795                 max_size     = filesize - uio->uio_offset;
2796
2797         // LP64todo - fix this!
2798                 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2799                         io_size = uio_resid(uio);
2800                 else
2801                         io_size = max_size;
2802
2803                 if (!(flags & IO_NOCACHE)) {
2804
2805                         while (io_size) {
2806                                 u_int io_resid;
2807                                 u_int io_requested;
2808
2809                                 /*
2810                                  * if we keep finding the pages we need already in the cache, then
2811                                  * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2812                                  * to determine that we have all the pages we need... once we miss in
2813                                  * the cache and have issued an I/O, than we'll assume that we're likely
2814                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
2815                                  */
2816                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2817                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2818                                                 /*
2819                                                  * we've already issued I/O for this request and
2820                                                  * there's still work to do and
2821                                                  * our prefetch stream is running dry, so issue a
2822                                                  * pre-fetch I/O... the I/O latency will overlap
2823                                                  * with the copying of the data
2824                                                  */
2825                                                 if (size_of_prefetch > max_rd_size)
2826                                                         size_of_prefetch = max_rd_size;
2827
2828                                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2829
2830                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2831
2832                                                 if (last_ioread_offset > last_request_offset)
2833                                                         last_ioread_offset = last_request_offset;
2834                                         }
2835                                 }
2836                                 /*
2837                                  * limit the size of the copy we're about to do so that
2838                                  * we can notice that our I/O pipe is running dry and
2839                                  * get the next I/O issued before it does go dry
2840                                  */
2841                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2842                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2843                                 else
2844                                         io_resid = io_size;
2845
2846                                 io_requested = io_resid;
2847
2848                                 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2849
2850                                 io_size -= (io_requested - io_resid);
2851
2852                                 if (retval || io_resid)
2853                                         /*
2854                                          * if we run into a real error or
2855                                          * a page that is not in the cache
2856                                          * we need to leave streaming mode
2857                                          */
2858                                         break;
2859
2860                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2861                                         /*
2862                                          * we're already finished the I/O for this read request
2863                                          * let's see if we should do a read-ahead
2864                                          */
2865                                         cluster_rd_ahead(vp, &extent, filesize, rap);
2866                                 }
2867                         }
2868                         if (retval)
2869                                 break;
2870                         if (io_size == 0) {
2871                                 if (rap != NULL) {
2872                                         if (extent.e_addr < rap->cl_lastr)
2873                                                 rap->cl_maxra = 0;
2874                                         rap->cl_lastr = extent.e_addr;
2875                                 }
2876                                 break;
2877                         }
2878                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2879                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
2880                         max_size     = filesize - uio->uio_offset;
2881                 }
2882                 if (io_size > max_rd_size)
2883                         io_size = max_rd_size;
2884
2885                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2886
2887                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2888                         upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2889                 pages_in_upl = upl_size / PAGE_SIZE;
2890
2891                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2892                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2893
2894                 kret = ubc_create_upl(vp,
2895                                       upl_f_offset,
2896                                       upl_size,
2897                                       &upl,
2898                                       &pl,
2899                                       UPL_SET_LITE);
2900                 if (kret != KERN_SUCCESS)
2901                         panic("cluster_read: failed to get pagelist");
2902
2903                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2904                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2905
2906                 /*
2907                  * scan from the beginning of the upl looking for the first
2908                  * non-valid page.... this will become the first page in
2909                  * the request we're going to make to 'cluster_io'... if all
2910                  * of the pages are valid, we won't call through to 'cluster_io'
2911                  */
2912                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2913                         if (!upl_valid_page(pl, start_pg))
2914                                 break;
2915                 }
2916
2917                 /*
2918                  * scan from the starting invalid page looking for a valid
2919                  * page before the end of the upl is reached, if we
2920                  * find one, then it will be the last page of the request to
2921                  * 'cluster_io'
2922                  */
2923                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2924                         if (upl_valid_page(pl, last_pg))
2925                                 break;
2926                 }
2927                 iostate.io_completed = 0;
2928                 iostate.io_issued = 0;
2929                 iostate.io_error = 0;
2930                 iostate.io_wanted = 0;
2931
2932                 if (start_pg < last_pg) {
2933                         /*
2934                          * we found a range of 'invalid' pages that must be filled
2935                          * if the last page in this range is the last page of the file
2936                          * we may have to clip the size of it to keep from reading past
2937                          * the end of the last physical block associated with the file
2938                          */
2939                         upl_offset = start_pg * PAGE_SIZE;
2940                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2941
2942                         if ((upl_f_offset + upl_offset + io_size) > filesize)
2943                                 io_size = filesize - (upl_f_offset + upl_offset);
2944
2945                         /*
2946                          * issue an asynchronous read to cluster_io
2947                          */
2948
2949                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2950                                            io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2951                 }
2952                 if (error == 0) {
2953                         /*
2954                          * if the read completed successfully, or there was no I/O request
2955                          * issued, than copy the data into user land via 'cluster_upl_copy_data'
2956                          * we'll first add on any 'valid'
2957                          * pages that were present in the upl when we acquired it.
2958                          */
2959                         u_int  val_size;
2960
2961                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2962                                 if (!upl_valid_page(pl, uio_last))
2963                                         break;
2964                         }
2965                         /*
2966                          * compute size to transfer this round,  if uio->uio_resid is
2967                          * still non-zero after this attempt, we'll loop around and
2968                          * set up for another I/O.
2969                          */
2970                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2971
2972                         if (val_size > max_size)
2973                                 val_size = max_size;
2974
2975                         if (val_size > uio_resid(uio))
2976         // LP64todo - fix this
2977                                 val_size = uio_resid(uio);
2978
2979                         if (last_ioread_offset == 0)
2980                                 last_ioread_offset = uio->uio_offset + val_size;
2981
2982                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2983                                 /*
2984                                  * if there's still I/O left to do for this request, and...
2985                                  * we're not in hard throttle mode, then issue a
2986                                  * pre-fetch I/O... the I/O latency will overlap
2987                                  * with the copying of the data
2988                                  */
2989                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2990
2991                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2992
2993                                 if (last_ioread_offset > last_request_offset)
2994                                         last_ioread_offset = last_request_offset;
2995
2996                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
2997                                 /*
2998                                  * this transfer will finish this request, so...
2999                                  * let's try to read ahead if we're in
3000                                  * a sequential access pattern and we haven't
3001                                  * explicitly disabled it
3002                                  */
3003                                 if (rd_ahead_enabled)
3004                                         cluster_rd_ahead(vp, &extent, filesize, rap);
3005
3006                                 if (rap != NULL) {
3007                                         if (extent.e_addr < rap->cl_lastr)
3008                                                 rap->cl_maxra = 0;
3009                                         rap->cl_lastr = extent.e_addr;
3010                                 }
3011                         }
3012                         lck_mtx_lock(cl_mtxp);
3013
3014                         while (iostate.io_issued != iostate.io_completed) {
3015                                 iostate.io_wanted = 1;
3016                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3017                         }
3018                         lck_mtx_unlock(cl_mtxp);
3019
3020                         if (iostate.io_error)
3021                                 error = iostate.io_error;
3022                         else
3023                                 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3024                 }
3025                 if (start_pg < last_pg) {
3026                         /*
3027                          * compute the range of pages that we actually issued an I/O for
3028                          * and either commit them as valid if the I/O succeeded
3029                          * or abort them if the I/O failed
3030                          */
3031                         io_size = (last_pg - start_pg) * PAGE_SIZE;
3032
3033                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3034                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3035
3036                         if (error || (flags & IO_NOCACHE))
3037                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3038                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3039                         else
3040                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3041                                                      UPL_COMMIT_CLEAR_DIRTY |
3042                                                      UPL_COMMIT_FREE_ON_EMPTY |
3043                                                      UPL_COMMIT_INACTIVATE);
3044
3045                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3046                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3047                 }
3048                 if ((last_pg - start_pg) < pages_in_upl) {
3049                         int cur_pg;
3050                         int commit_flags;
3051
3052                         /*
3053                          * the set of pages that we issued an I/O for did not encompass
3054                          * the entire upl... so just release these without modifying
3055                          * their state
3056                          */
3057                         if (error)
3058                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3059                         else {
3060                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3061                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3062
3063                                 if (start_pg) {
3064                                         /*
3065                                          * we found some already valid pages at the beginning of
3066                                          * the upl commit these back to the inactive list with
3067                                          * reference cleared
3068                                          */
3069                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3070                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3071                                                                    | UPL_COMMIT_INACTIVATE;
3072
3073                                                 if (upl_dirty_page(pl, cur_pg))
3074                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3075
3076                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3077                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3078                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3079                                                 else
3080                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3081                                                                 PAGE_SIZE, commit_flags);
3082                                         }
3083                                 }
3084                                 if (last_pg < uio_last) {
3085                                         /*
3086                                          * we found some already valid pages immediately after the
3087                                          * pages we issued I/O for, commit these back to the
3088                                          * inactive list with reference cleared
3089                                          */
3090                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3091                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
3092                                                                                 | UPL_COMMIT_INACTIVATE;
3093
3094                                                 if (upl_dirty_page(pl, cur_pg))
3095                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3096
3097                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3098                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3099                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3100                                                 else
3101                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3102                                                                 PAGE_SIZE, commit_flags);
3103                                         }
3104                                 }
3105                                 if (uio_last < pages_in_upl) {
3106                                         /*
3107                                          * there were some invalid pages beyond the valid pages
3108                                          * that we didn't issue an I/O for, just release them
3109                                          * unchanged
3110                                          */
3111                                         ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3112                                                             (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3113                                 }
3114
3115                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3116                                         (int)upl, -1, -1, 0, 0);
3117                         }
3118                 }
3119                 if (retval == 0)
3120                         retval = error;
3121
3122                 if ( uio_resid(uio) ) {
3123                         if (cluster_hard_throttle_on(vp)) {
3124                                 rd_ahead_enabled = 0;
3125                                 prefetch_enabled = 0;
3126
3127                                 max_rd_size = HARD_THROTTLE_MAXSIZE;
3128                         } else {
3129                                 if (rap != NULL)
3130                                         rd_ahead_enabled = 1;
3131                                 prefetch_enabled = 1;
3132
3133                                 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3134                         }
3135                 }
3136         }
3137         if (rap != NULL) {
3138                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3139                              (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3140
3141                 lck_mtx_unlock(&rap->cl_lockr);
3142         } else {
3143                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3144                              (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3145         }
3146
3147         return (retval);
3148 }
3149
3150
3151 static int
3152 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3153 {
3154         upl_t            upl;
3155         upl_page_info_t  *pl;
3156         vm_offset_t      upl_offset;
3157         off_t            max_io_size;
3158         int              io_size;
3159         int              upl_size;
3160         int              upl_needed_size;
3161         int              pages_in_pl;
3162         int              upl_flags;
3163         kern_return_t    kret;
3164         int              i;
3165         int              force_data_sync;
3166         int              retval = 0;
3167         int              no_zero_fill = 0;
3168         int              abort_flag = 0;
3169         struct clios     iostate;
3170         u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3171         u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3172
3173
3174         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3175                      (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3176
3177         /*
3178          * When we enter this routine, we know
3179          *  -- the offset into the file is on a pagesize boundary
3180          *  -- the resid is a page multiple
3181          *  -- the resid will not exceed iov_len
3182          */
3183
3184         iostate.io_completed = 0;
3185         iostate.io_issued = 0;
3186         iostate.io_error = 0;
3187         iostate.io_wanted = 0;
3188
3189         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3190                 user_addr_t     iov_base;
3191
3192                 if (cluster_hard_throttle_on(vp)) {
3193                         max_rd_size  = HARD_THROTTLE_MAXSIZE;
3194                         max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3195                 } else {
3196                         max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3197                         max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
3198                 }
3199                 max_io_size = filesize - uio->uio_offset;
3200
3201         // LP64todo - fix this
3202                 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3203                         io_size = max_io_size;
3204                 else
3205                         io_size = uio_resid(uio);
3206
3207                 /*
3208                  * First look for pages already in the cache
3209                  * and move them to user space.
3210                  */
3211                 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3212
3213                 if (retval) {
3214                         /*
3215                          * we may have already spun some portion of this request
3216                          * off as async requests... we need to wait for the I/O
3217                          * to complete before returning
3218                          */
3219                         goto wait_for_reads;
3220                 }
3221                 /*
3222                  * If we are already finished with this read, then return
3223                  */
3224                 if (io_size == 0) {
3225                         /*
3226                          * we may have already spun some portion of this request
3227                          * off as async requests... we need to wait for the I/O
3228                          * to complete before returning
3229                          */
3230                         goto wait_for_reads;
3231                 }
3232                 max_io_size = io_size;
3233
3234                 if (max_io_size > max_rd_size)
3235                         max_io_size = max_rd_size;
3236
3237                 io_size = 0;
3238
3239                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3240
3241                 if (io_size == 0)
3242                         /*
3243                          * we may have already spun some portion of this request
3244                          * off as async requests... we need to wait for the I/O
3245                          * to complete before returning
3246                          */
3247                         goto wait_for_reads;
3248
3249                 iov_base = uio_curriovbase(uio);
3250
3251                 // LP64todo - fix this!
3252                 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3253                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3254
3255                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3256                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3257
3258                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3259                         no_zero_fill = 1;
3260                         abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3261                 } else {
3262                         no_zero_fill = 0;
3263                         abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3264                 }
3265                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3266                         pages_in_pl = 0;
3267                         upl_size = upl_needed_size;
3268                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3269
3270                         if (no_zero_fill)
3271                                 upl_flags |= UPL_NOZEROFILL;
3272                         if (force_data_sync)
3273                                 upl_flags |= UPL_FORCE_DATA_SYNC;
3274
3275                         // LP64todo - fix this!
3276                         kret = vm_map_create_upl(current_map(),
3277                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3278                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3279
3280                         if (kret != KERN_SUCCESS) {
3281                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3282                                              (int)upl_offset, upl_size, io_size, kret, 0);
3283                                 /*
3284                                  * cluster_nocopy_read: failed to get pagelist
3285                                  *
3286                                  * we may have already spun some portion of this request
3287                                  * off as async requests... we need to wait for the I/O
3288                                  * to complete before returning
3289                                  */
3290                                 goto wait_for_reads;
3291                         }
3292                         pages_in_pl = upl_size / PAGE_SIZE;
3293                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3294
3295                         for (i = 0; i < pages_in_pl; i++) {
3296                                 if (!upl_valid_page(pl, i))
3297                                         break;
3298                         }
3299                         if (i == pages_in_pl)
3300                                 break;
3301
3302                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3303                 }
3304                 if (force_data_sync >= 3) {
3305                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3306                                      (int)upl_offset, upl_size, io_size, kret, 0);
3307
3308                         goto wait_for_reads;
3309                 }
3310                 /*
3311                  * Consider the possibility that upl_size wasn't satisfied.
3312                  */
3313                 if (upl_size != upl_needed_size)
3314                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3315
3316                 if (io_size == 0) {
3317                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3318                         goto wait_for_reads;
3319                 }
3320                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3321                              (int)upl_offset, upl_size, io_size, kret, 0);
3322
3323                 /*
3324                  * request asynchronously so that we can overlap
3325                  * the preparation of the next I/O
3326                  * if there are already too many outstanding reads
3327                  * wait until some have completed before issuing the next read
3328                  */
3329                 lck_mtx_lock(cl_mtxp);
3330
3331                 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3332                         iostate.io_wanted = 1;
3333                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3334                 }
3335                 lck_mtx_unlock(cl_mtxp);
3336
3337                 if (iostate.io_error) {
3338                         /*
3339                          * one of the earlier reads we issued ran into a hard error
3340                          * don't issue any more reads, cleanup the UPL
3341                          * that was just created but not used, then
3342                          * go wait for any other reads to complete before
3343                          * returning the error to the caller
3344                          */
3345                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3346
3347                         goto wait_for_reads;
3348                 }
3349                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3350                              (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3351
3352                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3353                                    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3354                                    (buf_t)NULL, &iostate);
3355
3356                 /*
3357                  * update the uio structure
3358                  */
3359                 uio_update(uio, (user_size_t)io_size);
3360
3361                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3362                              (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3363
3364         } /* end while */
3365
3366 wait_for_reads:
3367         /*
3368          * make sure all async reads that are part of this stream
3369          * have completed before we return
3370          */
3371         lck_mtx_lock(cl_mtxp);
3372
3373         while (iostate.io_issued != iostate.io_completed) {
3374                 iostate.io_wanted = 1;
3375                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3376         }
3377         lck_mtx_unlock(cl_mtxp);
3378
3379         if (iostate.io_error)
3380                 retval = iostate.io_error;
3381
3382         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3383                      (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3384
3385         return (retval);
3386 }
3387
3388
3389 static int
3390 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3391 {
3392         upl_page_info_t *pl;
3393         upl_t            upl;
3394         vm_offset_t      upl_offset;
3395         addr64_t         dst_paddr;
3396         off_t            max_size;
3397         int              io_size;
3398         user_size_t      iov_len;
3399         user_addr_t      iov_base;
3400         int              tail_size;
3401         int              upl_size;
3402         int              upl_needed_size;
3403         int              pages_in_pl;
3404         int              upl_flags;
3405         kern_return_t    kret;
3406         struct clios     iostate;
3407         int              error;
3408         int              devblocksize;
3409
3410         devblocksize = vp->v_mount->mnt_devblocksize;
3411         /*
3412          * When we enter this routine, we know
3413          *  -- the resid will not exceed iov_len
3414          *  -- the target address is physically contiguous
3415          */
3416
3417 #if LP64_DEBUG
3418         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3419                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3420         }
3421 #endif /* LP64_DEBUG */
3422
3423         iov_len = uio_curriovlen(uio);
3424         iov_base = uio_curriovbase(uio);
3425
3426         max_size = filesize - uio->uio_offset;
3427
3428         // LP64todo - fix this!
3429         if (max_size < 0 || (u_int64_t)max_size > iov_len)
3430                 io_size = iov_len;
3431         else
3432                 io_size = max_size;
3433
3434         // LP64todo - fix this!
3435         upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3436         upl_needed_size = upl_offset + io_size;
3437
3438         error       = 0;
3439         pages_in_pl = 0;
3440         upl_size = upl_needed_size;
3441         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3442
3443         kret = vm_map_get_upl(current_map(),
3444                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3445                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3446
3447         if (kret != KERN_SUCCESS) {
3448                 /*
3449                  * cluster_phys_read: failed to get pagelist
3450                  */
3451                 return(EINVAL);
3452         }
3453         if (upl_size < upl_needed_size) {
3454                 /*
3455                  * The upl_size wasn't satisfied.
3456                  */
3457                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3458
3459                 return(EINVAL);
3460         }
3461         pl = ubc_upl_pageinfo(upl);
3462
3463         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3464
3465         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3466                 int   head_size;
3467
3468                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3469
3470                 if (head_size > io_size)
3471                         head_size = io_size;
3472
3473                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3474
3475                 if (error) {
3476                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3477
3478                         return(EINVAL);
3479                 }
3480                 upl_offset += head_size;
3481                 dst_paddr  += head_size;
3482                 io_size    -= head_size;
3483         }
3484         tail_size = io_size & (devblocksize - 1);
3485         io_size  -= tail_size;
3486
3487         iostate.io_completed = 0;
3488         iostate.io_issued = 0;
3489         iostate.io_error = 0;
3490         iostate.io_wanted = 0;
3491
3492         while (io_size && error == 0) {
3493                 int  xsize;
3494
3495                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3496                         xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3497                 else
3498                         xsize = io_size;
3499                 /*
3500                  * request asynchronously so that we can overlap
3501                  * the preparation of the next I/O... we'll do
3502                  * the commit after all the I/O has completed
3503                  * since its all issued against the same UPL
3504                  * if there are already too many outstanding reads
3505                  * wait until some have completed before issuing the next
3506                  */
3507                 lck_mtx_lock(cl_mtxp);
3508
3509                 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3510                         iostate.io_wanted = 1;
3511                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3512                 }
3513                 lck_mtx_unlock(cl_mtxp);
3514
3515                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3516                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3517                                    (buf_t)NULL, &iostate);
3518                 /*
3519                  * The cluster_io read was issued successfully,
3520                  * update the uio structure
3521                  */
3522                 if (error == 0) {
3523                         uio_update(uio, (user_size_t)xsize);
3524
3525                         dst_paddr  += xsize;
3526                         upl_offset += xsize;
3527                         io_size    -= xsize;
3528                 }
3529         }
3530         /*
3531          * make sure all async reads that are part of this stream
3532          * have completed before we proceed
3533          */
3534         lck_mtx_lock(cl_mtxp);
3535
3536         while (iostate.io_issued != iostate.io_completed) {
3537                 iostate.io_wanted = 1;
3538                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3539         }
3540         lck_mtx_unlock(cl_mtxp);
3541
3542         if (iostate.io_error)
3543                 error = iostate.io_error;
3544
3545         if (error == 0 && tail_size)
3546                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3547
3548         /*
3549          * just release our hold on the physically contiguous
3550          * region without changing any state
3551          */
3552         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3553
3554         return (error);
3555 }
3556
3557
3558 /*
3559  * generate advisory I/O's in the largest chunks possible
3560  * the completed pages will be released into the VM cache
3561  */
3562 int
3563 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3564 {
3565         upl_page_info_t *pl;
3566         upl_t            upl;
3567         vm_offset_t      upl_offset;
3568         int              upl_size;
3569         off_t            upl_f_offset;
3570         int              start_offset;
3571         int              start_pg;
3572         int              last_pg;
3573         int              pages_in_upl;
3574         off_t            max_size;
3575         int              io_size;
3576         kern_return_t    kret;
3577         int              retval = 0;
3578         int              issued_io;
3579         int              skip_range;
3580
3581         if ( !UBCINFOEXISTS(vp))
3582                 return(EINVAL);
3583
3584         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3585                      (int)f_offset, resid, (int)filesize, 0, 0);
3586
3587         while (resid && f_offset < filesize && retval == 0) {
3588                 /*
3589                  * compute the size of the upl needed to encompass
3590                  * the requested read... limit each call to cluster_io
3591                  * to the maximum UPL size... cluster_io will clip if
3592                  * this exceeds the maximum io_size for the device,
3593                  * make sure to account for
3594                  * a starting offset that's not page aligned
3595                  */
3596                 start_offset = (int)(f_offset & PAGE_MASK_64);
3597                 upl_f_offset = f_offset - (off_t)start_offset;
3598                 max_size     = filesize - f_offset;
3599
3600                 if (resid < max_size)
3601                         io_size = resid;
3602                 else
3603                         io_size = max_size;
3604
3605                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3606                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3607                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3608
3609                 skip_range = 0;
3610                 /*
3611                  * return the number of contiguously present pages in the cache
3612                  * starting at upl_f_offset within the file
3613                  */
3614                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3615
3616                 if (skip_range) {
3617                         /*
3618                          * skip over pages already present in the cache
3619                          */
3620                         io_size = skip_range - start_offset;
3621
3622                         f_offset += io_size;
3623                         resid    -= io_size;
3624
3625                         if (skip_range == upl_size)
3626                                 continue;
3627                         /*
3628                          * have to issue some real I/O
3629                          * at this point, we know it's starting on a page boundary
3630                          * because we've skipped over at least the first page in the request
3631                          */
3632                         start_offset = 0;
3633                         upl_f_offset += skip_range;
3634                         upl_size     -= skip_range;
3635                 }
3636                 pages_in_upl = upl_size / PAGE_SIZE;
3637
3638                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3639                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3640
3641                 kret = ubc_create_upl(vp,
3642                                       upl_f_offset,
3643                                       upl_size,
3644                                       &upl,
3645                                       &pl,
3646                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3647                 if (kret != KERN_SUCCESS)
3648                         return(retval);
3649                 issued_io = 0;
3650
3651                 /*
3652                  * before we start marching forward, we must make sure we end on
3653                  * a present page, otherwise we will be working with a freed
3654                  * upl
3655                  */
3656                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3657                         if (upl_page_present(pl, last_pg))
3658                                 break;
3659                 }
3660                 pages_in_upl = last_pg + 1;
3661
3662
3663                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3664                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3665
3666
3667                 for (last_pg = 0; last_pg < pages_in_upl; ) {
3668                         /*
3669                          * scan from the beginning of the upl looking for the first
3670                          * page that is present.... this will become the first page in
3671                          * the request we're going to make to 'cluster_io'... if all
3672                          * of the pages are absent, we won't call through to 'cluster_io'
3673                          */
3674                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3675                                 if (upl_page_present(pl, start_pg))
3676                                         break;
3677                         }
3678
3679                         /*
3680                          * scan from the starting present page looking for an absent
3681                          * page before the end of the upl is reached, if we
3682                          * find one, then it will terminate the range of pages being
3683                          * presented to 'cluster_io'
3684                          */
3685                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3686                                 if (!upl_page_present(pl, last_pg))
3687                                         break;
3688                         }
3689
3690                         if (last_pg > start_pg) {
3691                                 /*
3692                                  * we found a range of pages that must be filled
3693                                  * if the last page in this range is the last page of the file
3694                                  * we may have to clip the size of it to keep from reading past
3695                                  * the end of the last physical block associated with the file
3696                                  */
3697                                 upl_offset = start_pg * PAGE_SIZE;
3698                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
3699
3700                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
3701                                         io_size = filesize - (upl_f_offset + upl_offset);
3702
3703                                 /*
3704                                  * issue an asynchronous read to cluster_io
3705                                  */
3706                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3707                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3708
3709                                 issued_io = 1;
3710                         }
3711                 }
3712                 if (issued_io == 0)
3713                         ubc_upl_abort(upl, 0);
3714
3715                 io_size = upl_size - start_offset;
3716
3717                 if (io_size > resid)
3718                         io_size = resid;
3719                 f_offset += io_size;
3720                 resid    -= io_size;
3721         }
3722
3723         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3724                      (int)f_offset, resid, retval, 0, 0);
3725
3726         return(retval);
3727 }
3728
3729
3730 int
3731 cluster_push(vnode_t vp, int flags)
3732 {
3733         int     retval;
3734         struct  cl_writebehind *wbp;
3735
3736         if ( !UBCINFOEXISTS(vp)) {
3737                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3738                 return (0);
3739         }
3740         /* return if deferred write is set */
3741         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3742                 return (0);
3743         }
3744         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3745                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3746                 return (0);
3747         }
3748         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3749                 lck_mtx_unlock(&wbp->cl_lockw);
3750
3751                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3752                 return(0);
3753         }
3754         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3755                      (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3756
3757         if (wbp->cl_scmap) {
3758                 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3759
3760                 retval = 1;
3761         } else
3762                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3763
3764         lck_mtx_unlock(&wbp->cl_lockw);
3765
3766         if (flags & IO_SYNC)
3767                 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3768
3769         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3770                      (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3771
3772         return (retval);
3773 }
3774
3775
3776 __private_extern__ void
3777 cluster_release(struct ubc_info *ubc)
3778 {
3779         struct cl_writebehind *wbp;
3780         struct cl_readahead   *rap;
3781
3782         if ((wbp = ubc->cl_wbehind)) {
3783
3784                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3785
3786                 if (wbp->cl_scmap)
3787                         vfs_drt_control(&(wbp->cl_scmap), 0);
3788         } else {
3789                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3790         }
3791
3792         rap = ubc->cl_rahead;
3793
3794         if (wbp != NULL) {
3795                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3796                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3797         }
3798         if ((rap = ubc->cl_rahead)) {
3799                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3800                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3801         }
3802         ubc->cl_rahead  = NULL;
3803         ubc->cl_wbehind = NULL;
3804
3805         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3806 }
3807
3808
3809 static void
3810 cluster_push_EOF(vnode_t vp, off_t EOF)
3811 {
3812         struct cl_writebehind *wbp;
3813
3814         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3815
3816         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3817                      (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3818
3819         if (wbp->cl_scmap)
3820                 sparse_cluster_push(wbp, vp, EOF, 1);
3821         else
3822                 cluster_try_push(wbp, vp, EOF, 0, 1);
3823
3824         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3825                      (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3826
3827         lck_mtx_unlock(&wbp->cl_lockw);
3828 }
3829
3830
3831 static int
3832 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3833 {
3834         int cl_index;
3835         int cl_index1;
3836         int min_index;
3837         int cl_len;
3838         int cl_pushed = 0;
3839         struct cl_wextent l_clusters[MAX_CLUSTERS];
3840
3841         /*
3842          * the write behind context exists and has
3843          * already been locked...
3844          *
3845          * make a local 'sorted' copy of the clusters
3846          * and clear wbp->cl_number so that new clusters can
3847          * be developed
3848          */
3849         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3850                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3851                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3852                                 continue;
3853                         if (min_index == -1)
3854                                 min_index = cl_index1;
3855                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3856                                 min_index = cl_index1;
3857                 }
3858                 if (min_index == -1)
3859                         break;
3860                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3861                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3862                 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3863
3864                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3865         }
3866         wbp->cl_number = 0;
3867
3868         cl_len = cl_index;
3869
3870         if (can_delay && cl_len == MAX_CLUSTERS) {
3871                 int   i;
3872
3873                 /*
3874                  * determine if we appear to be writing the file sequentially
3875                  * if not, by returning without having pushed any clusters
3876                  * we will cause this vnode to be pushed into the sparse cluster mechanism
3877                  * used for managing more random I/O patterns
3878                  *
3879                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3880                  * that's why we're in try_push with can_delay true...
3881                  *
3882                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3883                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3884                  * so we can just make a simple pass through, up to, but not including the last one...
3885                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3886                  * are sequential
3887                  *
3888                  * we let the last one be partial as long as it was adjacent to the previous one...
3889                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3890                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3891                  */
3892                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3893                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3894                                 goto dont_try;
3895                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3896                                 goto dont_try;
3897                 }
3898         }
3899         /*
3900          * drop the lock while we're firing off the I/Os...
3901          * this is safe since I'm working off of a private sorted copy
3902          * of the clusters, and I'm going to re-evaluate the public
3903          * state after I retake the lock
3904          */
3905         lck_mtx_unlock(&wbp->cl_lockw);
3906
3907         for (cl_index = 0; cl_index < cl_len; cl_index++) {
3908                 int flags;
3909                 struct cl_extent cl;
3910
3911                 /*
3912                  * try to push each cluster in turn...
3913                  */
3914                 if (l_clusters[cl_index].io_nocache)
3915                         flags = IO_NOCACHE;
3916                 else
3917                         flags = 0;
3918                 cl.b_addr = l_clusters[cl_index].b_addr;
3919                 cl.e_addr = l_clusters[cl_index].e_addr;
3920
3921                 cluster_push_x(vp, &cl, EOF, flags);
3922
3923                 l_clusters[cl_index].b_addr = 0;
3924                 l_clusters[cl_index].e_addr = 0;
3925
3926                 cl_pushed++;
3927
3928                 if (push_all == 0)
3929                         break;
3930         }
3931         lck_mtx_lock(&wbp->cl_lockw);
3932
3933 dont_try:
3934         if (cl_len > cl_pushed) {
3935                /*
3936                 * we didn't push all of the clusters, so
3937                 * lets try to merge them back in to the vnode
3938                 */
3939                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3940                         /*
3941                          * we picked up some new clusters while we were trying to
3942                          * push the old ones... this can happen because I've dropped
3943                          * the vnode lock... the sum of the
3944                          * leftovers plus the new cluster count exceeds our ability
3945                          * to represent them, so switch to the sparse cluster mechanism
3946                          *
3947                          * collect the active public clusters...
3948                          */
3949                         sparse_cluster_switch(wbp, vp, EOF);
3950
3951                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3952                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3953                                         continue;
3954                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3955                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3956                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3957
3958                                 cl_index1++;
3959                         }
3960                         /*
3961                          * update the cluster count
3962                          */
3963                         wbp->cl_number = cl_index1;
3964
3965                         /*
3966                          * and collect the original clusters that were moved into the
3967                          * local storage for sorting purposes
3968                          */
3969                         sparse_cluster_switch(wbp, vp, EOF);
3970
3971                 } else {
3972                         /*
3973                          * we've got room to merge the leftovers back in
3974                          * just append them starting at the next 'hole'
3975                          * represented by wbp->cl_number
3976                          */
3977                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3978                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3979                                         continue;
3980
3981                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3982                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3983                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3984
3985                                 cl_index1++;
3986                         }
3987                         /*
3988                          * update the cluster count
3989                          */
3990                         wbp->cl_number = cl_index1;
3991                 }
3992         }
3993         return(MAX_CLUSTERS - wbp->cl_number);
3994 }
3995
3996
3997
3998 static int
3999 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
4000 {
4001         upl_page_info_t *pl;
4002         upl_t            upl;
4003         vm_offset_t      upl_offset;
4004         int              upl_size;
4005         off_t            upl_f_offset;
4006         int              pages_in_upl;
4007         int              start_pg;
4008         int              last_pg;
4009         int              io_size;
4010         int              io_flags;
4011         int              upl_flags;
4012         int              size;
4013         int              error = 0;
4014         int              retval;
4015         kern_return_t    kret;
4016
4017
4018         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4019                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4020
4021         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4022                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4023
4024                 return (0);
4025         }
4026         upl_size = pages_in_upl * PAGE_SIZE;
4027         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4028
4029         if (upl_f_offset + upl_size >= EOF) {
4030
4031                 if (upl_f_offset >= EOF) {
4032                         /*
4033                          * must have truncated the file and missed
4034                          * clearing a dangling cluster (i.e. it's completely
4035                          * beyond the new EOF
4036                          */
4037                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4038
4039                         return(0);
4040                 }
4041                 size = EOF - upl_f_offset;
4042
4043                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4044                 pages_in_upl = upl_size / PAGE_SIZE;
4045         } else
4046                 size = upl_size;
4047
4048         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4049
4050         /*
4051          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4052          *
4053          * - only pages that are currently dirty are returned... these are the ones we need to clean
4054          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4055          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4056          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4057          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
4058          *
4059          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4060          */
4061
4062         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4063                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4064         else
4065                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4066
4067         kret = ubc_create_upl(vp,
4068                                 upl_f_offset,
4069                                 upl_size,
4070                                 &upl,
4071                                 &pl,
4072                                 upl_flags);
4073         if (kret != KERN_SUCCESS)
4074                 panic("cluster_push: failed to get pagelist");
4075
4076         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4077
4078         /*
4079          * since we only asked for the dirty pages back
4080          * it's possible that we may only get a few or even none, so...
4081          * before we start marching forward, we must make sure we know
4082          * where the last present page is in the UPL, otherwise we could
4083          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4084          * employed by commit_range and abort_range.
4085          */
4086         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4087                 if (upl_page_present(pl, last_pg))
4088                         break;
4089         }
4090         pages_in_upl = last_pg + 1;
4091
4092         if (pages_in_upl == 0) {
4093                 ubc_upl_abort(upl, 0);
4094
4095                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4096                 return(0);
4097         }
4098
4099         for (last_pg = 0; last_pg < pages_in_upl; ) {
4100                 /*
4101                  * find the next dirty page in the UPL
4102                  * this will become the first page in the
4103                  * next I/O to generate
4104                  */
4105                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4106                         if (upl_dirty_page(pl, start_pg))
4107                                 break;
4108                         if (upl_page_present(pl, start_pg))
4109                                 /*
4110                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4111                                  * just release these unchanged since we're not going
4112                                  * to steal them or change their state
4113                                  */
4114                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4115                 }
4116                 if (start_pg >= pages_in_upl)
4117                         /*
4118                          * done... no more dirty pages to push
4119                          */
4120                         break;
4121                 if (start_pg > last_pg)
4122                         /*
4123                          * skipped over some non-dirty pages
4124                          */
4125                         size -= ((start_pg - last_pg) * PAGE_SIZE);
4126
4127                 /*
4128                  * find a range of dirty pages to write
4129                  */
4130                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4131                         if (!upl_dirty_page(pl, last_pg))
4132                                 break;
4133                 }
4134                 upl_offset = start_pg * PAGE_SIZE;
4135
4136                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4137
4138                 io_flags = CL_THROTTLE | CL_COMMIT;
4139
4140                 if ( !(flags & IO_SYNC))
4141                         io_flags |= CL_ASYNC;
4142
4143                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4144                                     io_flags, (buf_t)NULL, (struct clios *)NULL);
4145
4146                 if (error == 0 && retval)
4147                         error = retval;
4148
4149                 size -= io_size;
4150         }
4151         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4152
4153         return(error);
4154 }
4155
4156
4157 /*
4158  * sparse_cluster_switch is called with the write behind lock held
4159  */
4160 static void
4161 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4162 {
4163         int     cl_index;
4164
4165         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4166
4167         if (wbp->cl_scmap == NULL)
4168                 wbp->cl_scdirty = 0;
4169
4170         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4171                 int       flags;
4172                 struct cl_extent cl;
4173
4174                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4175
4176                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4177                                 if (flags & UPL_POP_DIRTY) {
4178                                         cl.e_addr = cl.b_addr + 1;
4179
4180                                         sparse_cluster_add(wbp, vp, &cl, EOF);
4181                                 }
4182                         }
4183                 }
4184         }
4185         wbp->cl_number = 0;
4186
4187         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4188 }
4189
4190
4191 /*
4192  * sparse_cluster_push is called with the write behind lock held
4193  */
4194 static void
4195 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4196 {
4197         struct cl_extent cl;
4198         off_t           offset;
4199         u_int           length;
4200
4201         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4202
4203         if (push_all)
4204                 vfs_drt_control(&(wbp->cl_scmap), 1);
4205
4206         for (;;) {
4207                 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4208                         break;
4209
4210                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4211                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4212
4213                 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4214
4215                 cluster_push_x(vp, &cl, EOF, 0);
4216
4217                 if (push_all == 0)
4218                         break;
4219         }
4220         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4221 }
4222
4223
4224 /*
4225  * sparse_cluster_add is called with the write behind lock held
4226  */
4227 static void
4228 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4229 {
4230         u_int   new_dirty;
4231         u_int   length;
4232         off_t   offset;
4233
4234         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4235
4236         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4237         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4238
4239         while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4240                 /*
4241                  * no room left in the map
4242                  * only a partial update was done
4243                  * push out some pages and try again
4244                  */
4245                 wbp->cl_scdirty += new_dirty;
4246
4247                 sparse_cluster_push(wbp, vp, EOF, 0);
4248
4249                 offset += (new_dirty * PAGE_SIZE_64);
4250                 length -= (new_dirty * PAGE_SIZE);
4251         }
4252         wbp->cl_scdirty += new_dirty;
4253
4254         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4255 }
4256
4257
4258 static int
4259 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4260 {
4261         upl_page_info_t  *pl;
4262         upl_t            upl;
4263         addr64_t         ubc_paddr;
4264         kern_return_t    kret;
4265         int              error = 0;
4266         int              did_read = 0;
4267         int              abort_flags;
4268         int              upl_flags;
4269
4270         upl_flags = UPL_SET_LITE;
4271         if (! (flags & CL_READ)) {
4272                 /*
4273                  * "write" operation:  let the UPL subsystem know
4274                  * that we intend to modify the buffer cache pages
4275                  * we're gathering.
4276                  */
4277                 upl_flags |= UPL_WILL_MODIFY;
4278         }
4279
4280         kret = ubc_create_upl(vp,
4281                               uio->uio_offset & ~PAGE_MASK_64,
4282                               PAGE_SIZE,
4283                               &upl,
4284                               &pl,
4285                               upl_flags);
4286
4287         if (kret != KERN_SUCCESS)
4288                 return(EINVAL);
4289
4290         if (!upl_valid_page(pl, 0)) {
4291                 /*
4292                  * issue a synchronous read to cluster_io
4293                  */
4294                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4295                                    CL_READ, (buf_t)NULL, (struct clios *)NULL);
4296                 if (error) {
4297                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4298
4299                           return(error);
4300                 }
4301                 did_read = 1;
4302         }
4303         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4304
4305 /*
4306  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
4307  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4308  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
4309  *      way to do so without exporting them to kexts as well.
4310  */
4311         if (flags & CL_READ)
4312 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
4313                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
4314         else
4315 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
4316                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
4317
4318         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4319                 /*
4320                  * issue a synchronous write to cluster_io
4321                  */
4322                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4323                                         0, (buf_t)NULL, (struct clios *)NULL);
4324         }
4325         if (error == 0)
4326                 uio_update(uio, (user_size_t)xsize);
4327
4328         if (did_read)
4329                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4330         else
4331                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4332
4333         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4334
4335         return (error);
4336 }
4337
4338
4339
4340 int
4341 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4342 {
4343         int       pg_offset;
4344         int       pg_index;
4345         int       csize;
4346         int       segflg;
4347         int       retval = 0;
4348         upl_page_info_t *pl;
4349
4350         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4351                      (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4352
4353         segflg = uio->uio_segflg;
4354
4355         switch(segflg) {
4356
4357           case UIO_USERSPACE32:
4358           case UIO_USERISPACE32:
4359                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4360                 break;
4361
4362           case UIO_USERSPACE:
4363           case UIO_USERISPACE:
4364                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4365                 break;
4366
4367           case UIO_USERSPACE64:
4368           case UIO_USERISPACE64:
4369                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4370                 break;
4371
4372           case UIO_SYSSPACE32:
4373                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4374                 break;
4375
4376           case UIO_SYSSPACE:
4377                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4378                 break;
4379
4380           case UIO_SYSSPACE64:
4381                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4382                 break;
4383         }
4384         pl = ubc_upl_pageinfo(upl);
4385
4386         pg_index  = upl_offset / PAGE_SIZE;
4387         pg_offset = upl_offset & PAGE_MASK;
4388         csize     = min(PAGE_SIZE - pg_offset, xsize);
4389
4390         while (xsize && retval == 0) {
4391                 addr64_t  paddr;
4392
4393                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4394
4395                 retval = uiomove64(paddr, csize, uio);
4396
4397                 pg_index += 1;
4398                 pg_offset = 0;
4399                 xsize    -= csize;
4400                 csize     = min(PAGE_SIZE, xsize);
4401         }
4402         uio->uio_segflg = segflg;
4403
4404         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4405                      (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4406
4407         return (retval);
4408 }
4409
4410
4411 int
4412 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4413 {
4414         int       segflg;
4415         int       io_size;
4416         int       xsize;
4417         int       start_offset;
4418         int       retval = 0;
4419         memory_object_control_t  control;
4420
4421
4422         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4423                      (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4424
4425         control = ubc_getobject(vp, UBC_FLAGS_NONE);
4426         if (control == MEMORY_OBJECT_CONTROL_NULL) {
4427                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4428                              (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4429
4430                 return(0);
4431         }
4432         segflg = uio->uio_segflg;
4433
4434         switch(segflg) {
4435
4436           case UIO_USERSPACE32:
4437           case UIO_USERISPACE32:
4438                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4439                 break;
4440
4441           case UIO_USERSPACE64:
4442           case UIO_USERISPACE64:
4443                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4444                 break;
4445
4446           case UIO_SYSSPACE32:
4447                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4448                 break;
4449
4450           case UIO_SYSSPACE64:
4451                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4452                 break;
4453
4454           case UIO_USERSPACE:
4455           case UIO_USERISPACE:
4456                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4457                 break;
4458
4459           case UIO_SYSSPACE:
4460                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4461                 break;
4462         }
4463
4464         if ( (io_size = *io_resid) ) {
4465                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4466                 xsize = uio_resid(uio);
4467
4468                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4469                                                        uio, start_offset, io_size, mark_dirty);
4470                 xsize -= uio_resid(uio);
4471                 io_size -= xsize;
4472         }
4473         uio->uio_segflg = segflg;
4474         *io_resid       = io_size;
4475
4476         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4477                      (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4478
4479         return(retval);
4480 }
4481
4482
4483 int
4484 is_file_clean(vnode_t vp, off_t filesize)
4485 {
4486         off_t f_offset;
4487         int   flags;
4488         int   total_dirty = 0;
4489
4490         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4491                 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4492                         if (flags & UPL_POP_DIRTY) {
4493                                 total_dirty++;
4494                         }
4495                 }
4496         }
4497         if (total_dirty)
4498                 return(EINVAL);
4499
4500         return (0);
4501 }
4502
4503
4504
4505 /*
4506  * Dirty region tracking/clustering mechanism.
4507  *
4508  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4509  * dirty regions within a larger space (file).  It is primarily intended to
4510  * support clustering in large files with many dirty areas.
4511  *
4512  * The implementation assumes that the dirty regions are pages.
4513  *
4514  * To represent dirty pages within the file, we store bit vectors in a
4515  * variable-size circular hash.
4516  */
4517
4518 /*
4519  * Bitvector size.  This determines the number of pages we group in a
4520  * single hashtable entry.  Each hashtable entry is aligned to this
4521  * size within the file.
4522  */
4523 #define DRT_BITVECTOR_PAGES             256
4524
4525 /*
4526  * File offset handling.
4527  *
4528  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4529  * the correct formula is  (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4530  */
4531 #define DRT_ADDRESS_MASK                (~((1 << 20) - 1))
4532 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
4533
4534 /*
4535  * Hashtable address field handling.
4536  *
4537  * The low-order bits of the hashtable address are used to conserve
4538  * space.
4539  *
4540  * DRT_HASH_COUNT_MASK must be large enough to store the range
4541  * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4542  * to indicate that the bucket is actually unoccupied.
4543  */
4544 #define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4545 #define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
4546         do {                                                                                            \
4547                 (scm)->scm_hashtable[(i)].dhe_control =                                                 \
4548                     ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4549         } while (0)
4550 #define DRT_HASH_COUNT_MASK             0x1ff
4551 #define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4552 #define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
4553         do {                                                                                                            \
4554                 (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
4555                     ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
4556         } while (0)
4557 #define DRT_HASH_CLEAR(scm, i)                                                                                          \
4558         do {                                                                                                            \
4559                 (scm)->scm_hashtable[(i)].dhe_control = 0;                                                              \
4560         } while (0)
4561 #define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4562 #define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4563 #define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
4564         do {                                                                                            \
4565                 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
4566                 DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
4567         } while(0);
4568
4569
4570 /*
4571  * Hash table moduli.
4572  *
4573  * Since the hashtable entry's size is dependent on the size of
4574  * the bitvector, and since the hashtable size is constrained to
4575  * both being prime and fitting within the desired allocation
4576  * size, these values need to be manually determined.
4577  *
4578  * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
4579  *
4580  * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4581  * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4582  */
4583 #define DRT_HASH_SMALL_MODULUS  23
4584 #define DRT_HASH_LARGE_MODULUS  401
4585
4586 #define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare */
4587 #define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare */
4588
4589 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4590
4591 /*
4592  * Hashtable bitvector handling.
4593  *
4594  * Bitvector fields are 32 bits long.
4595  */
4596
4597 #define DRT_HASH_SET_BIT(scm, i, bit)                           \
4598         (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4599
4600 #define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
4601         (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4602
4603 #define DRT_HASH_TEST_BIT(scm, i, bit)                          \
4604         ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4605
4606 #define DRT_BITVECTOR_CLEAR(scm, i)                             \
4607         bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4608
4609 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
4610         bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
4611             &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
4612             (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4613
4614
4615
4616 /*
4617  * Hashtable entry.
4618  */
4619 struct vfs_drt_hashentry {
4620         u_int64_t       dhe_control;
4621         u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4622 };
4623
4624 /*
4625  * Dirty Region Tracking structure.
4626  *
4627  * The hashtable is allocated entirely inside the DRT structure.
4628  *
4629  * The hash is a simple circular prime modulus arrangement, the structure
4630  * is resized from small to large if it overflows.
4631  */
4632
4633 struct vfs_drt_clustermap {
4634         u_int32_t               scm_magic;      /* sanity/detection */
4635 #define DRT_SCM_MAGIC           0x12020003
4636         u_int32_t               scm_modulus;    /* current ring size */
4637         u_int32_t               scm_buckets;    /* number of occupied buckets */
4638         u_int32_t               scm_lastclean;  /* last entry we cleaned */
4639         u_int32_t               scm_iskips;     /* number of slot skips */
4640
4641         struct vfs_drt_hashentry scm_hashtable[0];
4642 };
4643
4644
4645 #define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
4646 #define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
4647
4648 /*
4649  * Debugging codes and arguments.
4650  */
4651 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4652 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4653 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4654 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4655 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4656                                                             * dirty */
4657                                                            /* 0, setcount */
4658                                                            /* 1 (clean, no map) */
4659                                                            /* 2 (map alloc fail) */
4660                                                            /* 3, resid (partial) */
4661 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
4662 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4663                                                             * lastclean, iskips */
4664
4665
4666 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4667 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4668 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4669         u_int64_t offset, int *indexp);
4670 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4671         u_int64_t offset,
4672         int *indexp,
4673         int recursed);
4674 static kern_return_t    vfs_drt_do_mark_pages(
4675         void            **cmapp,
4676         u_int64_t       offset,
4677         u_int           length,
4678         int             *setcountp,
4679         int             dirty);
4680 static void             vfs_drt_trace(
4681         struct vfs_drt_clustermap *cmap,
4682         int code,
4683         int arg1,
4684         int arg2,
4685         int arg3,
4686         int arg4);
4687
4688
4689 /*
4690  * Allocate and initialise a sparse cluster map.
4691  *
4692  * Will allocate a new map, resize or compact an existing map.
4693  *
4694  * XXX we should probably have at least one intermediate map size,
4695  * as the 1:16 ratio seems a bit drastic.
4696  */
4697 static kern_return_t
4698 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4699 {
4700         struct vfs_drt_clustermap *cmap, *ocmap;
4701         kern_return_t   kret;
4702         u_int64_t       offset;
4703         int             nsize, i, active_buckets, index, copycount;
4704
4705         ocmap = NULL;
4706         if (cmapp != NULL)
4707                 ocmap = *cmapp;
4708
4709         /*
4710          * Decide on the size of the new map.
4711          */
4712         if (ocmap == NULL) {
4713                 nsize = DRT_HASH_SMALL_MODULUS;
4714         } else {
4715                 /* count the number of active buckets in the old map */
4716                 active_buckets = 0;
4717                 for (i = 0; i < ocmap->scm_modulus; i++) {
4718                         if (!DRT_HASH_VACANT(ocmap, i) &&
4719                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4720                                 active_buckets++;
4721                 }
4722                 /*
4723                  * If we're currently using the small allocation, check to
4724                  * see whether we should grow to the large one.
4725                  */
4726                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4727                         /* if the ring is nearly full */
4728                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4729                                 nsize = DRT_HASH_LARGE_MODULUS;
4730                         } else {
4731                                 nsize = DRT_HASH_SMALL_MODULUS;
4732                         }
4733                 } else {
4734                         /* already using the large modulus */
4735                         nsize = DRT_HASH_LARGE_MODULUS;
4736                         /*
4737                          * If the ring is completely full, there's
4738                          * nothing useful for us to do.  Behave as
4739                          * though we had compacted into the new
4740                          * array and return.
4741                          */
4742                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4743                                 return(KERN_SUCCESS);
4744                 }
4745         }
4746
4747         /*
4748          * Allocate and initialise the new map.
4749          */
4750
4751         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4752             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4753         if (kret != KERN_SUCCESS)
4754                 return(kret);
4755         cmap->scm_magic = DRT_SCM_MAGIC;
4756         cmap->scm_modulus = nsize;
4757         cmap->scm_buckets = 0;
4758         cmap->scm_lastclean = 0;
4759         cmap->scm_iskips = 0;
4760         for (i = 0; i < cmap->scm_modulus; i++) {
4761                 DRT_HASH_CLEAR(cmap, i);
4762                 DRT_HASH_VACATE(cmap, i);
4763                 DRT_BITVECTOR_CLEAR(cmap, i);
4764         }
4765
4766         /*
4767          * If there's an old map, re-hash entries from it into the new map.
4768          */
4769         copycount = 0;
4770         if (ocmap != NULL) {
4771                 for (i = 0; i < ocmap->scm_modulus; i++) {
4772                         /* skip empty buckets */
4773                         if (DRT_HASH_VACANT(ocmap, i) ||
4774                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4775                                 continue;
4776                         /* get new index */
4777                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4778                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4779                         if (kret != KERN_SUCCESS) {
4780                                 /* XXX need to bail out gracefully here */
4781                                 panic("vfs_drt: new cluster map mysteriously too small");
4782                         }
4783                         /* copy */
4784                         DRT_HASH_COPY(ocmap, i, cmap, index);
4785                         copycount++;
4786                 }
4787         }
4788
4789         /* log what we've done */
4790         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4791
4792         /*
4793          * It's important to ensure that *cmapp always points to
4794          * a valid map, so we must overwrite it before freeing
4795          * the old map.
4796          */
4797         *cmapp = cmap;
4798         if (ocmap != NULL) {
4799                 /* emit stats into trace buffer */
4800                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4801                               ocmap->scm_modulus,
4802                               ocmap->scm_buckets,
4803                               ocmap->scm_lastclean,
4804                               ocmap->scm_iskips);
4805
4806                 vfs_drt_free_map(ocmap);
4807         }
4808         return(KERN_SUCCESS);
4809 }
4810
4811
4812 /*
4813  * Free a sparse cluster map.
4814  */
4815 static kern_return_t
4816 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4817 {
4818         kmem_free(kernel_map, (vm_offset_t)cmap,
4819                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4820         return(KERN_SUCCESS);
4821 }
4822
4823
4824 /*
4825  * Find the hashtable slot currently occupied by an entry for the supplied offset.
4826  */
4827 static kern_return_t
4828 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4829 {
4830         int             index, i;
4831
4832         offset = DRT_ALIGN_ADDRESS(offset);
4833         index = DRT_HASH(cmap, offset);
4834
4835         /* traverse the hashtable */
4836         for (i = 0; i < cmap->scm_modulus; i++) {
4837
4838                 /*
4839                  * If the slot is vacant, we can stop.
4840                  */
4841                 if (DRT_HASH_VACANT(cmap, index))
4842                         break;
4843
4844                 /*
4845                  * If the address matches our offset, we have success.
4846                  */
4847                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4848                         *indexp = index;
4849                         return(KERN_SUCCESS);
4850                 }
4851
4852                 /*
4853                  * Move to the next slot, try again.
4854                  */
4855                 index = DRT_HASH_NEXT(cmap, index);
4856         }
4857         /*
4858          * It's not there.
4859          */
4860         return(KERN_FAILURE);
4861 }
4862
4863 /*
4864  * Find the hashtable slot for the supplied offset.  If we haven't allocated
4865  * one yet, allocate one and populate the address field.  Note that it will
4866  * not have a nonzero page count and thus will still technically be free, so
4867  * in the case where we are called to clean pages, the slot will remain free.
4868  */
4869 static kern_return_t
4870 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4871 {
4872         struct vfs_drt_clustermap *cmap;
4873         kern_return_t   kret;
4874         int             index, i;
4875
4876         cmap = *cmapp;
4877
4878         /* look for an existing entry */
4879         kret = vfs_drt_search_index(cmap, offset, indexp);
4880         if (kret == KERN_SUCCESS)
4881                 return(kret);
4882
4883         /* need to allocate an entry */
4884         offset = DRT_ALIGN_ADDRESS(offset);
4885         index = DRT_HASH(cmap, offset);
4886
4887         /* scan from the index forwards looking for a vacant slot */
4888         for (i = 0; i < cmap->scm_modulus; i++) {
4889                 /* slot vacant? */
4890                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4891                         cmap->scm_buckets++;
4892                         if (index < cmap->scm_lastclean)
4893                                 cmap->scm_lastclean = index;
4894                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
4895                         DRT_HASH_SET_COUNT(cmap, index, 0);
4896                         DRT_BITVECTOR_CLEAR(cmap, index);
4897                         *indexp = index;
4898                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4899                         return(KERN_SUCCESS);
4900                 }
4901                 cmap->scm_iskips += i;
4902                 index = DRT_HASH_NEXT(cmap, index);
4903         }
4904
4905         /*
4906          * We haven't found a vacant slot, so the map is full.  If we're not
4907          * already recursed, try reallocating/compacting it.
4908          */
4909         if (recursed)
4910                 return(KERN_FAILURE);
4911         kret = vfs_drt_alloc_map(cmapp);
4912         if (kret == KERN_SUCCESS) {
4913                 /* now try to insert again */
4914                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4915         }
4916         return(kret);
4917 }
4918
4919 /*
4920  * Implementation of set dirty/clean.
4921  *
4922  * In the 'clean' case, not finding a map is OK.
4923  */
4924 static kern_return_t
4925 vfs_drt_do_mark_pages(
4926         void            **private,
4927         u_int64_t       offset,
4928         u_int           length,
4929         int             *setcountp,
4930         int             dirty)
4931 {
4932         struct vfs_drt_clustermap *cmap, **cmapp;
4933         kern_return_t   kret;
4934         int             i, index, pgoff, pgcount, setcount, ecount;
4935
4936         cmapp = (struct vfs_drt_clustermap **)private;
4937         cmap = *cmapp;
4938
4939         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4940
4941         if (setcountp != NULL)
4942                 *setcountp = 0;
4943
4944         /* allocate a cluster map if we don't already have one */
4945         if (cmap == NULL) {
4946                 /* no cluster map, nothing to clean */
4947                 if (!dirty) {
4948                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4949                         return(KERN_SUCCESS);
4950                 }
4951                 kret = vfs_drt_alloc_map(cmapp);
4952                 if (kret != KERN_SUCCESS) {
4953                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4954                         return(kret);
4955                 }
4956         }
4957         setcount = 0;
4958
4959         /*
4960          * Iterate over the length of the region.
4961          */
4962         while (length > 0) {
4963                 /*
4964                  * Get the hashtable index for this offset.
4965                  *
4966                  * XXX this will add blank entries if we are clearing a range
4967                  * that hasn't been dirtied.
4968                  */
4969                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4970                 cmap = *cmapp;  /* may have changed! */
4971                 /* this may be a partial-success return */
4972                 if (kret != KERN_SUCCESS) {
4973                         if (setcountp != NULL)
4974                                 *setcountp = setcount;
4975                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4976
4977                         return(kret);
4978                 }
4979
4980                 /*
4981                  * Work out how many pages we're modifying in this
4982                  * hashtable entry.
4983                  */
4984                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4985                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4986
4987                 /*
4988                  * Iterate over pages, dirty/clearing as we go.
4989                  */
4990                 ecount = DRT_HASH_GET_COUNT(cmap, index);
4991                 for (i = 0; i < pgcount; i++) {
4992                         if (dirty) {
4993                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4994                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4995                                         ecount++;
4996                                         setcount++;
4997                                 }
4998                         } else {
4999                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5000                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5001                                         ecount--;
5002                                         setcount++;
5003                                 }
5004                         }
5005                 }
5006                 DRT_HASH_SET_COUNT(cmap, index, ecount);
5007
5008                 offset += pgcount * PAGE_SIZE;
5009                 length -= pgcount * PAGE_SIZE;
5010         }
5011         if (setcountp != NULL)
5012                 *setcountp = setcount;
5013
5014         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5015
5016         return(KERN_SUCCESS);
5017 }
5018
5019 /*
5020  * Mark a set of pages as dirty/clean.
5021  *
5022  * This is a public interface.
5023  *
5024  * cmapp
5025  *      Pointer to storage suitable for holding a pointer.  Note that
5026  *      this must either be NULL or a value set by this function.
5027  *
5028  * size
5029  *      Current file size in bytes.
5030  *
5031  * offset
5032  *      Offset of the first page to be marked as dirty, in bytes.  Must be
5033  *      page-aligned.
5034  *
5035  * length
5036  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
5037  *
5038  * setcountp
5039  *      Number of pages newly marked dirty by this call (optional).
5040  *
5041  * Returns KERN_SUCCESS if all the pages were successfully marked.
5042  */
5043 static kern_return_t
5044 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5045 {
5046         /* XXX size unused, drop from interface */
5047         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5048 }
5049
5050 #if 0
5051 static kern_return_t
5052 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5053 {
5054         return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5055 }
5056 #endif
5057
5058 /*
5059  * Get a cluster of dirty pages.
5060  *
5061  * This is a public interface.
5062  *
5063  * cmapp
5064  *      Pointer to storage managed by drt_mark_pages.  Note that this must
5065  *      be NULL or a value set by drt_mark_pages.
5066  *
5067  * offsetp
5068  *      Returns the byte offset into the file of the first page in the cluster.
5069  *
5070  * lengthp
5071  *      Returns the length in bytes of the cluster of dirty pages.
5072  *
5073  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
5074  * are no dirty pages meeting the minmum size criteria.  Private storage will
5075  * be released if there are no more dirty pages left in the map
5076  *
5077  */
5078 static kern_return_t
5079 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5080 {
5081         struct vfs_drt_clustermap *cmap;
5082         u_int64_t       offset;
5083         u_int           length;
5084         int             index, i, j, fs, ls;
5085
5086         /* sanity */
5087         if ((cmapp == NULL) || (*cmapp == NULL))
5088                 return(KERN_FAILURE);
5089         cmap = *cmapp;
5090
5091         /* walk the hashtable */
5092         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5093                 index = DRT_HASH(cmap, offset);
5094
5095                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5096                         continue;
5097
5098                 /* scan the bitfield for a string of bits */
5099                 fs = -1;
5100
5101                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5102                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5103                                 fs = i;
5104                                 break;
5105                         }
5106                 }
5107                 if (fs == -1) {
5108                         /*  didn't find any bits set */
5109                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
5110                 }
5111                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5112                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
5113                                 break;
5114                 }
5115
5116                 /* compute offset and length, mark pages clean */
5117                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5118                 length = ls * PAGE_SIZE;
5119                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5120                 cmap->scm_lastclean = index;
5121
5122                 /* return successful */
5123                 *offsetp = (off_t)offset;
5124                 *lengthp = length;
5125
5126                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5127                 return(KERN_SUCCESS);
5128         }
5129         /*
5130          * We didn't find anything... hashtable is empty
5131          * emit stats into trace buffer and
5132          * then free it
5133          */
5134         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5135                       cmap->scm_modulus,
5136                       cmap->scm_buckets,
5137                       cmap->scm_lastclean,
5138                       cmap->scm_iskips);
5139
5140         vfs_drt_free_map(cmap);
5141         *cmapp = NULL;
5142
5143         return(KERN_FAILURE);
5144 }
5145
5146
5147 static kern_return_t
5148 vfs_drt_control(void **cmapp, int op_type)
5149 {
5150         struct vfs_drt_clustermap *cmap;
5151
5152         /* sanity */
5153         if ((cmapp == NULL) || (*cmapp == NULL))
5154                 return(KERN_FAILURE);
5155         cmap = *cmapp;
5156
5157         switch (op_type) {
5158         case 0:
5159                 /* emit stats into trace buffer */
5160                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5161                               cmap->scm_modulus,
5162                               cmap->scm_buckets,
5163                               cmap->scm_lastclean,
5164                               cmap->scm_iskips);
5165
5166                 vfs_drt_free_map(cmap);
5167                 *cmapp = NULL;
5168                 break;
5169
5170         case 1:
5171                 cmap->scm_lastclean = 0;
5172                 break;
5173         }
5174         return(KERN_SUCCESS);
5175 }
5176
5177
5178
5179 /*
5180  * Emit a summary of the state of the clustermap into the trace buffer
5181  * along with some caller-provided data.
5182  */
5183 #if KDEBUG
5184 static void
5185 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5186 {
5187         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5188 }
5189 #else
5190 static void
5191 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5192                           __unused int arg1, __unused int arg2, __unused int arg3,
5193                           __unused int arg4)
5194 {
5195 }
5196 #endif
5197
5198 #if 0
5199 /*
5200  * Perform basic sanity check on the hash entry summary count
5201  * vs. the actual bits set in the entry.
5202  */
5203 static void
5204 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5205 {
5206         int index, i;
5207         int bits_on;
5208
5209         for (index = 0; index < cmap->scm_modulus; index++) {
5210                 if (DRT_HASH_VACANT(cmap, index))
5211                         continue;
5212
5213                 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5214                         if (DRT_HASH_TEST_BIT(cmap, index, i))
5215                                 bits_on++;
5216                 }
5217                 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5218                         panic("bits_on = %d,  index = %d\n", bits_on, index);
5219         }
5220 }
5221 #endif