bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  31 /*
  32  * Copyright (c) 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 3. All advertising materials mentioning features or use of this software
  44  *    must display the following acknowledgement:
  45  *      This product includes software developed by the University of
  46  *      California, Berkeley and its contributors.
  47  * 4. Neither the name of the University nor the names of its contributors
  48  *    may be used to endorse or promote products derived from this software
  49  *    without specific prior written permission.
  50  *
  51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  61  * SUCH DAMAGE.
  62  *
  63  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  64  */
  65
  66 #include <sys/param.h>
  67 #include <sys/proc_internal.h>
  68 #include <sys/buf_internal.h>
  69 #include <sys/mount_internal.h>
  70 #include <sys/vnode_internal.h>
  71 #include <sys/trace.h>
  72 #include <sys/malloc.h>
  73 #include <sys/time.h>
  74 #include <sys/kernel.h>
  75 #include <sys/resourcevar.h>
  76 #include <sys/uio_internal.h>
  77 #include <libkern/libkern.h>
  78 #include <machine/machine_routines.h>
  79
  80 #include <sys/ubc_internal.h>
  81
  82 #include <mach/mach_types.h>
  83 #include <mach/memory_object_types.h>
  84 #include <mach/vm_map.h>
  85 #include <mach/upl.h>
  86
  87 #include <vm/vm_kern.h>
  88 #include <vm/vm_map.h>
  89 #include <vm/vm_pageout.h>
  90
  91 #include <sys/kdebug.h>
  92
  93
   94 #define CL_READ      0x01 /* request is a read: cluster_io uses B_READ, VNODE_READ and the mount's read limits; absent means write */
   95 #define CL_ASYNC     0x02 /* NOTE(review): no use visible in this part of the file; presumably requests asynchronous completion - confirm */
   96 #define CL_COMMIT    0x04 /* cluster_io sets B_COMMIT_UPL, so cluster_iodone commits or aborts the UPL range */
   97 #define CL_PAGEOUT   0x10 /* sets B_PAGEIO; exempt from the hard throttle; a write into a hole fails with EINVAL */
   98 #define CL_AGE       0x20 /* cluster_io sets B_AGE (commit uses UPL_COMMIT_INACTIVATE) */
   99 #define CL_DUMP      0x40 /* cluster_io sets B_NOCACHE (cluster_iodone then aborts rather than commits the UPL range) */
 100 #define CL_NOZERO    0x80 /* on reads, do not zero the tail of a partially read last page */
 101 #define CL_PAGEIN    0x100 /* cluster_io sets B_PAGEIO */
 102 #define CL_DEV_MEMORY 0x200 /* treat the request as one 'giant' page; holes are rejected with EINVAL */
 103 #define CL_PRESERVE   0x400 /* cluster_io sets B_PHYS */
 104 #define CL_THROTTLE   0x800 /* cluster_io selects an async throttle limit for this request */
 105 #define CL_KEEPCACHED 0x1000 /* cluster_io sets B_CACHE (pages left in the cache unchanged on error) */
 106
 107
 108 struct clios {                     /* progress state shared by a stream of cluster I/Os; cluster_iodone updates it while holding cl_mtxp */
 109         u_int  io_completed;       /* amount of io that has currently completed (cluster_iodone adds each finished transaction's summed b_bcount) */
 110         u_int  io_issued;          /* amount of io that was successfully issued */
 111         int    io_error;           /* error code of first error encountered; later errors do not overwrite it */
 112         int    io_wanted;          /* non-zero when someone is sleeping waiting for a change in state; cluster_iodone clears it and wakes &io_wanted */
 113 };
 114
 115 static lck_grp_t        *cl_mtx_grp;       /* "cluster I/O" lock group, created by cluster_init */
 116 static lck_attr_t       *cl_mtx_attr;      /* lock attribute used for cl_mtxp and for the read-ahead / write-behind context locks */
 117 static lck_grp_attr_t   *cl_mtx_grp_attr;  /* attribute cl_mtx_grp is created with */
 118 static lck_mtx_t        *cl_mtxp;          /* held by cluster_iodone while it updates a struct clios */
 119
 120
 121 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 122                       int flags, buf_t real_bp, struct clios *iostate);
 123 static int cluster_iodone(buf_t bp, void *dummy);
 124 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
 125 static int cluster_hard_throttle_on(vnode_t vp);
 126
 127 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
 128 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
 129                            off_t headOff, off_t tailOff, int flags);
 130 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
 131 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
 132 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
 133 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
 134 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
 135
 136 static void     cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
 137
 138 static int      cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
 139 static void     cluster_push_EOF(vnode_t vp, off_t EOF);
 140
 141 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
 142
 143 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
 144 static void     sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
 145 static void     sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
 146
 147 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
 148 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 149 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 150
 151 int     is_file_clean(vnode_t, off_t);
 152
 153 /*
 154  * throttle the number of async writes that
 155  * can be outstanding on a single vnode
 156  * before we issue a synchronous write
 157  */
 158 #define HARD_THROTTLE_MAXCNT    0              /* async_throttle value cluster_io uses while the hard throttle is on */
 159 #define HARD_THROTTLE_MAXSIZE   (64 * 1024)    /* cap cluster_io applies to max_iosize while the hard throttle is on */
 160
 161 int hard_throttle_on_root = 0;                 /* when non-zero, cluster_hard_throttle_on reports throttling for vnodes on the root device */
 162 struct timeval priority_IO_timestamp_for_root; /* uptime stamp that cluster_hard_throttle_on subtracts from the current uptime; presumably set outside this view */
 163
 164
 165 void
 166 cluster_init(void) {
 167         /*
 168          * allocate lock group attribute and group
 169          */
 170         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 171         //lck_grp_attr_setstat(cl_mtx_grp_attr);
 172         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 173
 174         /*
 175          * allocate the lock attribute
 176          */
 177         cl_mtx_attr = lck_attr_alloc_init();
 178         //lck_attr_setdebug(clf_mtx_attr);
 179
 180         /*
 181          * allocate and initialize mutex's used to protect updates and waits
 182          * on the cluster_io context
 183          */
 184         cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 185
 186         if (cl_mtxp == NULL)
 187                 panic("cluster_init: failed to allocate cl_mtxp");
 188 }
 189
 190
 191
 192 #define CLW_ALLOCATE            0x01 /* cluster_get_wbp: create the write behind context if the vnode has none (otherwise NULL is returned) */
 193 #define CLW_RETURNLOCKED        0x02 /* cluster_get_wbp: take the context's cl_lockw (blocking) before returning it */
 194 /*
 195  * if the read ahead context doesn't yet exist,
 196  * allocate and initialize it...
 197  * the vnode lock serializes multiple callers
 198  * during the actual assignment... first one
 199  * to grab the lock wins... the other callers
 200  * will release the now unnecessary storage
 201  *
 202  * once the context is present, try to grab (but don't block on)
 203  * the lock associated with it... if someone
 204  * else currently owns it, than the read
 205  * will run without read-ahead.  this allows
 206  * multiple readers to run in parallel and
 207  * since there's only 1 read ahead context,
 208  * there's no real loss in only allowing 1
 209  * reader to have read-ahead enabled.
 210  */
 211 static struct cl_readahead *
 212 cluster_get_rap(vnode_t vp)
 213 {
 214         struct ubc_info         *ubc;
 215         struct cl_readahead     *rap;
 216
 217         ubc = vp->v_ubcinfo;
 218
 219         if ((rap = ubc->cl_rahead) == NULL) {
 220                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 221
 222                 bzero(rap, sizeof *rap);
 223                 rap->cl_lastr = -1;
 224                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 225
 226                 vnode_lock(vp);
 227
 228                 if (ubc->cl_rahead == NULL)
 229                         ubc->cl_rahead = rap;
 230                 else {
 231                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 232                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 233                                 rap = ubc->cl_rahead;
 234                 }
 235                 vnode_unlock(vp);
 236         }
 237         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 238                 return(rap);
 239
 240         return ((struct cl_readahead *)NULL);
 241 }
 242
 243
 244 /*
 245  * if the write behind context doesn't yet exist,
 246  * and CLW_ALLOCATE is specified, allocate and initialize it...
 247  * the vnode lock serializes multiple callers
 248  * during the actual assignment... first one
 249  * to grab the lock wins... the other callers
 250  * will release the now unnecessary storage
 251  *
 252  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 253  * the lock associated with the write behind context before
 254  * returning
 255  */
 256
 257 static struct cl_writebehind *
 258 cluster_get_wbp(vnode_t vp, int flags)
 259 {
 260         struct ubc_info *ubc;
 261         struct cl_writebehind *wbp;
 262
 263         ubc = vp->v_ubcinfo;
 264
 265         if ((wbp = ubc->cl_wbehind) == NULL) {
 266
 267                 if ( !(flags & CLW_ALLOCATE))
 268                         return ((struct cl_writebehind *)NULL);
 269
 270                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 271
 272                 bzero(wbp, sizeof *wbp);
 273                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 274
 275                 vnode_lock(vp);
 276
 277                 if (ubc->cl_wbehind == NULL)
 278                         ubc->cl_wbehind = wbp;
 279                 else {
 280                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 281                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 282                                 wbp = ubc->cl_wbehind;
 283                 }
 284                 vnode_unlock(vp);
 285         }
 286         if (flags & CLW_RETURNLOCKED)
 287                 lck_mtx_lock(&wbp->cl_lockw);
 288
 289         return (wbp);
 290 }
 291
 292
 293 static int
 294 cluster_hard_throttle_on(vnode_t vp)
 295 {
 296         static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
 297
 298         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 299                 struct timeval elapsed;
 300
 301                 if (hard_throttle_on_root)
 302                         return(1);
 303
 304                 microuptime(&elapsed);
 305                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 306
 307                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 308                         return(1);
 309         }
 310         return(0);
 311 }
 312
 313
 314 static int
 315 cluster_iodone(buf_t bp, __unused void *dummy)
 316 {
 317         int     b_flags;
 318         int     error;
 319         int     total_size;
 320         int     total_resid;
 321         int     upl_offset;
 322         int     zero_offset;
 323         upl_t   upl;
 324         buf_t   cbp;
 325         buf_t   cbp_head;
 326         buf_t   cbp_next;
 327         buf_t   real_bp;
 328         struct  clios *iostate;
 329         int     commit_size;
 330         int     pg_offset;
 331
 332         cbp_head = (buf_t)(bp->b_trans_head);
 333
 334         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 335                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 336
 337         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 338                 /*
 339                  * all I/O requests that are part of this transaction
 340                  * have to complete before we can process it
 341                  */
 342                 if ( !(cbp->b_flags & B_DONE)) {
 343
 344                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 345                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
 346
 347                         return 0;
 348                 }
 349         }
 350         error       = 0;
 351         total_size  = 0;
 352         total_resid = 0;
 353
 354         cbp        = cbp_head;
 355         upl_offset = cbp->b_uploffset;
 356         upl        = cbp->b_upl;
 357         b_flags    = cbp->b_flags;
 358         real_bp    = cbp->b_real_bp;
 359         zero_offset= cbp->b_validend;
 360         iostate    = (struct clios *)cbp->b_iostate;
 361
 362         if (real_bp)
 363                 real_bp->b_dev = cbp->b_dev;
 364
 365         while (cbp) {
 366                 if ((cbp->b_flags & B_ERROR) && error == 0)
 367                         error = cbp->b_error;
 368
 369                 total_resid += cbp->b_resid;
 370                 total_size  += cbp->b_bcount;
 371
 372                 cbp_next = cbp->b_trans_next;
 373
 374                 free_io_buf(cbp);
 375
 376                 cbp = cbp_next;
 377         }
 378         if (zero_offset)
 379                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 380
 381         if (iostate) {
 382                 int need_wakeup = 0;
 383
 384                 /*
 385                  * someone has issued multiple I/Os asynchrounsly
 386                  * and is waiting for them to complete (streaming)
 387                  */
 388                 lck_mtx_lock(cl_mtxp);
 389
 390                 if (error && iostate->io_error == 0)
 391                         iostate->io_error = error;
 392
 393                 iostate->io_completed += total_size;
 394
 395                 if (iostate->io_wanted) {
 396                         /*
 397                          * someone is waiting for the state of
 398                          * this io stream to change
 399                          */
 400                         iostate->io_wanted = 0;
 401                         need_wakeup = 1;
 402                 }
 403                 lck_mtx_unlock(cl_mtxp);
 404
 405                 if (need_wakeup)
 406                         wakeup((caddr_t)&iostate->io_wanted);
 407         }
 408         if ((b_flags & B_NEED_IODONE) && real_bp) {
 409                 if (error) {
 410                         real_bp->b_flags |= B_ERROR;
 411                         real_bp->b_error = error;
 412                 }
 413                 real_bp->b_resid = total_resid;
 414
 415                 buf_biodone(real_bp);
 416         }
 417         if (error == 0 && total_resid)
 418                 error = EIO;
 419
 420         if (b_flags & B_COMMIT_UPL) {
 421                 pg_offset   = upl_offset & PAGE_MASK;
 422                 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 423
 424                 if (error || (b_flags & B_NOCACHE)) {
 425                         int upl_abort_code;
 426                         int page_in  = 0;
 427                         int page_out = 0;
 428
 429                         if (b_flags & B_PAGEIO) {
 430                                 if (b_flags & B_READ)
 431                                         page_in  = 1;
 432                                 else
 433                                         page_out = 1;
 434                         }
 435                         if (b_flags & B_CACHE)          /* leave pages in the cache unchanged on error */
 436                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 437                         else if (page_out && (error != ENXIO)) /* transient error */
 438                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 439                         else if (page_in)
 440                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 441                         else
 442                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 443
 444                         ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
 445                                                     upl_abort_code);
 446
 447                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 448                                      (int)upl, upl_offset - pg_offset, commit_size,
 449                                      0x80000000|upl_abort_code, 0);
 450
 451                 } else {
 452                         int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 453
 454                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
 455                                 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
 456
 457                         if (b_flags & B_AGE)
 458                                 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 459
 460                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
 461                                         upl_commit_flags);
 462
 463                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 464                                      (int)upl, upl_offset - pg_offset, commit_size,
 465                                      upl_commit_flags, 0);
 466                 }
 467         } else {
 468                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 469                              (int)upl, upl_offset, 0, error, 0);
 470         }
 471
 472         return (error);
 473 }
 474
 475
 476 void
 477 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
 478 {
 479         upl_page_info_t *pl;
 480
 481         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 482                      upl_offset, size, (int)bp, 0, 0);
 483
 484         if (bp == NULL || bp->b_datap == 0) {
 485
 486                 pl = ubc_upl_pageinfo(upl);
 487
 488                 while (size) {
 489                         int           page_offset;
 490                         int           page_index;
 491                         addr64_t      zero_addr;
 492                         int           zero_cnt;
 493
 494                         page_index  = upl_offset / PAGE_SIZE;
 495                         page_offset = upl_offset & PAGE_MASK;
 496
 497                         zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
 498                         zero_cnt  = min(PAGE_SIZE - page_offset, size);
 499
 500                         bzero_phys(zero_addr, zero_cnt);
 501
 502                         size       -= zero_cnt;
 503                         upl_offset += zero_cnt;
 504                 }
 505         } else
 506                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 507
 508         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 509                      upl_offset, size, 0, 0, 0);
 510 }
 511
 512
 513 static int
 514 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 515            int flags, buf_t real_bp, struct clios *iostate)
 516 {
 517         buf_t   cbp;
 518         u_int   size;
 519         u_int   io_size;
 520         int     io_flags;
 521         int     bmap_flags;
 522         int     error = 0;
 523         int     retval = 0;
 524         buf_t   cbp_head = NULL;
 525         buf_t   cbp_tail = NULL;
 526         int     trans_count = 0;
 527         u_int   pg_count;
 528         int     pg_offset;
 529         u_int   max_iosize;
 530         u_int   max_vectors;
 531         int     priv;
 532         int     zero_offset = 0;
 533         int     async_throttle = 0;
 534         mount_t mp;
 535
 536         mp = vp->v_mount;
 537
 538         if (mp->mnt_devblocksize > 1) {
 539                 /*
 540                  * round the requested size up so that this I/O ends on a
 541                  * page boundary in case this is a 'write'... if the filesystem
 542                  * has blocks allocated to back the page beyond the EOF, we want to
 543                  * make sure to write out the zero's that are sitting beyond the EOF
 544                  * so that in case the filesystem doesn't explicitly zero this area
 545                  * if a hole is created via a lseek/write beyond the current EOF,
 546                  * it will return zeros when it's read back from the disk.  If the
 547                  * physical allocation doesn't extend for the whole page, we'll
 548                  * only write/read from the disk up to the end of this allocation
 549                  * via the extent info returned from the VNOP_BLOCKMAP call.
 550                  */
 551                 pg_offset = upl_offset & PAGE_MASK;
 552
 553                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
 554         } else {
 555                 /*
 556                  * anyone advertising a blocksize of 1 byte probably
 557                  * can't deal with us rounding up the request size
 558                  * AFP is one such filesystem/device
 559                  */
 560                 size = non_rounded_size;
 561         }
 562         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
 563                      (int)f_offset, size, upl_offset, flags, 0);
 564
 565         if (flags & CL_READ) {
 566                 io_flags = (B_READ);
 567                 bmap_flags = VNODE_READ;
 568
 569                 max_iosize  = mp->mnt_maxreadcnt;
 570                 max_vectors = mp->mnt_segreadcnt;
 571         } else {
 572                 io_flags = 0;
 573                 bmap_flags = VNODE_WRITE;
 574
 575                 max_iosize  = mp->mnt_maxwritecnt;
 576                 max_vectors = mp->mnt_segwritecnt;
 577         }
 578         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
 579
 580         /*
 581          * make sure the maximum iosize is a
 582          * multiple of the page size
 583          */
 584         max_iosize  &= ~PAGE_MASK;
 585
 586         if (flags & CL_THROTTLE) {
 587                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 588                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 589                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 590                         async_throttle = HARD_THROTTLE_MAXCNT;
 591                 } else
 592                         async_throttle = VNODE_ASYNC_THROTTLE;
 593         }
 594         if (flags & CL_AGE)
 595                 io_flags |= B_AGE;
 596         if (flags & CL_DUMP)
 597                 io_flags |= B_NOCACHE;
 598         if (flags & (CL_PAGEIN | CL_PAGEOUT))
 599                 io_flags |= B_PAGEIO;
 600         if (flags & CL_COMMIT)
 601                 io_flags |= B_COMMIT_UPL;
 602         if (flags & CL_PRESERVE)
 603                 io_flags |= B_PHYS;
 604         if (flags & CL_KEEPCACHED)
 605                 io_flags |= B_CACHE;
 606
 607         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 608                 /*
 609                  * then we are going to end up
 610                  * with a page that we can't complete (the file size wasn't a multiple
 611                  * of PAGE_SIZE and we're trying to read to the end of the file
 612                  * so we'll go ahead and zero out the portion of the page we can't
 613                  * read in from the file
 614                  */
 615                 zero_offset = upl_offset + non_rounded_size;
 616         }
 617         while (size) {
 618                 int     pg_resid;
 619                 daddr64_t blkno;
 620                 daddr64_t lblkno;
 621
 622                 if (size > max_iosize)
 623                         io_size = max_iosize;
 624                 else
 625                         io_size = size;
 626
 627                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
 628                         break;
 629                 }
 630                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
 631                         real_bp->b_blkno = blkno;
 632
 633                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 634                              (int)f_offset, (int)blkno, io_size, zero_offset, 0);
 635
 636                 if (io_size == 0) {
 637                         /*
 638                          * vnop_blockmap didn't return an error... however, it did
 639                          * return an extent size of 0 which means we can't
 640                          * make forward progress on this I/O... a hole in the
 641                          * file would be returned as a blkno of -1 with a non-zero io_size
 642                          * a real extent is returned with a blkno != -1 and a non-zero io_size
 643                          */
 644                         error = EINVAL;
 645                         break;
 646                 }
 647                 if ( !(flags & CL_READ) && blkno == -1) {
 648                         off_t e_offset;
 649
 650                         /*
 651                          * we're writing into a 'hole'
 652                          */
 653                         if (flags & CL_PAGEOUT) {
 654                                 /*
 655                                  * if we got here via cluster_pageout
 656                                  * then just error the request and return
 657                                  * the 'hole' should already have been covered
 658                                  */
 659                                 error = EINVAL;
 660                                 break;
 661                         }
 662                         if ( !(flags & CL_COMMIT)) {
 663                                 /*
 664                                  * currently writes always request the commit to happen
 665                                  * as part of the io completion... however, if the CL_COMMIT
 666                                  * flag isn't specified, than we can't issue the abort_range
 667                                  * since the call site is going to abort or commit the same upl..
 668                                  * in this case we can only return an error
 669                                  */
 670                                 error = EINVAL;
 671                                 break;
 672                         }
 673                         /*
 674                          * we can get here if the cluster code happens to
 675                          * pick up a page that was dirtied via mmap vs
 676                          * a 'write' and the page targets a 'hole'...
 677                          * i.e. the writes to the cluster were sparse
 678                          * and the file was being written for the first time
 679                          *
 680                          * we can also get here if the filesystem supports
 681                          * 'holes' that are less than PAGE_SIZE.... because
 682                          * we can't know if the range in the page that covers
 683                          * the 'hole' has been dirtied via an mmap or not,
 684                          * we have to assume the worst and try to push the
 685                          * entire page to storage.
 686                          *
 687                          * Try paging out the page individually before
 688                          * giving up entirely and dumping it (the pageout
 689                          * path will insure that the zero extent accounting
 690                          * has been taken care of before we get back into cluster_io)
 691                          */
 692                         ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
 693
 694                         e_offset = round_page_64(f_offset + 1);
 695
 696                         if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
 697                                 error = EINVAL;
 698                                 break;
 699                         }
 700                         io_size = e_offset - f_offset;
 701
 702                         f_offset   += io_size;
 703                         upl_offset += io_size;
 704
 705                         if (size >= io_size)
 706                                 size -= io_size;
 707                         else
 708                                 size = 0;
 709                         /*
 710                          * keep track of how much of the original request
 711                          * that we've actually completed... non_rounded_size
 712                          * may go negative due to us rounding the request
 713                          * to a page size multiple (i.e.  size > non_rounded_size)
 714                          */
 715                         non_rounded_size -= io_size;
 716
 717                         if (non_rounded_size <= 0) {
 718                                 /*
 719                                  * we've transferred all of the data in the original
 720                                  * request, but we were unable to complete the tail
 721                                  * of the last page because the file didn't have
 722                                  * an allocation to back that portion... this is ok.
 723                                  */
 724                                 size = 0;
 725                         }
 726                         continue;
 727                 }
 728                 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
 729                 /*
 730                  * we have now figured out how much I/O we can do - this is in 'io_size'
 731                  * pg_offset is the starting point in the first page for the I/O
 732                  * pg_count is the number of full and partial pages that 'io_size' encompasses
 733                  */
 734                 pg_offset = upl_offset & PAGE_MASK;
 735
 736                 if (flags & CL_DEV_MEMORY) {
 737                         /*
 738                          * currently, can't deal with reading 'holes' in file
 739                          */
 740                         if (blkno == -1) {
 741                                 error = EINVAL;
 742                                 break;
 743                         }
 744                         /*
 745                          * treat physical requests as one 'giant' page
 746                          */
 747                         pg_count = 1;
 748                 } else
 749                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
 750
 751                 if ((flags & CL_READ) && blkno == -1) {
 752                         int bytes_to_zero;
 753
 754                         /*
 755                          * if we're reading and blkno == -1, then we've got a
 756                          * 'hole' in the file that we need to deal with by zeroing
 757                          * out the affected area in the upl
 758                          */
 759                         if (zero_offset && io_size == size) {
 760                                 /*
 761                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 762                                  * than 'zero_offset' will be non-zero
 763                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
 764                                  * (indicated by the io_size finishing off the I/O request for this UPL)
 765                                  * than we're not going to issue an I/O for the
 766                                  * last page in this upl... we need to zero both the hole and the tail
 767                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
 768                                  */
 769                                 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
 770
 771                                 zero_offset = 0;
 772                         } else
 773                                 bytes_to_zero = io_size;
 774
 775                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
 776
 777                         if (cbp_head)
 778                                 /*
 779                                  * if there is a current I/O chain pending
 780                                  * then the first page of the group we just zero'd
 781                                  * will be handled by the I/O completion if the zero
 782                                  * fill started in the middle of the page
 783                                  */
 784                                 pg_count = (io_size - pg_offset) / PAGE_SIZE;
 785                         else {
 786                                 /*
 787                                  * no pending I/O to pick up that first page
 788                                  * so, we have to make sure it gets committed
 789                                  * here.
 790                                  * set the pg_offset to 0 so that the upl_commit_range
 791                                  * starts with this page
 792                                  */
 793                                 pg_count = (io_size + pg_offset) / PAGE_SIZE;
 794                                 pg_offset = 0;
 795                         }
 796                         if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
 797                                 /*
 798                                  * if we're done with the request for this UPL
 799                                  * then we have to make sure to commit the last page
 800                                  * even if we only partially zero-filled it
 801                                  */
 802                                 pg_count++;
 803
 804                         if (pg_count) {
 805                                 if (pg_offset)
 806                                         pg_resid = PAGE_SIZE - pg_offset;
 807                                 else
 808                                         pg_resid = 0;
 809
 810                                 if (flags & CL_COMMIT)
 811                                         ubc_upl_commit_range(upl,
 812                                                         (upl_offset + pg_resid) & ~PAGE_MASK,
 813                                                         pg_count * PAGE_SIZE,
 814                                                         UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
 815                         }
 816                         upl_offset += io_size;
 817                         f_offset   += io_size;
 818                         size       -= io_size;
 819                         /*
 820                          * keep track of how much of the original request
 821                          * that we've actually completed... non_rounded_size
 822                          * may go negative due to us rounding the request
 823                          * to a page size multiple (i.e.  size > non_rounded_size)
 824                          */
 825                         non_rounded_size -= io_size;
 826
 827                         if (non_rounded_size <= 0) {
 828                                 /*
 829                                  * we've transferred all of the data in the original
 830                                  * request, but we were unable to complete the tail
 831                                  * of the last page because the file didn't have
 832                                  * an allocation to back that portion... this is ok.
 833                                  */
 834                                 size = 0;
 835                         }
 836                         if (cbp_head && pg_count)
 837                                 goto start_io;
 838                         continue;
 839
 840                 }
 841                 if (pg_count > max_vectors) {
 842                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
 843                                 io_size = PAGE_SIZE - pg_offset;
 844                                 pg_count = 1;
 845                         } else {
 846                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
 847                                 pg_count = max_vectors;
 848                         }
 849                 }
 850
 851                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
 852                         /*
 853                          * if we're not targeting a virtual device i.e. a disk image
 854                          * it's safe to dip into the reserve pool since real devices
 855                          * can complete this I/O request without requiring additional
 856                          * bufs from the alloc_io_buf pool
 857                          */
 858                         priv = 1;
 859                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
 860                         /*
 861                          * Throttle the speculative IO
 862                          */
 863                         priv = 0;
 864                 else
 865                         priv = 1;
 866
 867                 cbp = alloc_io_buf(vp, priv);
 868
 869                 if (flags & CL_PAGEOUT) {
 870                         u_int i;
 871
 872                         for (i = 0; i < pg_count; i++) {
 873                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
 874                                         panic("BUSY bp found in cluster_io");
 875                         }
 876                 }
 877                 if (flags & CL_ASYNC) {
 878                         if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
 879                                 panic("buf_setcallback failed\n");
 880                 }
 881                 cbp->b_flags |= io_flags;
 882
 883                 cbp->b_lblkno = lblkno;
 884                 cbp->b_blkno  = blkno;
 885                 cbp->b_bcount = io_size;
 886
 887                 if (buf_setupl(cbp, upl, upl_offset))
 888                         panic("buf_setupl failed\n");
 889
 890                 cbp->b_trans_next = (buf_t)NULL;
 891
 892                 if ((cbp->b_iostate = (void *)iostate))
 893                         /*
 894                          * caller wants to track the state of this
 895                          * io... bump the amount issued against this stream
 896                          */
 897                         iostate->io_issued += io_size;
 898
 899                 if (flags & CL_READ) {
 900                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
 901                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 902                 }
 903                 else {
 904                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
 905                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 906                 }
 907
 908                 if (cbp_head) {
 909                         cbp_tail->b_trans_next = cbp;
 910                         cbp_tail = cbp;
 911                 } else {
 912                         cbp_head = cbp;
 913                         cbp_tail = cbp;
 914                 }
 915                 (buf_t)(cbp->b_trans_head) = cbp_head;
 916                 trans_count++;
 917
 918                 upl_offset += io_size;
 919                 f_offset   += io_size;
 920                 size       -= io_size;
 921                 /*
 922                  * keep track of how much of the original request
 923                  * that we've actually completed... non_rounded_size
 924                  * may go negative due to us rounding the request
 925                  * to a page size multiple (i.e.  size > non_rounded_size)
 926                  */
 927                 non_rounded_size -= io_size;
 928
 929                 if (non_rounded_size <= 0) {
 930                         /*
 931                          * we've transferred all of the data in the original
 932                          * request, but we were unable to complete the tail
 933                          * of the last page because the file didn't have
 934                          * an allocation to back that portion... this is ok.
 935                          */
 936                         size = 0;
 937                 }
 938                 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
 939                         /*
 940                          * if we have no more I/O to issue or
 941                          * the current I/O we've prepared fully
 942                          * completes the last page in this request
 943                          * and it's either an ASYNC request or
 944                          * we've already accumulated more than 8 I/O's into
 945                          * this transaction and it's not an I/O directed to
 946                          * special DEVICE memory
 947                          * then go ahead and issue the I/O
 948                          */
 949 start_io:
 950                         if (real_bp) {
 951                                 cbp_head->b_flags |= B_NEED_IODONE;
 952                                 cbp_head->b_real_bp = real_bp;
 953                         } else
 954                                 cbp_head->b_real_bp = (buf_t)NULL;
 955
 956                         if (size == 0) {
 957                                 /*
 958                                  * we're about to issue the last I/O for this upl
 959                                  * if this was a read to the eof and the eof doesn't
 960                                  * finish on a page boundary, than we need to zero-fill
 961                                  * the rest of the page....
 962                                  */
 963                                 cbp_head->b_validend = zero_offset;
 964                         } else
 965                                 cbp_head->b_validend = 0;
 966
 967                         if (flags & CL_THROTTLE)
 968                                 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
 969
 970                         for (cbp = cbp_head; cbp;) {
 971                                 buf_t   cbp_next;
 972
 973                                 if ( !(io_flags & B_READ))
 974                                         vnode_startwrite(vp);
 975
 976                                 cbp_next = cbp->b_trans_next;
 977
 978                                 (void) VNOP_STRATEGY(cbp);
 979                                 cbp = cbp_next;
 980                         }
 981                         if ( !(flags & CL_ASYNC)) {
 982                                 int dummy;
 983
 984                                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 985                                         buf_biowait(cbp);
 986
 987                                 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
 988                                         if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
 989                                                 error = 0;      /* drop the error */
 990                                         else {
 991                                                 if (retval == 0)
 992                                                         retval = error;
 993                                                 error = 0;
 994                                         }
 995                                 }
 996                         }
 997                         cbp_head = (buf_t)NULL;
 998                         cbp_tail = (buf_t)NULL;
 999
1000                         trans_count = 0;
1001                 }
1002         }
1003         if (error) {
1004                 int abort_size;
1005
1006                 io_size = 0;
1007
1008                 for (cbp = cbp_head; cbp;) {
1009                         buf_t   cbp_next;
1010
1011                         upl_offset -= cbp->b_bcount;
1012                         size       += cbp->b_bcount;
1013                         io_size    += cbp->b_bcount;
1014
1015                         cbp_next = cbp->b_trans_next;
1016                         free_io_buf(cbp);
1017                         cbp = cbp_next;
1018                 }
1019                 if (iostate) {
1020                         int need_wakeup = 0;
1021
1022                         /*
1023                          * update the error condition for this stream
1024                          * since we never really issued the io
1025                          * just go ahead and adjust it back
1026                          */
1027                         lck_mtx_lock(cl_mtxp);
1028
1029                         if (iostate->io_error == 0)
1030                                 iostate->io_error = error;
1031                         iostate->io_issued -= io_size;
1032
1033                         if (iostate->io_wanted) {
1034                                 /*
1035                                  * someone is waiting for the state of
1036                                  * this io stream to change
1037                                  */
1038                                 iostate->io_wanted = 0;
1039                                 need_wakeup = 0;
1040                         }
1041                         lck_mtx_unlock(cl_mtxp);
1042
1043                         if (need_wakeup)
1044                                 wakeup((caddr_t)&iostate->io_wanted);
1045                 }
1046                 pg_offset  = upl_offset & PAGE_MASK;
1047                 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1048
1049                 if (flags & CL_COMMIT) {
1050                         int upl_abort_code;
1051
1052                         if (flags & CL_PRESERVE) {
1053                                 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1054                                                      UPL_COMMIT_FREE_ON_EMPTY);
1055                         } else {
1056                                 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1057                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1058                                 else if (flags & CL_PAGEIN)
1059                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1060                                 else
1061                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1062
1063                                 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1064                                                 upl_abort_code);
1065                         }
1066                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1067                                      (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1068                 }
1069                 if (real_bp) {
1070                         real_bp->b_flags |= B_ERROR;
1071                         real_bp->b_error  = error;
1072
1073                         buf_biodone(real_bp);
1074                 }
1075                 if (retval == 0)
1076                         retval = error;
1077         }
1078         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1079                      (int)f_offset, size, upl_offset, retval, 0);
1080
1081         return (retval);
1082 }
1083
1084
1085 static int
1086 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1087 {
1088         int           pages_in_prefetch;
1089
1090         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1091                      (int)f_offset, size, (int)filesize, 0, 0);
1092
1093         if (f_offset >= filesize) {
1094                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1095                              (int)f_offset, 0, 0, 0, 0);
1096                 return(0);
1097         }
1098         if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1099                 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1100         else
1101                 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1102
1103         if ((off_t)size > (filesize - f_offset))
1104                 size = filesize - f_offset;
1105         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1106
1107         advisory_read(vp, filesize, f_offset, size);
1108
1109         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1110                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1111
1112         return (pages_in_prefetch);
1113 }
1114
1115
1116
1117 static void
1118 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1119 {
1120         daddr64_t       r_addr;
1121         off_t           f_offset;
1122         int             size_of_prefetch;
1123
1124
1125         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1126                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1127
1128         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1129                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1130                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1131                 return;
1132         }
1133         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1134                                    (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1135                 rap->cl_ralen = 0;
1136                 rap->cl_maxra = 0;
1137
1138                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1139                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1140
1141                 return;
1142         }
1143         if (extent->e_addr < rap->cl_maxra) {
1144                 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1145
1146                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1147                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1148                         return;
1149                 }
1150         }
1151         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1152         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1153
1154         size_of_prefetch = 0;
1155
1156         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1157
1158         if (size_of_prefetch) {
1159                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1160                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1161                 return;
1162         }
1163         if (f_offset < filesize) {
1164                 daddr64_t read_size;
1165
1166                 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1167
1168                 read_size = (extent->e_addr + 1) - extent->b_addr;
1169
1170                 if (read_size > rap->cl_ralen) {
1171                         if (read_size > MAX_UPL_TRANSFER)
1172                                 rap->cl_ralen = MAX_UPL_TRANSFER;
1173                         else
1174                                 rap->cl_ralen = read_size;
1175                 }
1176                 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1177
1178                 if (size_of_prefetch)
1179                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1180         }
1181         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1182                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1183 }
1184
1185 int
1186 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1187                 int size, off_t filesize, int flags)
1188 {
1189         int           io_size;
1190         int           rounded_size;
1191         off_t         max_size;
1192         int           local_flags;
1193         struct cl_writebehind *wbp;
1194
1195         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1196                 /*
1197                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1198                  * then we don't want to enforce this throttle... if we do, we can
1199                  * potentially deadlock since we're stalling the pageout thread at a time
1200                  * when the disk image might need additional memory (which won't be available
1201                  * if the pageout thread can't run)... instead we'll just depend on the throttle
1202                  * that the pageout thread now has in place to deal with external files
1203                  */
1204                 local_flags = CL_PAGEOUT;
1205         else
1206                 local_flags = CL_PAGEOUT | CL_THROTTLE;
1207
1208         if ((flags & UPL_IOSYNC) == 0)
1209                 local_flags |= CL_ASYNC;
1210         if ((flags & UPL_NOCOMMIT) == 0)
1211                 local_flags |= CL_COMMIT;
1212         if ((flags & UPL_KEEPCACHED))
1213                 local_flags |= CL_KEEPCACHED;
1214
1215
1216         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1217                      (int)f_offset, size, (int)filesize, local_flags, 0);
1218
1219         /*
1220          * If they didn't specify any I/O, then we are done...
1221          * we can't issue an abort because we don't know how
1222          * big the upl really is
1223          */
1224         if (size <= 0)
1225                 return (EINVAL);
1226
1227         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1228                 if (local_flags & CL_COMMIT)
1229                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1230                 return (EROFS);
1231         }
1232         /*
1233          * can't page-in from a negative offset
1234          * or if we're starting beyond the EOF
1235          * or if the file offset isn't page aligned
1236          * or the size requested isn't a multiple of PAGE_SIZE
1237          */
1238         if (f_offset < 0 || f_offset >= filesize ||
1239            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1240                 if (local_flags & CL_COMMIT)
1241                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1242                 return (EINVAL);
1243         }
1244         max_size = filesize - f_offset;
1245
1246         if (size < max_size)
1247                 io_size = size;
1248         else
1249                 io_size = max_size;
1250
1251         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1252
1253         if (size > rounded_size) {
1254                 if (local_flags & CL_COMMIT)
1255                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1256                                         UPL_ABORT_FREE_ON_EMPTY);
1257         }
1258         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1259                 wbp->cl_hasbeenpaged = 1;
1260
1261         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1262                            local_flags, (buf_t)NULL, (struct clios *)NULL));
1263 }
1264
1265 int
1266 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1267                int size, off_t filesize, int flags)
1268 {
1269         u_int         io_size;
1270         int           rounded_size;
1271         off_t         max_size;
1272         int           retval;
1273         int           local_flags = 0;
1274
1275         if (upl == NULL || size < 0)
1276                 panic("cluster_pagein: NULL upl passed in");
1277
1278         if ((flags & UPL_IOSYNC) == 0)
1279                 local_flags |= CL_ASYNC;
1280         if ((flags & UPL_NOCOMMIT) == 0)
1281                 local_flags |= CL_COMMIT;
1282
1283
1284         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1285                      (int)f_offset, size, (int)filesize, local_flags, 0);
1286
1287         /*
1288          * can't page-in from a negative offset
1289          * or if we're starting beyond the EOF
1290          * or if the file offset isn't page aligned
1291          * or the size requested isn't a multiple of PAGE_SIZE
1292          */
1293         if (f_offset < 0 || f_offset >= filesize ||
1294            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1295                 if (local_flags & CL_COMMIT)
1296                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1297                 return (EINVAL);
1298         }
1299         max_size = filesize - f_offset;
1300
1301         if (size < max_size)
1302                 io_size = size;
1303         else
1304                 io_size = max_size;
1305
1306         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1307
1308         if (size > rounded_size && (local_flags & CL_COMMIT))
1309                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1310                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1311
1312         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1313                            local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1314
1315         if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1316                 struct cl_readahead *rap;
1317
1318                 rap = cluster_get_rap(vp);
1319
1320                 if (rap != NULL) {
1321                         struct cl_extent extent;
1322
1323                         extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1324                         extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1325
1326                         if (rounded_size == PAGE_SIZE) {
1327                                 /*
1328                                  * we haven't read the last page in of the file yet
1329                                  * so let's try to read ahead if we're in
1330                                  * a sequential access pattern
1331                                  */
1332                                 cluster_rd_ahead(vp, &extent, filesize, rap);
1333                         }
1334                         rap->cl_lastr = extent.e_addr;
1335
1336                         lck_mtx_unlock(&rap->cl_lockr);
1337                 }
1338         }
1339         return (retval);
1340 }
1341
1342 int
1343 cluster_bp(buf_t bp)
1344 {
1345         off_t  f_offset;
1346         int    flags;
1347
1348         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1349                      (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1350
1351         if (bp->b_flags & B_READ)
1352                 flags = CL_ASYNC | CL_READ;
1353         else
1354                 flags = CL_ASYNC;
1355
1356         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1357
1358         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1359 }
1360
/*
 * cluster_write
 *
 * Top-level write entry point.  Decides, for the current uio vector, whether
 * the data goes through the cache (cluster_write_x), directly from the
 * caller's buffer (cluster_nocopy_write), or through the physically
 * contiguous path (cluster_phys_write).
 *
 *   vp                vnode being written
 *   uio               source description; may be NULL, in which case the whole
 *                     request is handed to cluster_write_x
 *   oldEOF, newEOF    passed through to cluster_write_x; newEOF also bounds
 *                     the loop below (uio->uio_offset < newEOF) and is passed
 *                     to the direct/physical helpers
 *   headOff, tailOff  passed through to cluster_write_x; they are used with
 *                     the IO_HEADZEROFILL / IO_TAILZEROFILL bits of xflags
 *   xflags            IO_* flags from the caller; IO_NOCACHE is OR'ed in when
 *                     the vnode has VNOCACHE_DATA set
 *
 * Returns 0 or an errno value: EFAULT when vm_map_get_upl() fails on the
 * current vector's base address, otherwise whatever the helper that handled
 * the request returned.
 */
1361 int
1362 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1363 {
1364         int           prev_resid;
1365         u_int         clip_size;
1366         off_t         max_io_size;
1367         int           upl_size;
1368         int           upl_flags;
1369         upl_t         upl;
1370         int           retval = 0;
1371         int           flags;
1372
1373         flags = xflags;
1374
1375         if (vp->v_flag & VNOCACHE_DATA)
1376                 flags |= IO_NOCACHE;
1377
1378         if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1379                 /*
1380                  * go do a write through the cache if one of the following is true....
1381                  *   NOCACHE is not true
1382                  *   there is no uio structure or it doesn't target USERSPACE
1383                  */
1384                 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1385         }
1386
1387 #if LP64_DEBUG
1388         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1389                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1390         }
1391 #endif /* LP64_DEBUG */
1392
             /*
              * uncached path: each pass examines the current uio vector and
              * picks a strategy for it; the loop ends when the resid is
              * consumed, the offset reaches newEOF, or a helper reports an error
              */
1393         while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1394                 user_size_t     iov_len;
1395                 user_addr_t     iov_base;
1396
1397                 /*
1398                  * we know we have a resid, so this is safe
1399                  * skip over any empty vectors
1400                  */
1401                 uio_update(uio, (user_size_t)0);
1402
1403                 iov_len  = uio_curriovlen(uio);
1404                 iov_base = uio_curriovbase(uio);
1405
                     /*
                      * query the mapping of the page containing iov_base; only the
                      * upl_flags that come back are examined (UPL_PHYS_CONTIG below),
                      * 'upl' itself is not referenced again in this function
                      * NOTE(review): presumably UPL_QUERY_OBJECT_TYPE does not create
                      * a UPL that would need releasing -- confirm
                      */
1406                 upl_size  = PAGE_SIZE;
1407                 upl_flags = UPL_QUERY_OBJECT_TYPE;
1408
1409                 // LP64todo - fix this!
1410                 if ((vm_map_get_upl(current_map(),
1411                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1412                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1413                         /*
1414                          * the user app must have passed in an invalid address
1415                          */
1416                         return (EFAULT);
1417                 }
1418
1419                 /*
1420                  * We check every vector target but if it is physically
1421                  * contiguous space, we skip the sanity checks.
1422                  */
1423                 if (upl_flags & UPL_PHYS_CONTIG) {
1424                         int zflags;
1425
                             /*
                              * zflags is used only for the two zero-fill calls below,
                              * both of which pass a NULL uio: tail zero-fill is masked
                              * off and head zero-fill is forced on
                              */
1426                         zflags = flags & ~IO_TAILZEROFILL;
1427                         zflags |= IO_HEADZEROFILL;
1428
1429                         if (flags & IO_HEADZEROFILL) {
1430                                 /*
1431                                  * in case we have additional vectors, we don't want to do this again
1432                                  */
1433                                 flags &= ~IO_HEADZEROFILL;
1434
1435                                 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1436                                         return(retval);
1437                         }
1438                         retval = cluster_phys_write(vp, uio, newEOF);
1439
                             /*
                              * once the last vector has been consumed, zero-fill the tail:
                              * with a NULL uio and IO_HEADZEROFILL set, cluster_write_x
                              * zero-fills from its headOff argument up to its newEOF
                              * argument, so the current offset is passed as headOff and
                              * tailOff as newEOF
                              */
1440                         if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1441                                 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1442                         }
1443                 }
1444                 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1445                         /*
1446                          * we're here because we don't have a physically contiguous target buffer
1447                          * go do a write through the cache if one of the following is true....
1448                          *   the total xfer size is less than a page...
1449                          *   we're being asked to ZEROFILL either the head or the tail of the I/O...
1450                          */
1451                         return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1452                 }
1453                 // LP64todo - fix this!
1454                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1455                         if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1456                                 /*
1457                                  * Bring the file offset right up to a pagesize boundary
1458                                  * this will also bring the base address to a page boundary
1459                                  * since they both are currently on the same offset within a page
1460                                  * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1461                                  * so the computed clip_size must always be less than the current uio_resid
1462                                  */
1463                                 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1464
1465                                 /*
1466                                  * Fake the resid going into the cluster_write_x call
1467                                  * and restore it on the way out.
1468                                  */
1469                                 // LP64todo - fix this
1470                                 prev_resid = uio_resid(uio);
1471                                 uio_setresid(uio, clip_size);
1472
1473                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1474
                                     /*
                                      * (clip_size - uio_resid(uio)) is how much of the clipped
                                      * resid the call used up; the real resid is the saved value
                                      * less that amount
                                      */
1475                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1476                         } else {
1477                                 /*
1478                                  * can't get both the file offset and the buffer offset aligned to a page boundary
1479                                  * so fire an I/O through the cache for this entire vector
1480                                  */
1481                                 // LP64todo - fix this
1482                                 clip_size = iov_len;
1483                                 // LP64todo - fix this
1484                                 prev_resid = uio_resid(uio);
1485                                 uio_setresid(uio, clip_size);
1486
1487                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1488
1489                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1490                         }
1491                 } else {
1492                         /*
1493                          * If we come in here, we know the offset into
1494                          * the file is on a pagesize boundary and the
1495                          * target buffer address is also on a page boundary
1496                          */
                             /*
                              * clip_size = min(resid, length of this vector, bytes left
                              * before newEOF)
                              */
1497                         max_io_size = newEOF - uio->uio_offset;
1498                         // LP64todo - fix this
1499                         clip_size = uio_resid(uio);
1500                         if (iov_len < clip_size)
1501                                 // LP64todo - fix this!
1502                                 clip_size = iov_len;
1503                         if (max_io_size < clip_size)
1504                                 clip_size = max_io_size;
1505
1506                         if (clip_size < PAGE_SIZE) {
1507                                 /*
1508                                  * Take care of tail end of write in this vector
1509                                  */
1510                                 // LP64todo - fix this
1511                                 prev_resid = uio_resid(uio);
1512                                 uio_setresid(uio, clip_size);
1513
1514                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1515
1516                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1517                         } else {
1518                                 /* round clip_size down to a multiple of pagesize */
1519                                 clip_size = clip_size & ~(PAGE_MASK);
1520                                 // LP64todo - fix this
1521                                 prev_resid = uio_resid(uio);
1522                                 uio_setresid(uio, clip_size);
1523
1524                                 retval = cluster_nocopy_write(vp, uio, newEOF);
1525
                                     /*
                                      * cluster_nocopy_write can return 0 without consuming the
                                      * whole clipped resid (its early-exit paths jump to
                                      * wait_for_writes without setting an error); send whatever
                                      * is left of the clipped range through the cache instead
                                      */
1526                                 if ((retval == 0) && uio_resid(uio))
1527                                         retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1528
1529                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1530                         }
1531                 } /* end else */
1532         } /* end while */
1533
1534         return(retval);
1535 }
1536
1537
/*
 * cluster_nocopy_write
 *
 * Write directly from the caller's buffer, bypassing the cache.  For each
 * chunk of at most MAX_UPL_TRANSFER pages this routine builds a UPL over the
 * current uio vector with vm_map_get_upl(), dumps any pages cached for the
 * same file range (ubc_range_op with UPL_ROP_DUMP) and issues the chunk
 * asynchronously through cluster_io().  Every exit path goes through
 * wait_for_writes, so all I/O issued here has completed before returning.
 *
 *   vp       vnode being written
 *   uio      source description; advanced by io_size after each cluster_io()
 *   newEOF   upper bound for the loop (uio->uio_offset < newEOF); also passed
 *            to cluster_try_push()
 *
 * Returns 0 or an errno value: the result of the last cluster_io() call,
 * replaced by iostate.io_error when that is non-zero.  The early-exit paths
 * (UPL creation failed, pages never all valid after three attempts, zero
 * usable io_size) do not set an error, so 0 may be returned with
 * uio_resid(uio) still non-zero.
 */
1538 static int
1539 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1540 {
1541         upl_t            upl;
1542         upl_page_info_t  *pl;
1543         vm_offset_t      upl_offset;
1544         int              io_size;
1545         int              io_flag;
1546         int              upl_size;
1547         int              upl_needed_size;
1548         int              pages_in_pl;
1549         int              upl_flags;
1550         kern_return_t    kret;
1551         int              i;
1552         int              force_data_sync;
1553         int              error  = 0;
1554         struct clios     iostate;
1555         struct cl_writebehind *wbp;
1556
1557
1558         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1559                      (int)uio->uio_offset, (int)uio_resid(uio),
1560                      (int)newEOF, 0, 0);
1561
1562         /*
1563          * When we enter this routine, we know
1564          *  -- the offset into the file is on a pagesize boundary
1565          *  -- the resid is a page multiple
1566          *  -- the resid will not exceed iov_len
1567          */
1568
             /*
              * if the vnode has a write-behind context, push it before doing
              * direct I/O; the context comes back locked (CLW_RETURNLOCKED)
              * and is unlocked here
              * NOTE(review): presumably this flushes delayed writes so they
              * cannot land after the direct write -- cluster_try_push is not
              * visible here, confirm
              */
1569         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1570
1571                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1572
1573                 lck_mtx_unlock(&wbp->cl_lockw);
1574         }
             /*
              * iostate is handed to every cluster_io() call below and is
              * examined here, under cl_mtxp, to throttle and to wait for
              * completion
              */
1575         iostate.io_completed = 0;
1576         iostate.io_issued = 0;
1577         iostate.io_error = 0;
1578         iostate.io_wanted = 0;
1579
1580         while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1581                 user_addr_t     iov_base;
1582
1583                 io_size = uio_resid(uio);
1584
1585                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1586                         io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1587
1588                 iov_base = uio_curriovbase(uio);
1589
1590                 // LP64todo - fix this!
1591                 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1592
                     /* round (offset within first page + io_size) up to whole pages */
1593                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1594
1595                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1596                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1597
                     /*
                      * make up to three attempts to get a UPL in which every page
                      * is valid; the attempt number is passed as the final argument
                      * of vm_map_get_upl()
                      */
1598                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1599                         pages_in_pl = 0;
1600                         upl_size = upl_needed_size;
1601                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1602                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1603
1604                         // LP64todo - fix this!
1605                         kret = vm_map_get_upl(current_map(),
1606                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1607                                               &upl_size,
1608                                               &upl,
1609                                               NULL,
1610                                               &pages_in_pl,
1611                                               &upl_flags,
1612                                               force_data_sync);
1613
1614                         if (kret != KERN_SUCCESS) {
1615                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1616                                              0, 0, 0, kret, 0);
1617                                 /*
1618                                  * cluster_nocopy_write: failed to get pagelist
1619                                  *
1620                                  * we may have already spun some portion of this request
1621                                  * off as async requests... we need to wait for the I/O
1622                                  * to complete before returning
1623                                  */
1624                                 goto wait_for_writes;
1625                         }
1626                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
                             /* page count is recomputed from the size actually returned */
1627                         pages_in_pl = upl_size / PAGE_SIZE;
1628
1629                         for (i = 0; i < pages_in_pl; i++) {
1630                                 if (!upl_valid_page(pl, i))
1631                                         break;
1632                         }
                             /* every page valid: keep this UPL and leave the retry loop */
1633                         if (i == pages_in_pl)
1634                                 break;
1635
1636                         /*
1637                          * didn't get all the pages back that we
1638                          * needed... release this upl and try again
1639                          */
1640                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1641                                             UPL_ABORT_FREE_ON_EMPTY);
1642                 }
1643                 if (force_data_sync >= 3) {
1644                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1645                                      i, pages_in_pl, upl_size, kret, 0);
1646                         /*
1647                          * for some reason, we couldn't acquire a hold on all
1648                          * the pages needed in the user's address space
1649                          *
1650                          * we may have already spun some portion of this request
1651                          * off as async requests... we need to wait for the I/O
1652                          * to complete before returning
1653                          */
1654                         goto wait_for_writes;
1655                 }
1656
1657                 /*
1658                  * Consider the possibility that upl_size wasn't satisfied.
1659                  */
                     /* shrink the I/O to the whole pages the returned UPL covers */
1660                 if (upl_size != upl_needed_size)
1661                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1662
1663                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1664                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1665
1666                 if (io_size == 0) {
1667                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1668                                             UPL_ABORT_FREE_ON_EMPTY);
1669                         /*
1670                          * we may have already spun some portion of this request
1671                          * off as async requests... we need to wait for the I/O
1672                          * to complete before returning
1673                          */
1674                         goto wait_for_writes;
1675                 }
1676                 /*
1677                  * Now look for pages already in the cache
1678                  * and throw them away.
1679                  * uio->uio_offset is page aligned within the file
1680                  * io_size is a multiple of PAGE_SIZE
1681                  */
1682                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1683
1684                 /*
1685                  * we want push out these writes asynchronously so that we can overlap
1686                  * the preparation of the next I/O
1687                  * if there are already too many outstanding writes
1688                  * wait until some complete before issuing the next
1689                  */
1690                 lck_mtx_lock(cl_mtxp);
1691
                     /*
                      * NOTE(review): io_issued / io_completed are presumably byte
                      * counts maintained by cluster_io and its completion path,
                      * which is also presumably what wakes this sleep -- neither
                      * is visible here, confirm
                      */
1692                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1693                         iostate.io_wanted = 1;
1694                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1695                 }
1696                 lck_mtx_unlock(cl_mtxp);
1697
1698                 if (iostate.io_error) {
1699                         /*
1700                          * one of the earlier writes we issued ran into a hard error
1701                          * don't issue any more writes, cleanup the UPL
1702                          * that was just created but not used, then
1703                          * go wait for all writes that are part of this stream
1704                          * to complete before returning the error to the caller
1705                          */
1706                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1707                                             UPL_ABORT_FREE_ON_EMPTY);
1708
1709                         goto wait_for_writes;
1710                 }
1711                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1712
1713                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1714                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1715
1716                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1717                                    io_size, io_flag, (buf_t)NULL, &iostate);
1718
                     /*
                      * the uio is advanced by io_size whether or not cluster_io
                      * reported an error; a non-zero error ends the loop
                      */
1719                 uio_update(uio, (user_size_t)io_size);
1720
1721                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1722                              (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1723
1724         } /* end while */
1725
1726 wait_for_writes:
1727         /*
1728          * make sure all async writes issued as part of this stream
1729          * have completed before we return
1730          */
1731         lck_mtx_lock(cl_mtxp);
1732
1733         while (iostate.io_issued != iostate.io_completed) {
1734                 iostate.io_wanted = 1;
1735                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1736         }
1737         lck_mtx_unlock(cl_mtxp);
1738
             /* an error recorded in iostate takes precedence over 'error' */
1739         if (iostate.io_error)
1740                 error = iostate.io_error;
1741
             /*
              * NOTE(review): this trace reads uio->uio_resid directly while
              * every other site in this function uses uio_resid(uio) --
              * confirm the two always agree
              */
1742         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1743                      (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1744
1745         return (error);
1746 }
1747
1748
/*
 * cluster_phys_write
 *
 * Write the current uio vector, whose memory is physically contiguous,
 * to vp.  The transfer is split relative to the mount's device block size
 * (vp->v_mount->mnt_devblocksize):
 *
 *   head  while the file offset is not block aligned, or less than one block
 *         remains, pieces are handed to cluster_align_phys_io()
 *   body  the block-multiple middle part goes out in a single cluster_io()
 *         call with CL_DEV_MEMORY
 *   tail  any remaining partial block is handed to cluster_align_phys_io()
 *
 *   vp       vnode being written
 *   uio      source description; only the current vector is written
 *   newEOF   passed to cluster_try_push()
 *
 * Returns 0 or an errno value: EINVAL when vm_map_get_upl() fails, when the
 * UPL comes back smaller than requested, or when a head piece fails;
 * otherwise the result of cluster_io() or of the tail
 * cluster_align_phys_io() call.  Once the UPL exists it is released with
 * ubc_upl_abort_range() on every return path.
 */
1749 static int
1750 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1751 {
1752         upl_page_info_t *pl;
1753         addr64_t         src_paddr;
1754         upl_t            upl;
1755         vm_offset_t      upl_offset;
1756         int              tail_size;
1757         int              io_size;
1758         int              upl_size;
1759         int              upl_needed_size;
1760         int              pages_in_pl;
1761         int              upl_flags;
1762         kern_return_t    kret;
1763         int              error  = 0;
1764         user_addr_t      iov_base;
1765         int              devblocksize;
1766         struct cl_writebehind *wbp;
1767
1768         devblocksize = vp->v_mount->mnt_devblocksize;
1769         /*
1770          * When we enter this routine, we know
1771          *  -- the resid will not exceed iov_len
1772          *  -- the vector target address is physically contiguous
1773          */
             /*
              * if the vnode has a write-behind context, push it first; the
              * context comes back locked (CLW_RETURNLOCKED) and is unlocked here
              */
1774         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1775
1776                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1777
1778                 lck_mtx_unlock(&wbp->cl_lockw);
1779         }
1780 #if LP64_DEBUG
1781         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1782                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1783         }
1784 #endif /* LP64_DEBUG */
1785
1786         // LP64todo - fix this!
1787         io_size = (int)uio_curriovlen(uio);
1788         iov_base = uio_curriovbase(uio);
1789
             /*
              * the UPL starts at the page containing iov_base, so it must
              * cover the offset within that page plus the vector length
              */
1790         upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1791         upl_needed_size = upl_offset + io_size;
1792
1793         pages_in_pl = 0;
1794         upl_size = upl_needed_size;
1795         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1796                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1797
1798         // LP64todo - fix this!
1799         kret = vm_map_get_upl(current_map(),
1800                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1801                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1802
1803         if (kret != KERN_SUCCESS) {
1804                 /*
1805                  * cluster_phys_write: failed to get pagelist
1806                  * note: return kret here
1807                  */
1808               return(EINVAL);
1809         }
1810         /*
1811          * Consider the possibility that upl_size wasn't satisfied.
1812          * This is a failure in the physical memory case.
1813          */
1814         if (upl_size < upl_needed_size) {
1815                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1816                 return(EINVAL);
1817         }
1818         pl = ubc_upl_pageinfo(upl);
1819
             /*
              * physical address of the first source byte: value for page 0 of
              * the UPL shifted left by 12, plus the offset within that page
              * NOTE(review): the shift is a literal 12, i.e. this assumes
              * upl_phys_page() yields a page number and pages are 4K -- confirm
              */
1820         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1821
             /*
              * head: runs while data remains and either the file offset is not
              * on a device block boundary or less than one block is left
              */
1822         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1823                 int   head_size;
1824
                     /* bytes up to the next block boundary, capped at what remains */
1825                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1826
1827                 if (head_size > io_size)
1828                         head_size = io_size;
1829
                     /*
                      * NOTE(review): the uio is not advanced here, so
                      * cluster_align_phys_io() presumably does that itself --
                      * its body is not visible, confirm
                      */
1830                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1831
1832                 if (error) {
1833                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1834
                             /*
                              * NOTE(review): the code returned by
                              * cluster_align_phys_io() is discarded in favour of
                              * EINVAL, whereas a failure of the tail call below is
                              * returned unchanged -- confirm this is intended
                              */
1835                         return(EINVAL);
1836                 }
1837                 upl_offset += head_size;
1838                 src_paddr  += head_size;
1839                 io_size    -= head_size;
1840         }
             /* split what is left into whole device blocks and a partial tail */
1841         tail_size = io_size & (devblocksize - 1);
1842         io_size  -= tail_size;
1843
1844         if (io_size) {
1845                 /*
1846                  * issue a synchronous write to cluster_io
1847                  */
1848                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1849                                    io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1850         }
1851         if (error == 0) {
1852                 /*
1853                  * The cluster_io write completed successfully,
1854                  * update the uio structure
1855                  */
1856                 uio_update(uio, (user_size_t)io_size);
1857
1858                 src_paddr += io_size;
1859
1860                 if (tail_size)
1861                         error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1862         }
1863         /*
1864          * just release our hold on the physically contiguous
1865          * region without changing any state
1866          */
1867         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1868
1869         return (error);
1870 }
1871
1872
1873 static int
1874 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1875 {
1876         upl_page_info_t *pl;
1877         upl_t            upl;
1878         vm_offset_t      upl_offset = 0;
1879         int              upl_size;
1880         off_t            upl_f_offset;
1881         int              pages_in_upl;
1882         int              start_offset;
1883         int              xfer_resid;
1884         int              io_size;
1885         int              io_offset;
1886         int              bytes_to_zero;
1887         int              bytes_to_move;
1888         kern_return_t    kret;
1889         int              retval = 0;
1890         int              io_resid;
1891         long long        total_size;
1892         long long        zero_cnt;
1893         off_t            zero_off;
1894         long long        zero_cnt1;
1895         off_t            zero_off1;
1896         struct cl_extent cl;
1897         int              intersection;
1898         struct cl_writebehind *wbp;
1899
1900         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1901         {
1902                 if (wbp->cl_hasbeenpaged) {
1903                         /*
1904                          * this vnode had pages cleaned to it by
1905                          * the pager which indicates that either
1906                          * it's not very 'hot', or the system is
1907                          * being overwhelmed by a lot of dirty
1908                          * data being delayed in the VM cache...
1909                          * in either event, we'll push our remaining
1910                          * delayed data at this point...  this will
1911                          * be more efficient than paging out 1 page at
1912                          * a time, and will also act as a throttle
1913                          * by delaying this client from writing any
1914                          * more data until all his delayed data has
1915                          * at least been queued to the uderlying driver.
1916                          */
1917                         if (wbp->cl_number || wbp->cl_scmap)
1918                                 cluster_push_EOF(vp, newEOF);
1919
1920                         wbp->cl_hasbeenpaged = 0;
1921                 }
1922         }
1923         if (uio) {
1924                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1925                              (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1926
1927                 // LP64todo - fix this
1928                 io_resid = uio_resid(uio);
1929         } else {
1930                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1931                              0, 0, (int)oldEOF, (int)newEOF, 0);
1932
1933                 io_resid = 0;
1934         }
1935         zero_cnt  = 0;
1936         zero_cnt1 = 0;
1937         zero_off  = 0;
1938         zero_off1 = 0;
1939
1940         if (flags & IO_HEADZEROFILL) {
1941                 /*
1942                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1943                  * so we zero fill the intervening space between the old EOF and the offset
1944                  * where the next chunk of real data begins.... ftruncate will also use this
1945                  * routine to zero fill to the new EOF when growing a file... in this case, the
1946                  * uio structure will not be provided
1947                  */
1948                 if (uio) {
1949                         if (headOff < uio->uio_offset) {
1950                                 zero_cnt = uio->uio_offset - headOff;
1951                                 zero_off = headOff;
1952                         }
1953                 } else if (headOff < newEOF) {
1954                         zero_cnt = newEOF - headOff;
1955                         zero_off = headOff;
1956                 }
1957         }
1958         if (flags & IO_TAILZEROFILL) {
1959                 if (uio) {
1960                         // LP64todo - fix this
1961                         zero_off1 = uio->uio_offset + uio_resid(uio);
1962
1963                         if (zero_off1 < tailOff)
1964                                 zero_cnt1 = tailOff - zero_off1;
1965                 }
1966         }
1967         if (zero_cnt == 0 && uio == (struct uio *) 0) {
1968                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1969                              retval, 0, 0, 0, 0);
1970                 return (0);
1971         }
1972
1973         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1974                 /*
1975                  * for this iteration of the loop, figure out where our starting point is
1976                  */
1977                 if (zero_cnt) {
1978                         start_offset = (int)(zero_off & PAGE_MASK_64);
1979                         upl_f_offset = zero_off - start_offset;
1980                 } else if (io_resid) {
1981                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1982                         upl_f_offset = uio->uio_offset - start_offset;
1983                 } else {
1984                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1985                         upl_f_offset = zero_off1 - start_offset;
1986                 }
1987                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1988                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1989
1990                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1991                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1992
1993                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1994
1995                 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1996                         /*
1997                          * assumption... total_size <= io_resid
1998                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1999                          */
2000                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
2001                                 total_size -= start_offset;
2002                         xfer_resid = total_size;
2003
2004                         retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2005
2006                         if (retval)
2007                                 break;
2008
2009                         io_resid   -= (total_size - xfer_resid);
2010                         total_size   = xfer_resid;
2011                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2012                         upl_f_offset = uio->uio_offset - start_offset;
2013
2014                         if (total_size == 0) {
2015                                 if (start_offset) {
2016                                         /*
2017                                          * the write did not finish on a page boundary
2018                                          * which will leave upl_f_offset pointing to the
2019                                          * beginning of the last page written instead of
2020                                          * the page beyond it... bump it in this case
2021                                          * so that the cluster code records the last page
2022                                          * written as dirty
2023                                          */
2024                                         upl_f_offset += PAGE_SIZE_64;
2025                                 }
2026                                 upl_size = 0;
2027
2028                                 goto check_cluster;
2029                         }
2030                 }
2031                 /*
2032                  * compute the size of the upl needed to encompass
2033                  * the requested write... limit each call to cluster_io
2034                  * to the maximum UPL size... cluster_io will clip if
2035                  * this exceeds the maximum io_size for the device,
2036                  * make sure to account for
2037                  * a starting offset that's not page aligned
2038                  */
2039                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2040
2041                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2042                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2043
2044                 pages_in_upl = upl_size / PAGE_SIZE;
2045                 io_size      = upl_size - start_offset;
2046
2047                 if ((long long)io_size > total_size)
2048                         io_size = total_size;
2049
2050                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2051
2052
2053                 /*
2054                  * Gather the pages from the buffer cache.
2055                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2056                  * that we intend to modify these pages.
2057                  */
2058                 kret = ubc_create_upl(vp,
2059                                       upl_f_offset,
2060                                       upl_size,
2061                                       &upl,
2062                                       &pl,
2063                                       UPL_SET_LITE | UPL_WILL_MODIFY);
2064                 if (kret != KERN_SUCCESS)
2065                         panic("cluster_write: failed to get pagelist");
2066
2067                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2068                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2069
2070                 if (start_offset && !upl_valid_page(pl, 0)) {
2071                         int   read_size;
2072
2073                         /*
2074                          * we're starting in the middle of the first page of the upl
2075                          * and the page isn't currently valid, so we're going to have
2076                          * to read it in first... this is a synchronous operation
2077                          */
2078                         read_size = PAGE_SIZE;
2079
2080                         if ((upl_f_offset + read_size) > newEOF)
2081                                 read_size = newEOF - upl_f_offset;
2082
2083                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2084                                             CL_READ, (buf_t)NULL, (struct clios *)NULL);
2085                         if (retval) {
2086                                 /*
2087                                  * we had an error during the read which causes us to abort
2088                                  * the current cluster_write request... before we do, we need
2089                                  * to release the rest of the pages in the upl without modifying
2090                                  * there state and mark the failed page in error
2091                                  */
2092                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2093
2094                                 if (upl_size > PAGE_SIZE)
2095                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2096
2097                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2098                                              (int)upl, 0, 0, retval, 0);
2099                                 break;
2100                         }
2101                 }
2102                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2103                         /*
2104                          * the last offset we're writing to in this upl does not end on a page
2105                          * boundary... if it's not beyond the old EOF, then we'll also need to
2106                          * pre-read this page in if it isn't already valid
2107                          */
2108                         upl_offset = upl_size - PAGE_SIZE;
2109
2110                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2111                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2112                                 int   read_size;
2113
2114                                 read_size = PAGE_SIZE;
2115
2116                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2117                                         read_size = newEOF - (upl_f_offset + upl_offset);
2118
2119                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2120                                                     CL_READ, (buf_t)NULL, (struct clios *)NULL);
2121                                 if (retval) {
2122                                         /*
2123                                          * we had an error during the read which causes us to abort
2124                                          * the current cluster_write request... before we do, we
2125                                          * need to release the rest of the pages in the upl without
2126                                          * modifying their state and mark the failed page in error
2127                                          */
2128                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2129
2130                                         if (upl_size > PAGE_SIZE)
2131                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2132
2133                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2134                                                      (int)upl, 0, 0, retval, 0);
2135                                         break;
2136                                 }
2137                         }
2138                 }
2139                 xfer_resid = io_size;
2140                 io_offset = start_offset;
2141
2142                 while (zero_cnt && xfer_resid) {
2143
2144                         if (zero_cnt < (long long)xfer_resid)
2145                                 bytes_to_zero = zero_cnt;
2146                         else
2147                                 bytes_to_zero = xfer_resid;
2148
2149                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2150                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2151                         } else {
2152                                 int zero_pg_index;
2153
2154                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2155                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2156
2157                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2158                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2159
2160                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2161                                            !upl_dirty_page(pl, zero_pg_index)) {
2162                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2163                                 }
2164                         }
2165                         xfer_resid -= bytes_to_zero;
2166                         zero_cnt   -= bytes_to_zero;
2167                         zero_off   += bytes_to_zero;
2168                         io_offset  += bytes_to_zero;
2169                 }
2170                 if (xfer_resid && io_resid) {
2171                         bytes_to_move = min(io_resid, xfer_resid);
2172
2173                         retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2174
2175                         if (retval) {
2176
2177                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2178
2179                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2180                                              (int)upl, 0, 0, retval, 0);
2181                         } else {
2182                                 io_resid  -= bytes_to_move;
2183                                 xfer_resid -= bytes_to_move;
2184                                 io_offset  += bytes_to_move;
2185                         }
2186                 }
2187                 while (xfer_resid && zero_cnt1 && retval == 0) {
2188
2189                         if (zero_cnt1 < (long long)xfer_resid)
2190                                 bytes_to_zero = zero_cnt1;
2191                         else
2192                                 bytes_to_zero = xfer_resid;
2193
2194                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2195                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2196                         } else {
2197                                 int zero_pg_index;
2198
2199                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2200                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2201
2202                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2203                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2204                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2205                                            !upl_dirty_page(pl, zero_pg_index)) {
2206                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2207                                 }
2208                         }
2209                         xfer_resid -= bytes_to_zero;
2210                         zero_cnt1  -= bytes_to_zero;
2211                         zero_off1  += bytes_to_zero;
2212                         io_offset  += bytes_to_zero;
2213                 }
2214
2215                 if (retval == 0) {
2216                         int cl_index;
2217                         int can_delay;
2218
2219                         io_size += start_offset;
2220
2221                         if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2222                                 /*
2223                                  * if we're extending the file with this write
2224                                  * we'll zero fill the rest of the page so that
2225                                  * if the file gets extended again in such a way as to leave a
2226                                  * hole starting at this EOF, we'll have zeros in the correct spot
2227                                  */
2228                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2229                         }
2230                         if (flags & IO_SYNC)
2231                                 /*
2232                                  * if the IO_SYNC flag is set then we need to
2233                                  * bypass any clusters and immediately issue
2234                                  * the I/O
2235                                  */
2236                                 goto issue_io;
2237 check_cluster:
2238                         /*
2239                          * take the lock to protect our accesses
2240                          * of the writebehind and sparse cluster state
2241                          */
2242                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2243
2244                         /*
2245                          * calculate the last logical block number
2246                          * that this delayed I/O encompassed
2247                          */
2248                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2249
2250                         if (wbp->cl_scmap) {
2251
2252                                 if ( !(flags & IO_NOCACHE)) {
2253                                         /*
2254                                          * we've fallen into the sparse
2255                                          * cluster method of delaying dirty pages
2256                                          * first, we need to release the upl if we hold one
2257                                          * since pages in it may be present in the sparse cluster map
2258                                          * and may span 2 separate buckets there... if they do and
2259                                          * we happen to have to flush a bucket to make room and it intersects
2260                                          * this upl, a deadlock may result on page BUSY
2261                                          */
2262                                         if (upl_size)
2263                                                 ubc_upl_commit_range(upl, 0, upl_size,
2264                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2265
2266                                         sparse_cluster_add(wbp, vp, &cl, newEOF);
2267
2268                                         lck_mtx_unlock(&wbp->cl_lockw);
2269
2270                                         continue;
2271                                 }
2272                                 /*
2273                                  * must have done cached writes that fell into
2274                                  * the sparse cluster mechanism... we've switched
2275                                  * to uncached writes on the file, so go ahead
2276                                  * and push whatever's in the sparse map
2277                                  * and switch back to normal clustering
2278                                  *
2279                                  * see the comment above concerning a possible deadlock...
2280                                  */
2281                                 if (upl_size) {
2282                                         ubc_upl_commit_range(upl, 0, upl_size,
2283                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2284                                         /*
2285                                          * setting upl_size to 0 keeps us from committing a
2286                                          * second time in the start_new_cluster path
2287                                          */
2288                                         upl_size = 0;
2289                                 }
2290                                 sparse_cluster_push(wbp, vp, newEOF, 1);
2291
2292                                 wbp->cl_number = 0;
2293                                 /*
2294                                  * no clusters of either type present at this point
2295                                  * so just go directly to start_new_cluster since
2296                                  * we know we need to delay this I/O since we've
2297                                  * already released the pages back into the cache
2298                                  * to avoid the deadlock with sparse_cluster_push
2299                                  */
2300                                 goto start_new_cluster;
2301                         }
2302                         upl_offset = 0;
2303
2304                         if (wbp->cl_number == 0)
2305                                 /*
2306                                  * no clusters currently present
2307                                  */
2308                                 goto start_new_cluster;
2309
2310                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2311                                 /*
2312                                  * check each cluster that we currently hold
2313                                  * try to merge some or all of this write into
2314                                  * one or more of the existing clusters... if
2315                                  * any portion of the write remains, start a
2316                                  * new cluster
2317                                  */
2318                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2319                                         /*
2320                                          * the current write starts at or after the current cluster
2321                                          */
2322                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2323                                                 /*
2324                                                  * we have a write that fits entirely
2325                                                  * within the existing cluster limits
2326                                                  */
2327                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2328                                                         /*
2329                                                          * update our idea of where the cluster ends
2330                                                          */
2331                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2332                                                 break;
2333                                         }
2334                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2335                                                 /*
2336                                                  * we have a write that starts in the middle of the current cluster
2337                                                  * but extends beyond the cluster's limit... we know this because
2338                                                  * of the previous checks
2339                                                  * we'll extend the current cluster to the max
2340                                                  * and update the b_addr for the current write to reflect that
2341                                                  * the head of it was absorbed into this cluster...
2342                                                  * note that we'll always have a leftover tail in this case since
2343                                                  * full absorption would have occurred in the clause above
2344                                                  */
2345                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2346
2347                                                 if (upl_size) {
2348                                                         daddr64_t start_pg_in_upl;
2349
2350                                                         start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2351
2352                                                         if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2353                                                                 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2354
2355                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2356                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2357                                                                 upl_f_offset += intersection;
2358                                                                 upl_offset   += intersection;
2359                                                                 upl_size     -= intersection;
2360                                                         }
2361                                                 }
2362                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2363                                         }
2364                                         /*
2365                                          * we come here for the case where the current write starts
2366                                          * beyond the limit of the existing cluster or we have a leftover
2367                                          * tail after a partial absorption
2368                                          *
2369                                          * in either case, we'll check the remaining clusters before
2370                                          * starting a new one
2371                                          */
2372                                 } else {
2373                                         /*
2374                                          * the current write starts in front of the cluster we're currently considering
2375                                          */
2376                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2377                                                 /*
2378                                                  * we can just merge the new request into
2379                                                  * this cluster and leave it in the cache
2380                                                  * since the resulting cluster is still
2381                                                  * less than the maximum allowable size
2382                                                  */
2383                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2384
2385                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2386                                                         /*
2387                                                          * the current write completely
2388                                                          * envelops the existing cluster and since
2389                                                          * each write is limited to at most MAX_UPL_TRANSFER bytes
2390                                                          * we can just use the start and last blocknos of the write
2391                                                          * to generate the cluster limits
2392                                                          */
2393                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2394                                                 }
2395                                                 break;
2396                                         }
2397
2398                                         /*
2399                                          * if we were to combine this write with the current cluster
2400                                          * we would exceed the cluster size limit.... so,
2401                                          * let's see if there's any overlap of the new I/O with
2402                                          * the cluster we're currently considering... in fact, we'll
2403                                          * stretch the cluster out to its full limit and see if we
2404                                          * get an intersection with the current write
2405                                          *
2406                                          */
2407                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2408                                                 /*
2409                                                  * the current write extends into the proposed cluster
2410                                                  * clip the length of the current write after first combining its
2411                                                  * tail with the newly shaped cluster
2412                                                  */
2413                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2414
2415                                                 if (upl_size) {
2416                                                         intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2417
2418                                                         if (intersection > upl_size)
2419                                                                 /*
2420                                                                  * because the current write may consist of a number of pages found in the cache
2421                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2422                                                                  * the size of the UPL that is also part of this write
2423                                                                  */
2424                                                                 intersection = upl_size;
2425
2426                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2427                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2428                                                         upl_size -= intersection;
2429                                                 }
2430                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2431                                         }
2432                                         /*
2433                                          * if we get here, there was no way to merge
2434                                          * any portion of this write with this cluster
2435                                          * or we could only merge part of it which
2436                                          * will leave a tail...
2437                                          * we'll check the remaining clusters before starting a new one
2438                                          */
2439                                 }
2440                         }
2441                         if (cl_index < wbp->cl_number)
2442                                 /*
2443                                  * we found an existing cluster(s) that we
2444                                  * could entirely merge this I/O into
2445                                  */
2446                                 goto delay_io;
2447
2448                         if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2449                                 /*
2450                                  * we didn't find an existing cluster to
2451                                  * merge into, but there's room to start
2452                                  * a new one
2453                                  */
2454                                 goto start_new_cluster;
2455
2456                         /*
2457                          * no existing cluster to merge with and no
2458                          * room to start a new one... we'll try
2459                          * pushing one of the existing ones... if none of
2460                          * them are able to be pushed, we'll switch
2461                          * to the sparse cluster mechanism
2462                          * cluster_try_push updates cl_number to the
2463                          * number of remaining clusters... and
2464                          * returns the number of currently unused clusters
2465                          */
2466                         int ret_cluster_try_push = 0;
2467                         /* if writes are not deferred, call cluster push immediately */
2468                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2469                                 if (flags & IO_NOCACHE)
2470                                         can_delay = 0;
2471                                 else
2472                                         can_delay = 1;
2473
2474                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2475                         }
2476
2477                         /* execute the following regardless of whether writes are deferred */
2478                         if (ret_cluster_try_push == 0) {
2479                                 /*
2480                                  * no more room in the normal cluster mechanism
2481                                  * so let's switch to the more expansive but expensive
2482                                  * sparse mechanism....
2483                                  * first, we need to release the upl if we hold one
2484                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2485                                  * and may span 2 separate buckets there... if they do and
2486                                  * we happen to have to flush a bucket to make room and it intersects
2487                                  * this upl, a deadlock may result on page BUSY
2488                                  */
2489                                 if (upl_size)
2490                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2491                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2492
2493                                 sparse_cluster_switch(wbp, vp, newEOF);
2494                                 sparse_cluster_add(wbp, vp, &cl, newEOF);
2495
2496                                 lck_mtx_unlock(&wbp->cl_lockw);
2497
2498                                 continue;
2499                         }
2500                         /*
2501                          * we pushed one cluster successfully, so we must be sequentially writing this file
2502                          * otherwise, we would have failed and fallen into the sparse cluster support
2503                          * so let's take the opportunity to push out additional clusters as long as we
2504                          * remain below the throttle... this will give us better I/O locality if we're
2505                          * in a copy loop (i.e.  we won't jump back and forth between the read and write points)...
2506                          * however, we don't want to push so much out that the write throttle kicks in and
2507                          * hangs this thread up until some of the I/O completes...
2508                          */
2509                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2510                                 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2511                                         cluster_try_push(wbp, vp, newEOF, 0, 0);
2512                         }
2513
2514 start_new_cluster:
2515                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2516                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2517
2518                         if (flags & IO_NOCACHE)
2519                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2520                         else
2521                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2522                         wbp->cl_number++;
2523 delay_io:
2524                         if (upl_size)
2525                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2526                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2527
2528                         lck_mtx_unlock(&wbp->cl_lockw);
2529
2530                         continue;
2531 issue_io:
2532                         /*
2533                          * we don't hold the vnode lock at this point
2534                          *
2535                          * because we had to ask for a UPL that provides currently non-present pages, the
2536                          * UPL has been automatically set to clear the dirty flags (both software and hardware)
2537                          * upon committing it... this is not the behavior we want since it's possible for
2538                          * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2539                          * in order to maintain some semblance of coherency with mapped writes
2540                          * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2541                          * so that we correctly deal with a change in state of the hardware modify bit...
2542                          * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2543                          * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2544                          * responsible for generating the correct sized I/O(s)
2545                          */
2546                         ubc_upl_commit_range(upl, 0, upl_size,
2547                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2548
2549                         cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2550
2551                         retval = cluster_push_x(vp, &cl, newEOF, flags);
2552                 }
2553         }
2554         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2555                      retval, 0, io_resid, 0, 0);
2556
2557         return (retval);
2558 }
2559
2560 int
2561 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2562 {
2563         int           prev_resid;
2564         u_int         clip_size;
2565         off_t         max_io_size;
2566         int           upl_size;
2567         int           upl_flags;
2568         upl_t         upl;
2569         int           retval = 0;
2570         int           flags;
2571
2572         flags = xflags;
2573
2574         if (vp->v_flag & VNOCACHE_DATA)
2575                 flags |= IO_NOCACHE;
2576         if (vp->v_flag & VRAOFF)
2577                 flags |= IO_RAOFF;
2578
2579         if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2580                 /*
2581                  * go do a read through the cache if one of the following is true....
2582                  *   NOCACHE is not true
2583                  *   the uio request doesn't target USERSPACE
2584                  */
2585                 return (cluster_read_x(vp, uio, filesize, flags));
2586         }
2587
2588 #if LP64_DEBUG
2589         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2590                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2591         }
2592 #endif /* LP64_DEBUG */
2593
2594         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2595                 user_size_t     iov_len;
2596                 user_addr_t     iov_base;
2597
2598                 /*
2599                  * we know we have a resid, so this is safe
2600                  * skip over any emtpy vectors
2601                  */
2602                 uio_update(uio, (user_size_t)0);
2603
2604                 iov_len  = uio_curriovlen(uio);
2605                 iov_base = uio_curriovbase(uio);
2606
2607                 upl_size  = PAGE_SIZE;
2608                 upl_flags = UPL_QUERY_OBJECT_TYPE;
2609
2610                 // LP64todo - fix this!
2611                 if ((vm_map_get_upl(current_map(),
2612                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2613                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2614                         /*
2615                          * the user app must have passed in an invalid address
2616                          */
2617                         return (EFAULT);
2618                 }
2619
2620                 /*
2621                  * We check every vector target but if it is physically
2622                  * contiguous space, we skip the sanity checks.
2623                  */
2624                 if (upl_flags & UPL_PHYS_CONTIG) {
2625                         retval = cluster_phys_read(vp, uio, filesize);
2626                 }
2627                 else if (uio_resid(uio) < PAGE_SIZE) {
2628                         /*
2629                          * we're here because we're don't have a physically contiguous target buffer
2630                          * go do a read through the cache if
2631                          *   the total xfer size is less than a page...
2632                          */
2633                         return (cluster_read_x(vp, uio, filesize, flags));
2634                 }
2635                 // LP64todo - fix this!
2636                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2637                        if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2638                                /*
2639                                 * Bring the file offset read up to a pagesize boundary
2640                                 * this will also bring the base address to a page boundary
2641                                 * since they both are currently on the same offset within a page
2642                                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2643                                 * so the computed clip_size must always be less than the current uio_resid
2644                                 */
2645                                clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2646
2647                                /*
2648                                 * Fake the resid going into the cluster_read_x call
2649                                 * and restore it on the way out.
2650                                 */
2651                                prev_resid = uio_resid(uio);
2652                                // LP64todo - fix this
2653                                uio_setresid(uio, clip_size);
2654
2655                                retval = cluster_read_x(vp, uio, filesize, flags);
2656
2657                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2658                        } else {
2659                                /*
2660                                 * can't get both the file offset and the buffer offset aligned to a page boundary
2661                                 * so fire an I/O through the cache for this entire vector
2662                                 */
2663                                // LP64todo - fix this!
2664                                clip_size = iov_len;
2665                                prev_resid = uio_resid(uio);
2666                                uio_setresid(uio, clip_size);
2667
2668                                retval = cluster_read_x(vp, uio, filesize, flags);
2669
2670                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2671                        }
2672                 } else {
2673                         /*
2674                          * If we come in here, we know the offset into
2675                          * the file is on a pagesize boundary
2676                          */
2677                         max_io_size = filesize - uio->uio_offset;
2678                         // LP64todo - fix this
2679                         clip_size = uio_resid(uio);
2680                         if (iov_len < clip_size)
2681                                 clip_size = iov_len;
2682                         if (max_io_size < clip_size)
2683                                 clip_size = (int)max_io_size;
2684
2685                         if (clip_size < PAGE_SIZE) {
2686                                 /*
2687                                  * Take care of the tail end of the read in this vector.
2688                                  */
2689                                 // LP64todo - fix this
2690                                 prev_resid = uio_resid(uio);
2691                                 uio_setresid(uio, clip_size);
2692
2693                                 retval = cluster_read_x(vp, uio, filesize, flags);
2694
2695                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2696                         } else {
2697                                 /* round clip_size down to a multiple of pagesize */
2698                                 clip_size = clip_size & ~(PAGE_MASK);
2699                                 // LP64todo - fix this
2700                                 prev_resid = uio_resid(uio);
2701                                 uio_setresid(uio, clip_size);
2702
2703                                 retval = cluster_nocopy_read(vp, uio, filesize);
2704
2705                                 if ((retval==0) && uio_resid(uio))
2706                                         retval = cluster_read_x(vp, uio, filesize, flags);
2707
2708                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2709                         }
2710                 } /* end else */
2711         } /* end while */
2712
2713         return(retval);
2714 }
2715
/*
 * cluster_read_x
 *
 * Read through the cache.  Unless IO_NOCACHE is set, data is first copied
 * out with cluster_copy_ubc_data for as long as the pages are found in the
 * cache.  On a miss (or for IO_NOCACHE) a upl covering the next part of
 * the request is created, an asynchronous cluster_io read is issued for
 * the first run of invalid pages in it, prefetch / read-ahead is kicked
 * off where enabled, the I/O is waited for and the data is copied out with
 * cluster_copy_upl_data.  The upl pages are then committed or aborted.
 *
 *   vp        vnode to read from
 *   uio       file offset, residual count and destination of the read
 *   filesize  the request is clipped at this offset
 *   flags     IO_RAOFF and IO_NOCACHE turn read-ahead off; IO_NOCACHE also
 *             bypasses the copy-from-cache loop and makes the pages that
 *             are not dirty get aborted with UPL_ABORT_DUMP_PAGES instead
 *             of being committed
 *
 * Returns 0, or the first non-zero value seen from the copy routines,
 * from cluster_io or from iostate.io_error.  Panics if ubc_create_upl
 * does not return KERN_SUCCESS.
 */
2716 static int
2717 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2718 {
2719         upl_page_info_t *pl;
2720         upl_t            upl;
2721         vm_offset_t      upl_offset;
2722         int              upl_size;
2723         off_t            upl_f_offset;
2724         int              start_offset;
2725         int              start_pg;
2726         int              last_pg;
2727         int              uio_last = 0;
2728         int              pages_in_upl;
2729         off_t            max_size;
2730         off_t            last_ioread_offset;
2731         off_t            last_request_offset;
2732         u_int            size_of_prefetch;
2733         u_int            io_size;
2734         kern_return_t    kret;
2735         int              error  = 0;
2736         int              retval = 0;
2737         u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2738         u_int            rd_ahead_enabled = 1;
2739         u_int            prefetch_enabled = 1;
2740         struct cl_readahead *   rap;
2741         struct clios            iostate;
2742         struct cl_extent        extent;
2743
2744         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2745                      (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2746
2747         // LP64todo - fix this
2748         last_request_offset = uio->uio_offset + uio_resid(uio);
2749
             /*
              * no read-ahead state is used when the caller passed IO_RAOFF or
              * IO_NOCACHE, or when the start and end offsets of the request
              * lie in the same page... otherwise honor the hard throttle and
              * pick up the read-ahead state for this vnode
              */
2750         if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2751                 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2752                 rd_ahead_enabled = 0;
2753                 rap = NULL;
2754         } else {
2755                 if (cluster_hard_throttle_on(vp)) {
2756                         rd_ahead_enabled = 0;
2757                         prefetch_enabled = 0;
2758
2759                         max_rd_size = HARD_THROTTLE_MAXSIZE;
2760                 }
2761                 if ((rap = cluster_get_rap(vp)) == NULL)
2762                         rd_ahead_enabled = 0;
2763         }
             /*
              * clip the end of the request at filesize and record the
              * request as a range of page numbers in 'extent'
              */
2764         if (last_request_offset > filesize)
2765                 last_request_offset = filesize;
2766         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2767         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2768
2769         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2770                 /*
2771                  * determine if we already have a read-ahead in the pipe courtesy of the
2772                  * last read systemcall that was issued...
2773                  * if so, pick up its extent to determine where we should start
2774                  * with respect to any read-ahead that might be necessary to
2775                  * garner all the data needed to complete this read systemcall
2776                  */
2777                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2778
2779                 if (last_ioread_offset < uio->uio_offset)
2780                         last_ioread_offset = (off_t)0;
2781                 else if (last_ioread_offset > last_request_offset)
2782                         last_ioread_offset = last_request_offset;
2783         } else
2784                 last_ioread_offset = (off_t)0;
2785
2786         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2787                 /*
2788                  * compute the size of the upl needed to encompass
2789                  * the requested read... limit each call to cluster_io
2790                  * to the maximum UPL size... cluster_io will clip if
2791                  * this exceeds the maximum io_size for the device,
2792                  * make sure to account for
2793                  * a starting offset that's not page aligned
2794                  */
2795                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2796                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2797                 max_size     = filesize - uio->uio_offset;
2798
2799         // LP64todo - fix this!
2800                 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2801                         io_size = uio_resid(uio);
2802                 else
2803                         io_size = max_size;
2804
2805                 if (!(flags & IO_NOCACHE)) {
2806
2807                         while (io_size) {
2808                                 u_int io_resid;
2809                                 u_int io_requested;
2810
2811                                 /*
2812                                  * if we keep finding the pages we need already in the cache, then
2813                                  * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2814                                  * to determine that we have all the pages we need... once we miss in
2815                                  * the cache and have issued an I/O, then we'll assume that we're likely
2816                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
2817                                  */
2818                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2819                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2820                                                 /*
2821                                                  * we've already issued I/O for this request and
2822                                                  * there's still work to do and
2823                                                  * our prefetch stream is running dry, so issue a
2824                                                  * pre-fetch I/O... the I/O latency will overlap
2825                                                  * with the copying of the data
2826                                                  */
2827                                                 if (size_of_prefetch > max_rd_size)
2828                                                         size_of_prefetch = max_rd_size;
2829
2830                                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2831
2832                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2833
2834                                                 if (last_ioread_offset > last_request_offset)
2835                                                         last_ioread_offset = last_request_offset;
2836                                         }
2837                                 }
2838                                 /*
2839                                  * limit the size of the copy we're about to do so that
2840                                  * we can notice that our I/O pipe is running dry and
2841                                  * get the next I/O issued before it does go dry
2842                                  */
2843                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2844                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2845                                 else
2846                                         io_resid = io_size;
2847
2848                                 io_requested = io_resid;
2849
2850                                 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2851
                                     /* io_resid now holds what was NOT copied... account for the part that was */
2852                                 io_size -= (io_requested - io_resid);
2853
2854                                 if (retval || io_resid)
2855                                         /*
2856                                          * if we run into a real error or
2857                                          * a page that is not in the cache
2858                                          * we need to leave streaming mode
2859                                          */
2860                                         break;
2861
2862                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2863                                         /*
2864                                          * we've already finished the I/O for this read request
2865                                          * let's see if we should do a read-ahead
2866                                          */
2867                                         cluster_rd_ahead(vp, &extent, filesize, rap);
2868                                 }
2869                         }
2870                         if (retval)
2871                                 break;
2872                         if (io_size == 0) {
                                     /*
                                      * everything was satisfied from the cache... remember
                                      * where this read ended before leaving the main loop
                                      */
2873                                 if (rap != NULL) {
2874                                         if (extent.e_addr < rap->cl_lastr)
2875                                                 rap->cl_maxra = 0;
2876                                         rap->cl_lastr = extent.e_addr;
2877                                 }
2878                                 break;
2879                         }
                             /*
                              * the copy above may have advanced the uio, so recompute
                              * the values that were derived from uio_offset
                              */
2880                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2881                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
2882                         max_size     = filesize - uio->uio_offset;
2883                 }
2884                 if (io_size > max_rd_size)
2885                         io_size = max_rd_size;
2886
2887                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2888
2889                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2890                         upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2891                 pages_in_upl = upl_size / PAGE_SIZE;
2892
2893                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2894                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2895
2896                 kret = ubc_create_upl(vp,
2897                                       upl_f_offset,
2898                                       upl_size,
2899                                       &upl,
2900                                       &pl,
2901                                       UPL_SET_LITE);
2902                 if (kret != KERN_SUCCESS)
2903                         panic("cluster_read: failed to get pagelist");
2904
2905                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2906                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2907
2908                 /*
2909                  * scan from the beginning of the upl looking for the first
2910                  * non-valid page.... this will become the first page in
2911                  * the request we're going to make to 'cluster_io'... if all
2912                  * of the pages are valid, we won't call through to 'cluster_io'
2913                  */
2914                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2915                         if (!upl_valid_page(pl, start_pg))
2916                                 break;
2917                 }
2918
2919                 /*
2920                  * scan from the starting invalid page looking for a valid
2921                  * page before the end of the upl is reached, if we
2922                  * find one, then it will be the last page of the request to
2923                  * 'cluster_io'
2924                  */
2925                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2926                         if (upl_valid_page(pl, last_pg))
2927                                 break;
2928                 }
                     /*
                      * start this pass with a clean I/O state... it is handed to
                      * cluster_io below, and the wait loop further down compares
                      * io_issued against io_completed
                      */
2929                 iostate.io_completed = 0;
2930                 iostate.io_issued = 0;
2931                 iostate.io_error = 0;
2932                 iostate.io_wanted = 0;
2933
2934                 if (start_pg < last_pg) {
2935                         /*
2936                          * we found a range of 'invalid' pages that must be filled
2937                          * if the last page in this range is the last page of the file
2938                          * we may have to clip the size of it to keep from reading past
2939                          * the end of the last physical block associated with the file
2940                          */
2941                         upl_offset = start_pg * PAGE_SIZE;
2942                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2943
2944                         if ((upl_f_offset + upl_offset + io_size) > filesize)
2945                                 io_size = filesize - (upl_f_offset + upl_offset);
2946
2947                         /*
2948                          * issue an asynchronous read to cluster_io
2949                          */
2950
2951                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2952                                            io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2953                 }
2954                 if (error == 0) {
2955                         /*
2956                          * if the read was issued successfully, or there was no I/O request
2957                          * issued, then copy the data into user land via 'cluster_copy_upl_data'
2958                          * we'll first add on any 'valid'
2959                          * pages that were present in the upl when we acquired it.
2960                          */
2961                         u_int  val_size;
2962
2963                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2964                                 if (!upl_valid_page(pl, uio_last))
2965                                         break;
2966                         }
2967                         /*
2968                          * compute size to transfer this round,  if uio->uio_resid is
2969                          * still non-zero after this attempt, we'll loop around and
2970                          * set up for another I/O.
2971                          */
2972                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2973
2974                         if (val_size > max_size)
2975                                 val_size = max_size;
2976
2977                         if (val_size > uio_resid(uio))
2978         // LP64todo - fix this
2979                                 val_size = uio_resid(uio);
2980
2981                         if (last_ioread_offset == 0)
2982                                 last_ioread_offset = uio->uio_offset + val_size;
2983
2984                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2985                                 /*
2986                                  * if there's still I/O left to do for this request, and...
2987                                  * we're not in hard throttle mode, then issue a
2988                                  * pre-fetch I/O... the I/O latency will overlap
2989                                  * with the copying of the data
2990                                  */
2991                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2992
2993                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2994
2995                                 if (last_ioread_offset > last_request_offset)
2996                                         last_ioread_offset = last_request_offset;
2997
2998                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
2999                                 /*
3000                                  * this transfer will finish this request, so...
3001                                  * let's try to read ahead if we're in
3002                                  * a sequential access pattern and we haven't
3003                                  * explicitly disabled it
3004                                  */
3005                                 if (rd_ahead_enabled)
3006                                         cluster_rd_ahead(vp, &extent, filesize, rap);
3007
3008                                 if (rap != NULL) {
3009                                         if (extent.e_addr < rap->cl_lastr)
3010                                                 rap->cl_maxra = 0;
3011                                         rap->cl_lastr = extent.e_addr;
3012                                 }
3013                         }
                             /*
                              * sleep until every I/O issued against iostate has completed...
                              * only then look at io_error and copy the data out
                              */
3014                         lck_mtx_lock(cl_mtxp);
3015
3016                         while (iostate.io_issued != iostate.io_completed) {
3017                                 iostate.io_wanted = 1;
3018                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3019                         }
3020                         lck_mtx_unlock(cl_mtxp);
3021
3022                         if (iostate.io_error)
3023                                 error = iostate.io_error;
3024                         else
3025                                 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3026                 }
3027                 if (start_pg < last_pg) {
3028                         /*
3029                          * compute the range of pages that we actually issued an I/O for
3030                          * and either commit them as valid if the I/O succeeded
3031                          * or abort them if the I/O failed
3032                          */
3033                         io_size = (last_pg - start_pg) * PAGE_SIZE;
3034
3035                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3036                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3037
                             /* with IO_NOCACHE the freshly read pages are dumped as well */
3038                         if (error || (flags & IO_NOCACHE))
3039                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3040                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3041                         else
3042                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3043                                                      UPL_COMMIT_CLEAR_DIRTY |
3044                                                      UPL_COMMIT_FREE_ON_EMPTY |
3045                                                      UPL_COMMIT_INACTIVATE);
3046
3047                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3048                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3049                 }
3050                 if ((last_pg - start_pg) < pages_in_upl) {
3051                         int cur_pg;
3052                         int commit_flags;
3053
3054                         /*
3055                          * the set of pages that we issued an I/O for did not encompass
3056                          * the entire upl... so just release these without modifying
3057                          * their state
3058                          */
3059                         if (error)
3060                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3061                         else {
3062                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3063                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3064
3065                                 if (start_pg) {
3066                                         /*
3067                                          * we found some already valid pages at the beginning of
3068                                          * the upl... commit these back to the inactive list with
3069                                          * reference cleared
3070                                          */
3071                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3072                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3073                                                                    | UPL_COMMIT_INACTIVATE;
3074
3075                                                 if (upl_dirty_page(pl, cur_pg))
3076                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3077
                                                     /* dirty pages are always committed... the others are dumped for IO_NOCACHE */
3078                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3079                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3080                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3081                                                 else
3082                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3083                                                                 PAGE_SIZE, commit_flags);
3084                                         }
3085                                 }
3086                                 if (last_pg < uio_last) {
3087                                         /*
3088                                          * we found some already valid pages immediately after the
3089                                          * pages we issued I/O for, commit these back to the
3090                                          * inactive list with reference cleared
3091                                          */
3092                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3093                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
3094                                                                                 | UPL_COMMIT_INACTIVATE;
3095
3096                                                 if (upl_dirty_page(pl, cur_pg))
3097                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3098
3099                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3100                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3101                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3102                                                 else
3103                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3104                                                                 PAGE_SIZE, commit_flags);
3105                                         }
3106                                 }
3107                                 if (uio_last < pages_in_upl) {
3108                                         /*
3109                                          * there were some invalid pages beyond the valid pages
3110                                          * that we didn't issue an I/O for, just release them
3111                                          * unchanged
3112                                          */
3113                                         ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3114                                                             (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3115                                 }
3116
3117                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3118                                         (int)upl, -1, -1, 0, 0);
3119                         }
3120                 }
                     /* an error from the copy takes precedence over an I/O error */
3121                 if (retval == 0)
3122                         retval = error;
3123
                     /*
                      * if there is more to transfer, re-check the hard throttle and
                      * set read-ahead, prefetch and the max read size to match
                      */
3124                 if ( uio_resid(uio) ) {
3125                         if (cluster_hard_throttle_on(vp)) {
3126                                 rd_ahead_enabled = 0;
3127                                 prefetch_enabled = 0;
3128
3129                                 max_rd_size = HARD_THROTTLE_MAXSIZE;
3130                         } else {
3131                                 if (rap != NULL)
3132                                         rd_ahead_enabled = 1;
3133                                 prefetch_enabled = 1;
3134
3135                                 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3136                         }
3137                 }
3138         }
             /*
              * NOTE(review): rap->cl_lockr is released here but never taken in
              * this function... presumably cluster_get_rap returns with it
              * held -- confirm against cluster_get_rap
              */
3139         if (rap != NULL) {
3140                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3141                              (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3142
3143                 lck_mtx_unlock(&rap->cl_lockr);
3144         } else {
3145                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3146                              (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3147         }
3148
3149         return (retval);
3150 }
3151
3152
/*
 * cluster_nocopy_read
 *
 * Read file data directly into the caller's buffer described by 'uio'.
 *
 * Each pass of the main loop
 *   1) copies whatever is already cached via cluster_copy_ubc_data,
 *   2) asks ubc_range_op how much of the following range is absent,
 *   3) builds a UPL over the user buffer with vm_map_create_upl,
 *   4) issues an asynchronous read into that UPL through cluster_io.
 *
 * Every exit path funnels through 'wait_for_reads' so that all reads
 * accounted for in 'iostate' have completed before we return.
 *
 * vp        vnode being read
 * uio       describes the target buffer and the file offset; it is
 *           advanced by uio_update for each read that is issued
 * filesize  reads are clipped so they never extend past this offset
 *
 * Returns 0, the error from cluster_copy_ubc_data or cluster_io, or
 * iostate.io_error if that is non-zero once the reads have drained.
 * Note that several bail-out paths (UPL creation failure, nothing
 * absent, etc.) leave retval at 0 with uio_resid still non-zero.
 */
static int
cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
{
        upl_t            upl;
        upl_page_info_t  *pl;
        vm_offset_t      upl_offset;
        off_t            max_io_size;
        int              io_size;
        int              upl_size;
        int              upl_needed_size;
        int              pages_in_pl;
        int              upl_flags;
        kern_return_t    kret;
        int              i;
        int              force_data_sync;
        int              retval = 0;
        int              no_zero_fill = 0;
        int              abort_flag = 0;
        struct clios     iostate;
        /*
         * both limits are re-computed at the top of every loop pass,
         * so these initial values are never actually used
         */
        u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
        u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;


        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
                     (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);

        /*
         * When we enter this routine, we know
         *  -- the offset into the file is on a pagesize boundary
         *  -- the resid is a page multiple
         *  -- the resid will not exceed iov_len
         */

        /*
         * start with a clean I/O stream state... cluster_io is handed
         * a pointer to it for every read we issue below
         */
        iostate.io_completed = 0;
        iostate.io_issued = 0;
        iostate.io_error = 0;
        iostate.io_wanted = 0;

        while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
                user_addr_t     iov_base;

                /*
                 * pick the per-read size limit and the amount of
                 * outstanding I/O we tolerate, depending on whether
                 * cluster_hard_throttle_on reports throttling for this vnode
                 */
                if (cluster_hard_throttle_on(vp)) {
                        max_rd_size  = HARD_THROTTLE_MAXSIZE;
                        max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
                } else {
                        max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
                        max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
                }
                /*
                 * clip the request to what remains before 'filesize'
                 */
                max_io_size = filesize - uio->uio_offset;

        // LP64todo - fix this
                if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
                        io_size = max_io_size;
                else
                        io_size = uio_resid(uio);

                /*
                 * First look for pages already in the cache
                 * and move them to user space.
                 */
                retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

                if (retval) {
                        /*
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_reads;
                }
                /*
                 * If we are already finished with this read, then return
                 */
                if (io_size == 0) {
                        /*
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_reads;
                }
                max_io_size = io_size;

                if (max_io_size > max_rd_size)
                        max_io_size = max_rd_size;

                io_size = 0;

                /*
                 * ask how much of the range starting at the current
                 * offset is absent (UPL_ROP_ABSENT)... the answer comes
                 * back in io_size and becomes the size of this read
                 */
                ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

                if (io_size == 0)
                        /*
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_reads;

                iov_base = uio_curriovbase(uio);

                /*
                 * upl_offset is the offset of the user buffer within its
                 * first page... upl_needed_size is that offset plus the
                 * I/O size, rounded up to a whole number of pages
                 */
                // LP64todo - fix this!
                upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
                upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
                             (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

                /*
                 * when the buffer is page aligned and the I/O covers whole
                 * pages we request UPL_NOZEROFILL on the UPL and dump the
                 * pages if we have to abort... otherwise an abort just
                 * releases the UPL
                 */
                if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
                        no_zero_fill = 1;
                        abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
                } else {
                        no_zero_fill = 0;
                        abort_flag = UPL_ABORT_FREE_ON_EMPTY;
                }
                /*
                 * make up to 3 attempts at creating a UPL in which every
                 * page is valid... every attempt after the first adds
                 * UPL_FORCE_DATA_SYNC... a UPL containing an invalid page
                 * is aborted before the next attempt
                 */
                for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
                        pages_in_pl = 0;
                        upl_size = upl_needed_size;
                        upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

                        if (no_zero_fill)
                                upl_flags |= UPL_NOZEROFILL;
                        if (force_data_sync)
                                upl_flags |= UPL_FORCE_DATA_SYNC;

                        // LP64todo - fix this!
                        kret = vm_map_create_upl(current_map(),
                                                 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                                                 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);

                        if (kret != KERN_SUCCESS) {
                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                                             (int)upl_offset, upl_size, io_size, kret, 0);
                                /*
                                 * cluster_nocopy_read: failed to get pagelist
                                 *
                                 * we may have already spun some portion of this request
                                 * off as async requests... we need to wait for the I/O
                                 * to complete before returning
                                 */
                                goto wait_for_reads;
                        }
                        /*
                         * upl_size was passed by reference, so use the
                         * value we got back to size the page list scan
                         */
                        pages_in_pl = upl_size / PAGE_SIZE;
                        pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

                        for (i = 0; i < pages_in_pl; i++) {
                                if (!upl_valid_page(pl, i))
                                        break;
                        }
                        if (i == pages_in_pl)
                                break;

                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
                }
                if (force_data_sync >= 3) {
                        /*
                         * all 3 attempts produced a UPL with an invalid page
                         * (each one was aborted inside the loop), so give up
                         */
                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                                     (int)upl_offset, upl_size, io_size, kret, 0);

                        goto wait_for_reads;
                }
                /*
                 * Consider the possibility that upl_size wasn't satisfied.
                 */
                if (upl_size != upl_needed_size)
                        io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

                if (io_size == 0) {
                        /*
                         * nothing left to read into this UPL, release it
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
                        goto wait_for_reads;
                }
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                             (int)upl_offset, upl_size, io_size, kret, 0);

                /*
                 * request asynchronously so that we can overlap
                 * the preparation of the next I/O
                 * if there are already too many outstanding reads
                 * wait until some have completed before issuing the next read
                 */
                lck_mtx_lock(cl_mtxp);

                /*
                 * sleep on &iostate.io_wanted (cl_mtxp is handed to msleep)
                 * until the issued/completed gap is back within max_rd_ahead
                 * NOTE(review): presumably the I/O completion path updates
                 * io_completed and issues the wakeup -- not visible here
                 */
                while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
                        iostate.io_wanted = 1;
                        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
                }
                lck_mtx_unlock(cl_mtxp);

                if (iostate.io_error) {
                        /*
                         * one of the earlier reads we issued ran into a hard error
                         * don't issue any more reads, cleanup the UPL
                         * that was just created but not used, then
                         * go wait for any other reads to complete before
                         * returning the error to the caller
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);

                        goto wait_for_reads;
                }
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
                             (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

                retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
                                   CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
                                   (buf_t)NULL, &iostate);

                /*
                 * update the uio structure
                 * (done whether or not cluster_io reported an error... a
                 * non-zero retval ends the loop at the next condition check)
                 */
                uio_update(uio, (user_size_t)io_size);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
                             (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);

        } /* end while */

wait_for_reads:
        /*
         * make sure all async reads that are part of this stream
         * have completed before we return
         */
        lck_mtx_lock(cl_mtxp);

        while (iostate.io_issued != iostate.io_completed) {
                iostate.io_wanted = 1;
                msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
        }
        lck_mtx_unlock(cl_mtxp);

        /*
         * an error recorded in the stream state takes precedence
         * over whatever retval currently holds
         */
        if (iostate.io_error)
                retval = iostate.io_error;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
                     (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);

        return (retval);
}
3389
3390
3391 static int
3392 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3393 {
3394         upl_page_info_t *pl;
3395         upl_t            upl;
3396         vm_offset_t      upl_offset;
3397         addr64_t         dst_paddr;
3398         off_t            max_size;
3399         int              io_size;
3400         user_size_t      iov_len;
3401         user_addr_t      iov_base;
3402         int              tail_size;
3403         int              upl_size;
3404         int              upl_needed_size;
3405         int              pages_in_pl;
3406         int              upl_flags;
3407         kern_return_t    kret;
3408         struct clios     iostate;
3409         int              error;
3410         int              devblocksize;
3411
3412         devblocksize = vp->v_mount->mnt_devblocksize;
3413         /*
3414          * When we enter this routine, we know
3415          *  -- the resid will not exceed iov_len
3416          *  -- the target address is physically contiguous
3417          */
3418
3419 #if LP64_DEBUG
3420         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3421                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3422         }
3423 #endif /* LP64_DEBUG */
3424
3425         iov_len = uio_curriovlen(uio);
3426         iov_base = uio_curriovbase(uio);
3427
3428         max_size = filesize - uio->uio_offset;
3429
3430         // LP64todo - fix this!
3431         if (max_size < 0 || (u_int64_t)max_size > iov_len)
3432                 io_size = iov_len;
3433         else
3434                 io_size = max_size;
3435
3436         // LP64todo - fix this!
3437         upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3438         upl_needed_size = upl_offset + io_size;
3439
3440         error       = 0;
3441         pages_in_pl = 0;
3442         upl_size = upl_needed_size;
3443         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3444
3445         kret = vm_map_get_upl(current_map(),
3446                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3447                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3448
3449         if (kret != KERN_SUCCESS) {
3450                 /*
3451                  * cluster_phys_read: failed to get pagelist
3452                  */
3453                 return(EINVAL);
3454         }
3455         if (upl_size < upl_needed_size) {
3456                 /*
3457                  * The upl_size wasn't satisfied.
3458                  */
3459                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3460
3461                 return(EINVAL);
3462         }
3463         pl = ubc_upl_pageinfo(upl);
3464
3465         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3466
3467         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3468                 int   head_size;
3469
3470                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3471
3472                 if (head_size > io_size)
3473                         head_size = io_size;
3474
3475                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3476
3477                 if (error) {
3478                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3479
3480                         return(EINVAL);
3481                 }
3482                 upl_offset += head_size;
3483                 dst_paddr  += head_size;
3484                 io_size    -= head_size;
3485         }
3486         tail_size = io_size & (devblocksize - 1);
3487         io_size  -= tail_size;
3488
3489         iostate.io_completed = 0;
3490         iostate.io_issued = 0;
3491         iostate.io_error = 0;
3492         iostate.io_wanted = 0;
3493
3494         while (io_size && error == 0) {
3495                 int  xsize;
3496
3497                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3498                         xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3499                 else
3500                         xsize = io_size;
3501                 /*
3502                  * request asynchronously so that we can overlap
3503                  * the preparation of the next I/O... we'll do
3504                  * the commit after all the I/O has completed
3505                  * since its all issued against the same UPL
3506                  * if there are already too many outstanding reads
3507                  * wait until some have completed before issuing the next
3508                  */
3509                 lck_mtx_lock(cl_mtxp);
3510
3511                 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3512                         iostate.io_wanted = 1;
3513                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3514                 }
3515                 lck_mtx_unlock(cl_mtxp);
3516
3517                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3518                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3519                                    (buf_t)NULL, &iostate);
3520                 /*
3521                  * The cluster_io read was issued successfully,
3522                  * update the uio structure
3523                  */
3524                 if (error == 0) {
3525                         uio_update(uio, (user_size_t)xsize);
3526
3527                         dst_paddr  += xsize;
3528                         upl_offset += xsize;
3529                         io_size    -= xsize;
3530                 }
3531         }
3532         /*
3533          * make sure all async reads that are part of this stream
3534          * have completed before we proceed
3535          */
3536         lck_mtx_lock(cl_mtxp);
3537
3538         while (iostate.io_issued != iostate.io_completed) {
3539                 iostate.io_wanted = 1;
3540                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3541         }
3542         lck_mtx_unlock(cl_mtxp);
3543
3544         if (iostate.io_error)
3545                 error = iostate.io_error;
3546
3547         if (error == 0 && tail_size)
3548                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3549
3550         /*
3551          * just release our hold on the physically contiguous
3552          * region without changing any state
3553          */
3554         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3555
3556         return (error);
3557 }
3558
3559
3560 /*
3561  * generate advisory I/O's in the largest chunks possible
3562  * the completed pages will be released into the VM cache
3563  */
3564 int
3565 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3566 {
3567         upl_page_info_t *pl;
3568         upl_t            upl;
3569         vm_offset_t      upl_offset;
3570         int              upl_size;
3571         off_t            upl_f_offset;
3572         int              start_offset;
3573         int              start_pg;
3574         int              last_pg;
3575         int              pages_in_upl;
3576         off_t            max_size;
3577         int              io_size;
3578         kern_return_t    kret;
3579         int              retval = 0;
3580         int              issued_io;
3581         int              skip_range;
3582
3583         if ( !UBCINFOEXISTS(vp))
3584                 return(EINVAL);
3585
3586         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3587                      (int)f_offset, resid, (int)filesize, 0, 0);
3588
3589         while (resid && f_offset < filesize && retval == 0) {
3590                 /*
3591                  * compute the size of the upl needed to encompass
3592                  * the requested read... limit each call to cluster_io
3593                  * to the maximum UPL size... cluster_io will clip if
3594                  * this exceeds the maximum io_size for the device,
3595                  * make sure to account for
3596                  * a starting offset that's not page aligned
3597                  */
3598                 start_offset = (int)(f_offset & PAGE_MASK_64);
3599                 upl_f_offset = f_offset - (off_t)start_offset;
3600                 max_size     = filesize - f_offset;
3601
3602                 if (resid < max_size)
3603                         io_size = resid;
3604                 else
3605                         io_size = max_size;
3606
3607                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3608                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3609                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3610
3611                 skip_range = 0;
3612                 /*
3613                  * return the number of contiguously present pages in the cache
3614                  * starting at upl_f_offset within the file
3615                  */
3616                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3617
3618                 if (skip_range) {
3619                         /*
3620                          * skip over pages already present in the cache
3621                          */
3622                         io_size = skip_range - start_offset;
3623
3624                         f_offset += io_size;
3625                         resid    -= io_size;
3626
3627                         if (skip_range == upl_size)
3628                                 continue;
3629                         /*
3630                          * have to issue some real I/O
3631                          * at this point, we know it's starting on a page boundary
3632                          * because we've skipped over at least the first page in the request
3633                          */
3634                         start_offset = 0;
3635                         upl_f_offset += skip_range;
3636                         upl_size     -= skip_range;
3637                 }
3638                 pages_in_upl = upl_size / PAGE_SIZE;
3639
3640                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3641                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3642
3643                 kret = ubc_create_upl(vp,
3644                                       upl_f_offset,
3645                                       upl_size,
3646                                       &upl,
3647                                       &pl,
3648                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3649                 if (kret != KERN_SUCCESS)
3650                         return(retval);
3651                 issued_io = 0;
3652
3653                 /*
3654                  * before we start marching forward, we must make sure we end on
3655                  * a present page, otherwise we will be working with a freed
3656                  * upl
3657                  */
3658                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3659                         if (upl_page_present(pl, last_pg))
3660                                 break;
3661                 }
3662                 pages_in_upl = last_pg + 1;
3663
3664
3665                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3666                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3667
3668
3669                 for (last_pg = 0; last_pg < pages_in_upl; ) {
3670                         /*
3671                          * scan from the beginning of the upl looking for the first
3672                          * page that is present.... this will become the first page in
3673                          * the request we're going to make to 'cluster_io'... if all
3674                          * of the pages are absent, we won't call through to 'cluster_io'
3675                          */
3676                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3677                                 if (upl_page_present(pl, start_pg))
3678                                         break;
3679                         }
3680
3681                         /*
3682                          * scan from the starting present page looking for an absent
3683                          * page before the end of the upl is reached, if we
3684                          * find one, then it will terminate the range of pages being
3685                          * presented to 'cluster_io'
3686                          */
3687                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3688                                 if (!upl_page_present(pl, last_pg))
3689                                         break;
3690                         }
3691
3692                         if (last_pg > start_pg) {
3693                                 /*
3694                                  * we found a range of pages that must be filled
3695                                  * if the last page in this range is the last page of the file
3696                                  * we may have to clip the size of it to keep from reading past
3697                                  * the end of the last physical block associated with the file
3698                                  */
3699                                 upl_offset = start_pg * PAGE_SIZE;
3700                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
3701
3702                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
3703                                         io_size = filesize - (upl_f_offset + upl_offset);
3704
3705                                 /*
3706                                  * issue an asynchronous read to cluster_io
3707                                  */
3708                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3709                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3710
3711                                 issued_io = 1;
3712                         }
3713                 }
3714                 if (issued_io == 0)
3715                         ubc_upl_abort(upl, 0);
3716
3717                 io_size = upl_size - start_offset;
3718
3719                 if (io_size > resid)
3720                         io_size = resid;
3721                 f_offset += io_size;
3722                 resid    -= io_size;
3723         }
3724
3725         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3726                      (int)f_offset, resid, retval, 0, 0);
3727
3728         return(retval);
3729 }
3730
3731
3732 int
3733 cluster_push(vnode_t vp, int flags)
3734 {
3735         int     retval;
3736         struct  cl_writebehind *wbp;
3737
3738         if ( !UBCINFOEXISTS(vp)) {
3739                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3740                 return (0);
3741         }
3742         /* return if deferred write is set */
3743         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3744                 return (0);
3745         }
3746         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3747                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3748                 return (0);
3749         }
3750         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3751                 lck_mtx_unlock(&wbp->cl_lockw);
3752
3753                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3754                 return(0);
3755         }
3756         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3757                      (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3758
3759         if (wbp->cl_scmap) {
3760                 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3761
3762                 retval = 1;
3763         } else
3764                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3765
3766         lck_mtx_unlock(&wbp->cl_lockw);
3767
3768         if (flags & IO_SYNC)
3769                 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3770
3771         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3772                      (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3773
3774         return (retval);
3775 }
3776
3777
3778 __private_extern__ void
3779 cluster_release(struct ubc_info *ubc)
3780 {
3781         struct cl_writebehind *wbp;
3782         struct cl_readahead   *rap;
3783
3784         if ((wbp = ubc->cl_wbehind)) {
3785
3786                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3787
3788                 if (wbp->cl_scmap)
3789                         vfs_drt_control(&(wbp->cl_scmap), 0);
3790         } else {
3791                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3792         }
3793
3794         rap = ubc->cl_rahead;
3795
3796         if (wbp != NULL) {
3797                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3798                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3799         }
3800         if ((rap = ubc->cl_rahead)) {
3801                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3802                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3803         }
3804         ubc->cl_rahead  = NULL;
3805         ubc->cl_wbehind = NULL;
3806
3807         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3808 }
3809
3810
3811 static void
3812 cluster_push_EOF(vnode_t vp, off_t EOF)
3813 {
3814         struct cl_writebehind *wbp;
3815
3816         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3817
3818         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3819                      (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3820
3821         if (wbp->cl_scmap)
3822                 sparse_cluster_push(wbp, vp, EOF, 1);
3823         else
3824                 cluster_try_push(wbp, vp, EOF, 0, 1);
3825
3826         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3827                      (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3828
3829         lck_mtx_unlock(&wbp->cl_lockw);
3830 }
3831
3832
3833 static int
3834 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3835 {
3836         int cl_index;
3837         int cl_index1;
3838         int min_index;
3839         int cl_len;
3840         int cl_pushed = 0;
3841         struct cl_wextent l_clusters[MAX_CLUSTERS];
3842
3843         /*
3844          * the write behind context exists and has
3845          * already been locked...
3846          *
3847          * make a local 'sorted' copy of the clusters
3848          * and clear wbp->cl_number so that new clusters can
3849          * be developed
3850          */
3851         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3852                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3853                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3854                                 continue;
3855                         if (min_index == -1)
3856                                 min_index = cl_index1;
3857                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3858                                 min_index = cl_index1;
3859                 }
3860                 if (min_index == -1)
3861                         break;
3862                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3863                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3864                 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3865
3866                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3867         }
3868         wbp->cl_number = 0;
3869
3870         cl_len = cl_index;
3871
3872         if (can_delay && cl_len == MAX_CLUSTERS) {
3873                 int   i;
3874
3875                 /*
3876                  * determine if we appear to be writing the file sequentially
3877                  * if not, by returning without having pushed any clusters
3878                  * we will cause this vnode to be pushed into the sparse cluster mechanism
3879                  * used for managing more random I/O patterns
3880                  *
3881                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3882                  * that's why we're in try_push with can_delay true...
3883                  *
3884                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3885                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3886                  * so we can just make a simple pass through, up to, but not including the last one...
3887                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3888                  * are sequential
3889                  *
3890                  * we let the last one be partial as long as it was adjacent to the previous one...
3891                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3892                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3893                  */
3894                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3895                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3896                                 goto dont_try;
3897                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3898                                 goto dont_try;
3899                 }
3900         }
3901         /*
3902          * drop the lock while we're firing off the I/Os...
3903          * this is safe since I'm working off of a private sorted copy
3904          * of the clusters, and I'm going to re-evaluate the public
3905          * state after I retake the lock
3906          */
3907         lck_mtx_unlock(&wbp->cl_lockw);
3908
3909         for (cl_index = 0; cl_index < cl_len; cl_index++) {
3910                 int flags;
3911                 struct cl_extent cl;
3912
3913                 /*
3914                  * try to push each cluster in turn...
3915                  */
3916                 if (l_clusters[cl_index].io_nocache)
3917                         flags = IO_NOCACHE;
3918                 else
3919                         flags = 0;
3920                 cl.b_addr = l_clusters[cl_index].b_addr;
3921                 cl.e_addr = l_clusters[cl_index].e_addr;
3922
3923                 cluster_push_x(vp, &cl, EOF, flags);
3924
3925                 l_clusters[cl_index].b_addr = 0;
3926                 l_clusters[cl_index].e_addr = 0;
3927
3928                 cl_pushed++;
3929
3930                 if (push_all == 0)
3931                         break;
3932         }
3933         lck_mtx_lock(&wbp->cl_lockw);
3934
3935 dont_try:
3936         if (cl_len > cl_pushed) {
3937                /*
3938                 * we didn't push all of the clusters, so
3939                 * lets try to merge them back in to the vnode
3940                 */
3941                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3942                         /*
3943                          * we picked up some new clusters while we were trying to
3944                          * push the old ones... this can happen because I've dropped
3945                          * the vnode lock... the sum of the
3946                          * leftovers plus the new cluster count exceeds our ability
3947                          * to represent them, so switch to the sparse cluster mechanism
3948                          *
3949                          * collect the active public clusters...
3950                          */
3951                         sparse_cluster_switch(wbp, vp, EOF);
3952
3953                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3954                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3955                                         continue;
3956                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3957                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3958                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3959
3960                                 cl_index1++;
3961                         }
3962                         /*
3963                          * update the cluster count
3964                          */
3965                         wbp->cl_number = cl_index1;
3966
3967                         /*
3968                          * and collect the original clusters that were moved into the
3969                          * local storage for sorting purposes
3970                          */
3971                         sparse_cluster_switch(wbp, vp, EOF);
3972
3973                 } else {
3974                         /*
3975                          * we've got room to merge the leftovers back in
3976                          * just append them starting at the next 'hole'
3977                          * represented by wbp->cl_number
3978                          */
3979                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3980                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3981                                         continue;
3982
3983                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3984                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3985                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3986
3987                                 cl_index1++;
3988                         }
3989                         /*
3990                          * update the cluster count
3991                          */
3992                         wbp->cl_number = cl_index1;
3993                 }
3994         }
3995         return(MAX_CLUSTERS - wbp->cl_number);
3996 }
3997
3998
3999
4000 static int
4001 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
4002 {
4003         upl_page_info_t *pl;
4004         upl_t            upl;
4005         vm_offset_t      upl_offset;
4006         int              upl_size;
4007         off_t            upl_f_offset;
4008         int              pages_in_upl;
4009         int              start_pg;
4010         int              last_pg;
4011         int              io_size;
4012         int              io_flags;
4013         int              upl_flags;
4014         int              size;
4015         int              error = 0;
4016         int              retval;
4017         kern_return_t    kret;
4018
4019
4020         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4021                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4022
4023         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4024                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4025
4026                 return (0);
4027         }
4028         upl_size = pages_in_upl * PAGE_SIZE;
4029         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4030
4031         if (upl_f_offset + upl_size >= EOF) {
4032
4033                 if (upl_f_offset >= EOF) {
4034                         /*
4035                          * must have truncated the file and missed
4036                          * clearing a dangling cluster (i.e. it's completely
4037                          * beyond the new EOF
4038                          */
4039                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4040
4041                         return(0);
4042                 }
4043                 size = EOF - upl_f_offset;
4044
4045                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4046                 pages_in_upl = upl_size / PAGE_SIZE;
4047         } else
4048                 size = upl_size;
4049
4050         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4051
4052         /*
4053          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4054          *
4055          * - only pages that are currently dirty are returned... these are the ones we need to clean
4056          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4057          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4058          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4059          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
4060          *
4061          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4062          */
4063
4064         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4065                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4066         else
4067                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4068
4069         kret = ubc_create_upl(vp,
4070                                 upl_f_offset,
4071                                 upl_size,
4072                                 &upl,
4073                                 &pl,
4074                                 upl_flags);
4075         if (kret != KERN_SUCCESS)
4076                 panic("cluster_push: failed to get pagelist");
4077
4078         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4079
4080         /*
4081          * since we only asked for the dirty pages back
4082          * it's possible that we may only get a few or even none, so...
4083          * before we start marching forward, we must make sure we know
4084          * where the last present page is in the UPL, otherwise we could
4085          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4086          * employed by commit_range and abort_range.
4087          */
4088         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4089                 if (upl_page_present(pl, last_pg))
4090                         break;
4091         }
4092         pages_in_upl = last_pg + 1;
4093
4094         if (pages_in_upl == 0) {
4095                 ubc_upl_abort(upl, 0);
4096
4097                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4098                 return(0);
4099         }
4100
4101         for (last_pg = 0; last_pg < pages_in_upl; ) {
4102                 /*
4103                  * find the next dirty page in the UPL
4104                  * this will become the first page in the
4105                  * next I/O to generate
4106                  */
4107                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4108                         if (upl_dirty_page(pl, start_pg))
4109                                 break;
4110                         if (upl_page_present(pl, start_pg))
4111                                 /*
4112                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4113                                  * just release these unchanged since we're not going
4114                                  * to steal them or change their state
4115                                  */
4116                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4117                 }
4118                 if (start_pg >= pages_in_upl)
4119                         /*
4120                          * done... no more dirty pages to push
4121                          */
4122                         break;
4123                 if (start_pg > last_pg)
4124                         /*
4125                          * skipped over some non-dirty pages
4126                          */
4127                         size -= ((start_pg - last_pg) * PAGE_SIZE);
4128
4129                 /*
4130                  * find a range of dirty pages to write
4131                  */
4132                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4133                         if (!upl_dirty_page(pl, last_pg))
4134                                 break;
4135                 }
4136                 upl_offset = start_pg * PAGE_SIZE;
4137
4138                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4139
4140                 io_flags = CL_THROTTLE | CL_COMMIT;
4141
4142                 if ( !(flags & IO_SYNC))
4143                         io_flags |= CL_ASYNC;
4144
4145                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4146                                     io_flags, (buf_t)NULL, (struct clios *)NULL);
4147
4148                 if (error == 0 && retval)
4149                         error = retval;
4150
4151                 size -= io_size;
4152         }
4153         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4154
4155         return(error);
4156 }
4157
4158
4159 /*
4160  * sparse_cluster_switch is called with the write behind lock held
4161  */
4162 static void
4163 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4164 {
4165         int     cl_index;
4166
4167         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4168
4169         if (wbp->cl_scmap == NULL)
4170                 wbp->cl_scdirty = 0;
4171
4172         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4173                 int       flags;
4174                 struct cl_extent cl;
4175
4176                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4177
4178                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4179                                 if (flags & UPL_POP_DIRTY) {
4180                                         cl.e_addr = cl.b_addr + 1;
4181
4182                                         sparse_cluster_add(wbp, vp, &cl, EOF);
4183                                 }
4184                         }
4185                 }
4186         }
4187         wbp->cl_number = 0;
4188
4189         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4190 }
4191
4192
4193 /*
4194  * sparse_cluster_push is called with the write behind lock held
4195  */
4196 static void
4197 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4198 {
4199         struct cl_extent cl;
4200         off_t           offset;
4201         u_int           length;
4202
4203         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4204
4205         if (push_all)
4206                 vfs_drt_control(&(wbp->cl_scmap), 1);
4207
4208         for (;;) {
4209                 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4210                         break;
4211
4212                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4213                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4214
4215                 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4216
4217                 cluster_push_x(vp, &cl, EOF, 0);
4218
4219                 if (push_all == 0)
4220                         break;
4221         }
4222         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4223 }
4224
4225
4226 /*
4227  * sparse_cluster_add is called with the write behind lock held
4228  */
4229 static void
4230 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4231 {
4232         u_int   new_dirty;
4233         u_int   length;
4234         off_t   offset;
4235
4236         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4237
4238         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4239         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4240
4241         while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4242                 /*
4243                  * no room left in the map
4244                  * only a partial update was done
4245                  * push out some pages and try again
4246                  */
4247                 wbp->cl_scdirty += new_dirty;
4248
4249                 sparse_cluster_push(wbp, vp, EOF, 0);
4250
4251                 offset += (new_dirty * PAGE_SIZE_64);
4252                 length -= (new_dirty * PAGE_SIZE);
4253         }
4254         wbp->cl_scdirty += new_dirty;
4255
4256         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4257 }
4258
4259
4260 static int
4261 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4262 {
4263         upl_page_info_t  *pl;
4264         upl_t            upl;
4265         addr64_t         ubc_paddr;
4266         kern_return_t    kret;
4267         int              error = 0;
4268         int              did_read = 0;
4269         int              abort_flags;
4270         int              upl_flags;
4271
4272         upl_flags = UPL_SET_LITE;
4273         if (! (flags & CL_READ)) {
4274                 /*
4275                  * "write" operation:  let the UPL subsystem know
4276                  * that we intend to modify the buffer cache pages
4277                  * we're gathering.
4278                  */
4279                 upl_flags |= UPL_WILL_MODIFY;
4280         }
4281
4282         kret = ubc_create_upl(vp,
4283                               uio->uio_offset & ~PAGE_MASK_64,
4284                               PAGE_SIZE,
4285                               &upl,
4286                               &pl,
4287                               upl_flags);
4288
4289         if (kret != KERN_SUCCESS)
4290                 return(EINVAL);
4291
4292         if (!upl_valid_page(pl, 0)) {
4293                 /*
4294                  * issue a synchronous read to cluster_io
4295                  */
4296                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4297                                    CL_READ, (buf_t)NULL, (struct clios *)NULL);
4298                 if (error) {
4299                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4300
4301                           return(error);
4302                 }
4303                 did_read = 1;
4304         }
4305         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4306
4307 /*
4308  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
4309  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4310  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
4311  *      way to do so without exporting them to kexts as well.
4312  */
4313         if (flags & CL_READ)
4314 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
4315                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
4316         else
4317 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
4318                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
4319
4320         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4321                 /*
4322                  * issue a synchronous write to cluster_io
4323                  */
4324                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4325                                         0, (buf_t)NULL, (struct clios *)NULL);
4326         }
4327         if (error == 0)
4328                 uio_update(uio, (user_size_t)xsize);
4329
4330         if (did_read)
4331                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4332         else
4333                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4334
4335         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4336
4337         return (error);
4338 }
4339
4340
4341
4342 int
4343 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4344 {
4345         int       pg_offset;
4346         int       pg_index;
4347         int       csize;
4348         int       segflg;
4349         int       retval = 0;
4350         upl_page_info_t *pl;
4351
4352         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4353                      (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4354
4355         segflg = uio->uio_segflg;
4356
4357         switch(segflg) {
4358
4359           case UIO_USERSPACE32:
4360           case UIO_USERISPACE32:
4361                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4362                 break;
4363
4364           case UIO_USERSPACE:
4365           case UIO_USERISPACE:
4366                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4367                 break;
4368
4369           case UIO_USERSPACE64:
4370           case UIO_USERISPACE64:
4371                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4372                 break;
4373
4374           case UIO_SYSSPACE32:
4375                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4376                 break;
4377
4378           case UIO_SYSSPACE:
4379                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4380                 break;
4381
4382           case UIO_SYSSPACE64:
4383                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4384                 break;
4385         }
4386         pl = ubc_upl_pageinfo(upl);
4387
4388         pg_index  = upl_offset / PAGE_SIZE;
4389         pg_offset = upl_offset & PAGE_MASK;
4390         csize     = min(PAGE_SIZE - pg_offset, xsize);
4391
4392         while (xsize && retval == 0) {
4393                 addr64_t  paddr;
4394
4395                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4396
4397                 retval = uiomove64(paddr, csize, uio);
4398
4399                 pg_index += 1;
4400                 pg_offset = 0;
4401                 xsize    -= csize;
4402                 csize     = min(PAGE_SIZE, xsize);
4403         }
4404         uio->uio_segflg = segflg;
4405
4406         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4407                      (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4408
4409         return (retval);
4410 }
4411
4412
4413 int
4414 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4415 {
4416         int       segflg;
4417         int       io_size;
4418         int       xsize;
4419         int       start_offset;
4420         int       retval = 0;
4421         memory_object_control_t  control;
4422
4423
4424         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4425                      (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4426
4427         control = ubc_getobject(vp, UBC_FLAGS_NONE);
4428         if (control == MEMORY_OBJECT_CONTROL_NULL) {
4429                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4430                              (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4431
4432                 return(0);
4433         }
4434         segflg = uio->uio_segflg;
4435
4436         switch(segflg) {
4437
4438           case UIO_USERSPACE32:
4439           case UIO_USERISPACE32:
4440                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4441                 break;
4442
4443           case UIO_USERSPACE64:
4444           case UIO_USERISPACE64:
4445                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4446                 break;
4447
4448           case UIO_SYSSPACE32:
4449                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4450                 break;
4451
4452           case UIO_SYSSPACE64:
4453                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4454                 break;
4455
4456           case UIO_USERSPACE:
4457           case UIO_USERISPACE:
4458                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4459                 break;
4460
4461           case UIO_SYSSPACE:
4462                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4463                 break;
4464         }
4465
4466         if ( (io_size = *io_resid) ) {
4467                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4468                 xsize = uio_resid(uio);
4469
4470                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4471                                                        uio, start_offset, io_size, mark_dirty);
4472                 xsize -= uio_resid(uio);
4473                 io_size -= xsize;
4474         }
4475         uio->uio_segflg = segflg;
4476         *io_resid       = io_size;
4477
4478         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4479                      (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4480
4481         return(retval);
4482 }
4483
4484
4485 int
4486 is_file_clean(vnode_t vp, off_t filesize)
4487 {
4488         off_t f_offset;
4489         int   flags;
4490         int   total_dirty = 0;
4491
4492         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4493                 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4494                         if (flags & UPL_POP_DIRTY) {
4495                                 total_dirty++;
4496                         }
4497                 }
4498         }
4499         if (total_dirty)
4500                 return(EINVAL);
4501
4502         return (0);
4503 }
4504
4505
4506
4507 /*
4508  * Dirty region tracking/clustering mechanism.
4509  *
4510  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4511  * dirty regions within a larger space (file).  It is primarily intended to
4512  * support clustering in large files with many dirty areas.
4513  *
4514  * The implementation assumes that the dirty regions are pages.
4515  *
4516  * To represent dirty pages within the file, we store bit vectors in a
4517  * variable-size circular hash.
4518  */
4519
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES             256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 *
 * The value is hard-coded below as (1 << 20), which matches the formula
 * only while DRT_BITVECTOR_PAGES * PAGE_SIZE == 1 << 20 (e.g. 256 pages
 * of 4096 bytes); it must be revisited if either value changes.
 *
 * DRT_ALIGN_ADDRESS rounds a file offset down to the start of the
 * region covered by its hashtable entry.
 */
#define DRT_ADDRESS_MASK                (~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
4535
/*
 * Hashtable address field handling.
 *
 * Each entry's dhe_control word holds the aligned file address in the
 * bits selected by DRT_ADDRESS_MASK.  The low-order bits of the
 * hashtable address are used to conserve space: they hold the entry's
 * count (DRT_HASH_COUNT_MASK bits).
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  That value is
 * DRT_HASH_COUNT_MASK itself (all count bits set), see DRT_HASH_VACATE
 * and DRT_HASH_VACANT.
 *
 * All statement-like macros expand to a single do { } while (0) without
 * a trailing semicolon, so they must be followed by ';' at the point of
 * use and are safe inside unbraced if/else bodies.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
        } while (0)
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
        } while (0)
/* wipe both the address and the count of an entry */
#define DRT_HASH_CLEAR(scm, i)                                                                                          \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = 0;                                                              \
        } while (0)
/* a bucket is vacant when its count field is all ones */
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/* copy entry 'oi' of map 'oscm' (control word and bitvector) to entry 'i' of map 'scm' */
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
                DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
        } while (0)
4570
4571
/*
 * Hash table moduli and allocation sizes.
 *
 * The size of a hashtable entry depends on the size of its bitvector, and
 * the number of entries must both be prime and fit inside the chosen
 * allocation, so these values have to be worked out by hand.
 *
 * For DRT_BITVECTOR_PAGES = 256 an entry is 40 bytes (an 8-byte control
 * word plus eight 32-bit bitvector words).
 *
 * Small map: 1024 byte allocation, modulus 23.
 * Large map: 16384 byte allocation, modulus 401.
 */
#define DRT_SMALL_ALLOCATION	1024	/* 1024 - 23 * 40: 104 bytes spare */
#define DRT_HASH_SMALL_MODULUS	23

#define DRT_LARGE_ALLOCATION	16384	/* 16384 - 401 * 40: 344 bytes spare */
#define DRT_HASH_LARGE_MODULUS	401
4590
4591 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4592
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long: bit number 'bit' of an entry lives in
 * word (bit / 32) at position (bit % 32).
 *
 * The single-bit mask is built from an unsigned constant, because shifting
 * a signed 1 into position 31 is undefined behaviour.  Each macro is fully
 * parenthesised so it can be used inside a larger expression.  'bit' is
 * evaluated twice and so must not have side effects.
 */

/* set bit 'bit' in the bitvector of slot i */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

/* clear bit 'bit' in the bitvector of slot i */
#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

/* nonzero if bit 'bit' is set in the bitvector of slot i */
#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))
4607
/*
 * Whole-bitvector operations.  The byte count is taken from the array
 * itself, which is DRT_BITVECTOR_PAGES / 32 words of 32 bits each.
 */

/* zero every bit in the bitvector of slot i */
#define DRT_BITVECTOR_CLEAR(scm, i)					\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0],		\
	    sizeof((scm)->scm_hashtable[(i)].dhe_bitvector))

/* copy the bitvector of slot oi in oscm over the bitvector of slot i in scm */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)				\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],		\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],		\
	    sizeof((scm)->scm_hashtable[(i)].dhe_bitvector))
4615
4616
4617
4618 /*
4619  * Hashtable entry.
4620  */
4621 struct vfs_drt_hashentry {
4622         u_int64_t       dhe_control;
4623         u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4624 };
4625
4626 /*
4627  * Dirty Region Tracking structure.
4628  *
4629  * The hashtable is allocated entirely inside the DRT structure.
4630  *
4631  * The hash is a simple circular prime modulus arrangement, the structure
4632  * is resized from small to large if it overflows.
4633  */
4634
4635 struct vfs_drt_clustermap {
4636         u_int32_t               scm_magic;      /* sanity/detection */
4637 #define DRT_SCM_MAGIC           0x12020003
4638         u_int32_t               scm_modulus;    /* current ring size */
4639         u_int32_t               scm_buckets;    /* number of occupied buckets */
4640         u_int32_t               scm_lastclean;  /* last entry we cleaned */
4641         u_int32_t               scm_iskips;     /* number of slot skips */
4642
4643         struct vfs_drt_hashentry scm_hashtable[0];
4644 };
4645
4646
/*
 * DRT_HASH reduces a value modulo the current ring size; DRT_HASH_NEXT
 * yields the slot after 'addr', wrapping back to 0 at the end of the ring.
 */
#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	DRT_HASH((scm), (addr) + 1)
4649
/*
 * Debugging codes and arguments.
 *
 * The note beside each code lists the arguments traced with it.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
/*
 * DRT_DEBUG_MARK
 *	start:	offset, length, dirty
 *	end:	0, setcount
 *		1 (clean, no map)
 *		2 (map alloc fail)
 *		3, resid (partial)
 */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets, lastclean, iskips */
4666
4667
4668 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4669 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4670 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4671         u_int64_t offset, int *indexp);
4672 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4673         u_int64_t offset,
4674         int *indexp,
4675         int recursed);
4676 static kern_return_t    vfs_drt_do_mark_pages(
4677         void            **cmapp,
4678         u_int64_t       offset,
4679         u_int           length,
4680         int             *setcountp,
4681         int             dirty);
4682 static void             vfs_drt_trace(
4683         struct vfs_drt_clustermap *cmap,
4684         int code,
4685         int arg1,
4686         int arg2,
4687         int arg3,
4688         int arg4);
4689
4690
4691 /*
4692  * Allocate and initialise a sparse cluster map.
4693  *
4694  * Will allocate a new map, resize or compact an existing map.
4695  *
4696  * XXX we should probably have at least one intermediate map size,
4697  * as the 1:16 ratio seems a bit drastic.
4698  */
4699 static kern_return_t
4700 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4701 {
4702         struct vfs_drt_clustermap *cmap, *ocmap;
4703         kern_return_t   kret;
4704         u_int64_t       offset;
4705         int             nsize, i, active_buckets, index, copycount;
4706
4707         ocmap = NULL;
4708         if (cmapp != NULL)
4709                 ocmap = *cmapp;
4710
4711         /*
4712          * Decide on the size of the new map.
4713          */
4714         if (ocmap == NULL) {
4715                 nsize = DRT_HASH_SMALL_MODULUS;
4716         } else {
4717                 /* count the number of active buckets in the old map */
4718                 active_buckets = 0;
4719                 for (i = 0; i < ocmap->scm_modulus; i++) {
4720                         if (!DRT_HASH_VACANT(ocmap, i) &&
4721                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4722                                 active_buckets++;
4723                 }
4724                 /*
4725                  * If we're currently using the small allocation, check to
4726                  * see whether we should grow to the large one.
4727                  */
4728                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4729                         /* if the ring is nearly full */
4730                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4731                                 nsize = DRT_HASH_LARGE_MODULUS;
4732                         } else {
4733                                 nsize = DRT_HASH_SMALL_MODULUS;
4734                         }
4735                 } else {
4736                         /* already using the large modulus */
4737                         nsize = DRT_HASH_LARGE_MODULUS;
4738                         /*
4739                          * If the ring is completely full, there's
4740                          * nothing useful for us to do.  Behave as
4741                          * though we had compacted into the new
4742                          * array and return.
4743                          */
4744                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4745                                 return(KERN_SUCCESS);
4746                 }
4747         }
4748
4749         /*
4750          * Allocate and initialise the new map.
4751          */
4752
4753         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4754             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4755         if (kret != KERN_SUCCESS)
4756                 return(kret);
4757         cmap->scm_magic = DRT_SCM_MAGIC;
4758         cmap->scm_modulus = nsize;
4759         cmap->scm_buckets = 0;
4760         cmap->scm_lastclean = 0;
4761         cmap->scm_iskips = 0;
4762         for (i = 0; i < cmap->scm_modulus; i++) {
4763                 DRT_HASH_CLEAR(cmap, i);
4764                 DRT_HASH_VACATE(cmap, i);
4765                 DRT_BITVECTOR_CLEAR(cmap, i);
4766         }
4767
4768         /*
4769          * If there's an old map, re-hash entries from it into the new map.
4770          */
4771         copycount = 0;
4772         if (ocmap != NULL) {
4773                 for (i = 0; i < ocmap->scm_modulus; i++) {
4774                         /* skip empty buckets */
4775                         if (DRT_HASH_VACANT(ocmap, i) ||
4776                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4777                                 continue;
4778                         /* get new index */
4779                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4780                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4781                         if (kret != KERN_SUCCESS) {
4782                                 /* XXX need to bail out gracefully here */
4783                                 panic("vfs_drt: new cluster map mysteriously too small");
4784                         }
4785                         /* copy */
4786                         DRT_HASH_COPY(ocmap, i, cmap, index);
4787                         copycount++;
4788                 }
4789         }
4790
4791         /* log what we've done */
4792         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4793
4794         /*
4795          * It's important to ensure that *cmapp always points to
4796          * a valid map, so we must overwrite it before freeing
4797          * the old map.
4798          */
4799         *cmapp = cmap;
4800         if (ocmap != NULL) {
4801                 /* emit stats into trace buffer */
4802                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4803                               ocmap->scm_modulus,
4804                               ocmap->scm_buckets,
4805                               ocmap->scm_lastclean,
4806                               ocmap->scm_iskips);
4807
4808                 vfs_drt_free_map(ocmap);
4809         }
4810         return(KERN_SUCCESS);
4811 }
4812
4813
4814 /*
4815  * Free a sparse cluster map.
4816  */
4817 static kern_return_t
4818 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4819 {
4820         kmem_free(kernel_map, (vm_offset_t)cmap,
4821                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4822         return(KERN_SUCCESS);
4823 }
4824
4825
4826 /*
4827  * Find the hashtable slot currently occupied by an entry for the supplied offset.
4828  */
4829 static kern_return_t
4830 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4831 {
4832         int             index, i;
4833
4834         offset = DRT_ALIGN_ADDRESS(offset);
4835         index = DRT_HASH(cmap, offset);
4836
4837         /* traverse the hashtable */
4838         for (i = 0; i < cmap->scm_modulus; i++) {
4839
4840                 /*
4841                  * If the slot is vacant, we can stop.
4842                  */
4843                 if (DRT_HASH_VACANT(cmap, index))
4844                         break;
4845
4846                 /*
4847                  * If the address matches our offset, we have success.
4848                  */
4849                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4850                         *indexp = index;
4851                         return(KERN_SUCCESS);
4852                 }
4853
4854                 /*
4855                  * Move to the next slot, try again.
4856                  */
4857                 index = DRT_HASH_NEXT(cmap, index);
4858         }
4859         /*
4860          * It's not there.
4861          */
4862         return(KERN_FAILURE);
4863 }
4864
4865 /*
4866  * Find the hashtable slot for the supplied offset.  If we haven't allocated
4867  * one yet, allocate one and populate the address field.  Note that it will
4868  * not have a nonzero page count and thus will still technically be free, so
4869  * in the case where we are called to clean pages, the slot will remain free.
4870  */
4871 static kern_return_t
4872 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4873 {
4874         struct vfs_drt_clustermap *cmap;
4875         kern_return_t   kret;
4876         int             index, i;
4877
4878         cmap = *cmapp;
4879
4880         /* look for an existing entry */
4881         kret = vfs_drt_search_index(cmap, offset, indexp);
4882         if (kret == KERN_SUCCESS)
4883                 return(kret);
4884
4885         /* need to allocate an entry */
4886         offset = DRT_ALIGN_ADDRESS(offset);
4887         index = DRT_HASH(cmap, offset);
4888
4889         /* scan from the index forwards looking for a vacant slot */
4890         for (i = 0; i < cmap->scm_modulus; i++) {
4891                 /* slot vacant? */
4892                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4893                         cmap->scm_buckets++;
4894                         if (index < cmap->scm_lastclean)
4895                                 cmap->scm_lastclean = index;
4896                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
4897                         DRT_HASH_SET_COUNT(cmap, index, 0);
4898                         DRT_BITVECTOR_CLEAR(cmap, index);
4899                         *indexp = index;
4900                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4901                         return(KERN_SUCCESS);
4902                 }
4903                 cmap->scm_iskips += i;
4904                 index = DRT_HASH_NEXT(cmap, index);
4905         }
4906
4907         /*
4908          * We haven't found a vacant slot, so the map is full.  If we're not
4909          * already recursed, try reallocating/compacting it.
4910          */
4911         if (recursed)
4912                 return(KERN_FAILURE);
4913         kret = vfs_drt_alloc_map(cmapp);
4914         if (kret == KERN_SUCCESS) {
4915                 /* now try to insert again */
4916                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4917         }
4918         return(kret);
4919 }
4920
4921 /*
4922  * Implementation of set dirty/clean.
4923  *
4924  * In the 'clean' case, not finding a map is OK.
4925  */
4926 static kern_return_t
4927 vfs_drt_do_mark_pages(
4928         void            **private,
4929         u_int64_t       offset,
4930         u_int           length,
4931         int             *setcountp,
4932         int             dirty)
4933 {
4934         struct vfs_drt_clustermap *cmap, **cmapp;
4935         kern_return_t   kret;
4936         int             i, index, pgoff, pgcount, setcount, ecount;
4937
4938         cmapp = (struct vfs_drt_clustermap **)private;
4939         cmap = *cmapp;
4940
4941         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4942
4943         if (setcountp != NULL)
4944                 *setcountp = 0;
4945
4946         /* allocate a cluster map if we don't already have one */
4947         if (cmap == NULL) {
4948                 /* no cluster map, nothing to clean */
4949                 if (!dirty) {
4950                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4951                         return(KERN_SUCCESS);
4952                 }
4953                 kret = vfs_drt_alloc_map(cmapp);
4954                 if (kret != KERN_SUCCESS) {
4955                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4956                         return(kret);
4957                 }
4958         }
4959         setcount = 0;
4960
4961         /*
4962          * Iterate over the length of the region.
4963          */
4964         while (length > 0) {
4965                 /*
4966                  * Get the hashtable index for this offset.
4967                  *
4968                  * XXX this will add blank entries if we are clearing a range
4969                  * that hasn't been dirtied.
4970                  */
4971                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4972                 cmap = *cmapp;  /* may have changed! */
4973                 /* this may be a partial-success return */
4974                 if (kret != KERN_SUCCESS) {
4975                         if (setcountp != NULL)
4976                                 *setcountp = setcount;
4977                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4978
4979                         return(kret);
4980                 }
4981
4982                 /*
4983                  * Work out how many pages we're modifying in this
4984                  * hashtable entry.
4985                  */
4986                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4987                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4988
4989                 /*
4990                  * Iterate over pages, dirty/clearing as we go.
4991                  */
4992                 ecount = DRT_HASH_GET_COUNT(cmap, index);
4993                 for (i = 0; i < pgcount; i++) {
4994                         if (dirty) {
4995                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4996                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4997                                         ecount++;
4998                                         setcount++;
4999                                 }
5000                         } else {
5001                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5002                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5003                                         ecount--;
5004                                         setcount++;
5005                                 }
5006                         }
5007                 }
5008                 DRT_HASH_SET_COUNT(cmap, index, ecount);
5009
5010                 offset += pgcount * PAGE_SIZE;
5011                 length -= pgcount * PAGE_SIZE;
5012         }
5013         if (setcountp != NULL)
5014                 *setcountp = setcount;
5015
5016         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5017
5018         return(KERN_SUCCESS);
5019 }
5020
5021 /*
5022  * Mark a set of pages as dirty/clean.
5023  *
5024  * This is a public interface.
5025  *
5026  * cmapp
5027  *      Pointer to storage suitable for holding a pointer.  Note that
5028  *      this must either be NULL or a value set by this function.
5029  *
5030  * size
5031  *      Current file size in bytes.
5032  *
5033  * offset
5034  *      Offset of the first page to be marked as dirty, in bytes.  Must be
5035  *      page-aligned.
5036  *
5037  * length
5038  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
5039  *
5040  * setcountp
5041  *      Number of pages newly marked dirty by this call (optional).
5042  *
5043  * Returns KERN_SUCCESS if all the pages were successfully marked.
5044  */
5045 static kern_return_t
5046 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5047 {
5048         /* XXX size unused, drop from interface */
5049         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5050 }
5051
#if 0
/*
 * Mark a set of pages as clean (currently compiled out).
 *
 * Same arguments as vfs_drt_mark_pages, without the change count.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	const int	mark_dirty = 0;

	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, mark_dirty);
}
#endif
5059
5060 /*
5061  * Get a cluster of dirty pages.
5062  *
5063  * This is a public interface.
5064  *
5065  * cmapp
5066  *      Pointer to storage managed by drt_mark_pages.  Note that this must
5067  *      be NULL or a value set by drt_mark_pages.
5068  *
5069  * offsetp
5070  *      Returns the byte offset into the file of the first page in the cluster.
5071  *
5072  * lengthp
5073  *      Returns the length in bytes of the cluster of dirty pages.
5074  *
5075  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
5076  * are no dirty pages meeting the minmum size criteria.  Private storage will
5077  * be released if there are no more dirty pages left in the map
5078  *
5079  */
5080 static kern_return_t
5081 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5082 {
5083         struct vfs_drt_clustermap *cmap;
5084         u_int64_t       offset;
5085         u_int           length;
5086         int             index, i, j, fs, ls;
5087
5088         /* sanity */
5089         if ((cmapp == NULL) || (*cmapp == NULL))
5090                 return(KERN_FAILURE);
5091         cmap = *cmapp;
5092
5093         /* walk the hashtable */
5094         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5095                 index = DRT_HASH(cmap, offset);
5096
5097                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5098                         continue;
5099
5100                 /* scan the bitfield for a string of bits */
5101                 fs = -1;
5102
5103                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5104                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5105                                 fs = i;
5106                                 break;
5107                         }
5108                 }
5109                 if (fs == -1) {
5110                         /*  didn't find any bits set */
5111                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
5112                 }
5113                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5114                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
5115                                 break;
5116                 }
5117
5118                 /* compute offset and length, mark pages clean */
5119                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5120                 length = ls * PAGE_SIZE;
5121                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5122                 cmap->scm_lastclean = index;
5123
5124                 /* return successful */
5125                 *offsetp = (off_t)offset;
5126                 *lengthp = length;
5127
5128                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5129                 return(KERN_SUCCESS);
5130         }
5131         /*
5132          * We didn't find anything... hashtable is empty
5133          * emit stats into trace buffer and
5134          * then free it
5135          */
5136         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5137                       cmap->scm_modulus,
5138                       cmap->scm_buckets,
5139                       cmap->scm_lastclean,
5140                       cmap->scm_iskips);
5141
5142         vfs_drt_free_map(cmap);
5143         *cmapp = NULL;
5144
5145         return(KERN_FAILURE);
5146 }
5147
5148
5149 static kern_return_t
5150 vfs_drt_control(void **cmapp, int op_type)
5151 {
5152         struct vfs_drt_clustermap *cmap;
5153
5154         /* sanity */
5155         if ((cmapp == NULL) || (*cmapp == NULL))
5156                 return(KERN_FAILURE);
5157         cmap = *cmapp;
5158
5159         switch (op_type) {
5160         case 0:
5161                 /* emit stats into trace buffer */
5162                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5163                               cmap->scm_modulus,
5164                               cmap->scm_buckets,
5165                               cmap->scm_lastclean,
5166                               cmap->scm_iskips);
5167
5168                 vfs_drt_free_map(cmap);
5169                 *cmapp = NULL;
5170                 break;
5171
5172         case 1:
5173                 cmap->scm_lastclean = 0;
5174                 break;
5175         }
5176         return(KERN_SUCCESS);
5177 }
5178
5179
5180
5181 /*
5182  * Emit a summary of the state of the clustermap into the trace buffer
5183  * along with some caller-provided data.
5184  */
5185 #if KDEBUG
5186 static void
5187 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5188 {
5189         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5190 }
5191 #else
5192 static void
5193 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5194                           __unused int arg1, __unused int arg2, __unused int arg3,
5195                           __unused int arg4)
5196 {
5197 }
5198 #endif
5199
#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry (currently compiled out).
 *
 * Panics on the first non-vacant entry whose count disagrees with the
 * number of bits set in its bitvector.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int slot, page, bits_on;

	for (slot = 0; slot < cmap->scm_modulus; slot++) {
		if (DRT_HASH_VACANT(cmap, slot))
			continue;

		/* count the bits actually set in this entry */
		bits_on = 0;
		for (page = 0; page < DRT_BITVECTOR_PAGES; page++) {
			if (DRT_HASH_TEST_BIT(cmap, slot, page))
				bits_on++;
		}

		if (bits_on != DRT_HASH_GET_COUNT(cmap, slot))
			panic("bits_on = %d,  index = %d\n", bits_on, slot);
	}
}
#endif