[apple/xnu.git] / bsd / vfs / vfs_cluster.c (xnu-792.22.5)
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <sys/malloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <sys/uio_internal.h>
75 #include <libkern/libkern.h>
76 #include <machine/machine_routines.h>
77
78 #include <sys/ubc_internal.h>
79
80 #include <mach/mach_types.h>
81 #include <mach/memory_object_types.h>
82 #include <mach/vm_map.h>
83 #include <mach/upl.h>
84
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_pageout.h>
88
89 #include <sys/kdebug.h>
90
91
92 #define CL_READ 0x01
93 #define CL_ASYNC 0x02
94 #define CL_COMMIT 0x04
95 #define CL_PAGEOUT 0x10
96 #define CL_AGE 0x20
97 #define CL_DUMP 0x40
98 #define CL_NOZERO 0x80
99 #define CL_PAGEIN 0x100
100 #define CL_DEV_MEMORY 0x200
101 #define CL_PRESERVE 0x400
102 #define CL_THROTTLE 0x800
103 #define CL_KEEPCACHED 0x1000
104
105
106 struct clios {
107 u_int io_completed; /* amount of io that has currently completed */
108 u_int io_issued; /* amount of io that was successfully issued */
109 int io_error; /* error code of first error encountered */
110 int io_wanted; /* someone is sleeping waiting for a change in state */
111 };
112
113 static lck_grp_t *cl_mtx_grp;
114 static lck_attr_t *cl_mtx_attr;
115 static lck_grp_attr_t *cl_mtx_grp_attr;
116 static lck_mtx_t *cl_mtxp;
117
118
119 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
120 int flags, buf_t real_bp, struct clios *iostate);
121 static int cluster_iodone(buf_t bp, void *dummy);
122 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
123 static int cluster_hard_throttle_on(vnode_t vp);
124
125 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
126 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
127 off_t headOff, off_t tailOff, int flags);
128 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
129 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
130 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
131 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
132 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
133
134 static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
135
136 static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
137 static void cluster_push_EOF(vnode_t vp, off_t EOF);
138
139 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
140
141 static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
142 static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
143 static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
144
145 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
146 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
147 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
148
149 int is_file_clean(vnode_t, off_t);
150
151 /*
152 * throttle the number of async writes that
153 * can be outstanding on a single vnode
154 * before we issue a synchronous write
155 */
156 #define HARD_THROTTLE_MAXCNT 0
157 #define HARD_THROTTLE_MAXSIZE (64 * 1024)
158
159 int hard_throttle_on_root = 0;
160 struct timeval priority_IO_timestamp_for_root;
161
162
163 void
164 cluster_init(void) {
165 /*
166 * allocate lock group attribute and group
167 */
168 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
169 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
170
171 /*
172 * allocate the lock attribute
173 */
174 cl_mtx_attr = lck_attr_alloc_init();
175
176 /*
177 * allocate and initialize mutex's used to protect updates and waits
178 * on the cluster_io context
179 */
180 cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
181
182 if (cl_mtxp == NULL)
183 panic("cluster_init: failed to allocate cl_mtxp");
184 }
185
186
187
188 #define CLW_ALLOCATE 0x01
189 #define CLW_RETURNLOCKED 0x02
190 /*
191 * if the read ahead context doesn't yet exist,
192 * allocate and initialize it...
193 * the vnode lock serializes multiple callers
194 * during the actual assignment... first one
195 * to grab the lock wins... the other callers
196 * will release the now unnecessary storage
197 *
198 * once the context is present, try to grab (but don't block on)
199 * the lock associated with it... if someone
200  * else currently owns it, then the read
201 * will run without read-ahead. this allows
202 * multiple readers to run in parallel and
203 * since there's only 1 read ahead context,
204 * there's no real loss in only allowing 1
205 * reader to have read-ahead enabled.
206 */
207 static struct cl_readahead *
208 cluster_get_rap(vnode_t vp)
209 {
210 struct ubc_info *ubc;
211 struct cl_readahead *rap;
212
213 ubc = vp->v_ubcinfo;
214
215 if ((rap = ubc->cl_rahead) == NULL) {
216 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
217
218 bzero(rap, sizeof *rap);
219 rap->cl_lastr = -1;
220 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
221
222 vnode_lock(vp);
223
224 if (ubc->cl_rahead == NULL)
225 ubc->cl_rahead = rap;
226 else {
227 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
228 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
229 rap = ubc->cl_rahead;
230 }
231 vnode_unlock(vp);
232 }
233 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
234 return(rap);
235
236 return ((struct cl_readahead *)NULL);
237 }
238
239
240 /*
241 * if the write behind context doesn't yet exist,
242 * and CLW_ALLOCATE is specified, allocate and initialize it...
243 * the vnode lock serializes multiple callers
244 * during the actual assignment... first one
245 * to grab the lock wins... the other callers
246 * will release the now unnecessary storage
247 *
248 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
249 * the lock associated with the write behind context before
250 * returning
251 */
252
253 static struct cl_writebehind *
254 cluster_get_wbp(vnode_t vp, int flags)
255 {
256 struct ubc_info *ubc;
257 struct cl_writebehind *wbp;
258
259 ubc = vp->v_ubcinfo;
260
261 if ((wbp = ubc->cl_wbehind) == NULL) {
262
263 if ( !(flags & CLW_ALLOCATE))
264 return ((struct cl_writebehind *)NULL);
265
266 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
267
268 bzero(wbp, sizeof *wbp);
269 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
270
271 vnode_lock(vp);
272
273 if (ubc->cl_wbehind == NULL)
274 ubc->cl_wbehind = wbp;
275 else {
276 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
277 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
278 wbp = ubc->cl_wbehind;
279 }
280 vnode_unlock(vp);
281 }
282 if (flags & CLW_RETURNLOCKED)
283 lck_mtx_lock(&wbp->cl_lockw);
284
285 return (wbp);
286 }
287
288
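/*
 * return non-zero if I/O against this vnode should be heavily throttled...
 * only applies to the root device, and only while hard_throttle_on_root is
 * set or a priority I/O against root has occurred within the last 200 ms
 */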
289 static int
290 cluster_hard_throttle_on(vnode_t vp)
291 {
292 static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
293
294 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
295 struct timeval elapsed;
296
297 if (hard_throttle_on_root)
298 return(1);
299
300 microuptime(&elapsed);
301 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
302
303 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
304 return(1);
305 }
306 return(0);
307 }
308
309
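/*
 * completion routine for the buffers issued by cluster_io... each buffer
 * points back at its transaction head and nothing happens until every
 * buffer in the chain has B_DONE set... at that point the per-buffer
 * results are rolled up, any waiting iostate is updated and woken, the
 * 'real' buf (if any) is biodone'd, and the upl is committed or aborted
 * as indicated by the saved b_flags
 */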
310 static int
311 cluster_iodone(buf_t bp, __unused void *dummy)
312 {
313 int b_flags;
314 int error;
315 int total_size;
316 int total_resid;
317 int upl_offset;
318 int zero_offset;
319 upl_t upl;
320 buf_t cbp;
321 buf_t cbp_head;
322 buf_t cbp_next;
323 buf_t real_bp;
324 struct clios *iostate;
325 int commit_size;
326 int pg_offset;
327
328 cbp_head = (buf_t)(bp->b_trans_head);
329
330 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
331 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
332
333 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
334 /*
335 * all I/O requests that are part of this transaction
336 * have to complete before we can process it
337 */
338 if ( !(cbp->b_flags & B_DONE)) {
339
340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
341 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
342
343 return 0;
344 }
345 }
346 error = 0;
347 total_size = 0;
348 total_resid = 0;
349
350 cbp = cbp_head;
351 upl_offset = cbp->b_uploffset;
352 upl = cbp->b_upl;
353 b_flags = cbp->b_flags;
354 real_bp = cbp->b_real_bp;
355 zero_offset= cbp->b_validend;
356 iostate = (struct clios *)cbp->b_iostate;
357
358 if (real_bp)
359 real_bp->b_dev = cbp->b_dev;
360
361 while (cbp) {
362 if ((cbp->b_flags & B_ERROR) && error == 0)
363 error = cbp->b_error;
364
365 total_resid += cbp->b_resid;
366 total_size += cbp->b_bcount;
367
368 cbp_next = cbp->b_trans_next;
369
370 free_io_buf(cbp);
371
372 cbp = cbp_next;
373 }
374 if (zero_offset)
375 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
376
377 if (iostate) {
378 int need_wakeup = 0;
379
380 /*
381  * someone has issued multiple I/Os asynchronously
382 * and is waiting for them to complete (streaming)
383 */
384 lck_mtx_lock(cl_mtxp);
385
386 if (error && iostate->io_error == 0)
387 iostate->io_error = error;
388
389 iostate->io_completed += total_size;
390
391 if (iostate->io_wanted) {
392 /*
393 * someone is waiting for the state of
394 * this io stream to change
395 */
396 iostate->io_wanted = 0;
397 need_wakeup = 1;
398 }
399 lck_mtx_unlock(cl_mtxp);
400
401 if (need_wakeup)
402 wakeup((caddr_t)&iostate->io_wanted);
403 }
404 if ((b_flags & B_NEED_IODONE) && real_bp) {
405 if (error) {
406 real_bp->b_flags |= B_ERROR;
407 real_bp->b_error = error;
408 }
409 real_bp->b_resid = total_resid;
410
411 buf_biodone(real_bp);
412 }
413 if (error == 0 && total_resid)
414 error = EIO;
415
416 if (b_flags & B_COMMIT_UPL) {
417 pg_offset = upl_offset & PAGE_MASK;
418 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
419
420 if (error || (b_flags & B_NOCACHE)) {
421 int upl_abort_code;
422 int page_in = 0;
423 int page_out = 0;
424
425 if (b_flags & B_PAGEIO) {
426 if (b_flags & B_READ)
427 page_in = 1;
428 else
429 page_out = 1;
430 }
431 if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
432 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
433 else if (page_out && (error != ENXIO)) /* transient error */
434 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
435 else if (page_in)
436 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
437 else
438 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
439
440 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
441 upl_abort_code);
442
443 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
444 (int)upl, upl_offset - pg_offset, commit_size,
445 0x80000000|upl_abort_code, 0);
446
447 } else {
448 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
449
450 if ((b_flags & B_PHYS) && (b_flags & B_READ))
451 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
452
453 if (b_flags & B_AGE)
454 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
455
456 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
457 upl_commit_flags);
458
459 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
460 (int)upl, upl_offset - pg_offset, commit_size,
461 upl_commit_flags, 0);
462 }
463 } else {
464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
465 (int)upl, upl_offset, 0, error, 0);
466 }
467
468 return (error);
469 }
470
471
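/*
 * zero 'size' bytes of the upl starting at 'upl_offset'... if the caller
 * supplied a buf with a kernel mapping (b_datap), zero through that
 * mapping, otherwise zero the upl's physical pages directly via bzero_phys
 */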
472 void
473 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
474 {
475 upl_page_info_t *pl;
476
477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
478 upl_offset, size, (int)bp, 0, 0);
479
480 if (bp == NULL || bp->b_datap == 0) {
481
482 pl = ubc_upl_pageinfo(upl);
483
484 while (size) {
485 int page_offset;
486 int page_index;
487 addr64_t zero_addr;
488 int zero_cnt;
489
490 page_index = upl_offset / PAGE_SIZE;
491 page_offset = upl_offset & PAGE_MASK;
492
493 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
494 zero_cnt = min(PAGE_SIZE - page_offset, size);
495
496 bzero_phys(zero_addr, zero_cnt);
497
498 size -= zero_cnt;
499 upl_offset += zero_cnt;
500 }
501 } else
502 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
503
504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
505 upl_offset, size, 0, 0, 0);
506 }
507
508
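/*
 * core I/O engine for the cluster layer... carves the range described by
 * (upl, upl_offset, f_offset, non_rounded_size) into bufs sized by what
 * VNOP_BLOCKMAP returns and by the device's max transfer and segment
 * limits, chains them into transactions and hands them to VNOP_STRATEGY...
 * holes (blkno == -1) are zero filled on reads and pushed or failed on
 * writes... synchronous callers wait here for completion, CL_ASYNC
 * requests are finished later by cluster_iodone
 */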
509 static int
510 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
511 int flags, buf_t real_bp, struct clios *iostate)
512 {
513 buf_t cbp;
514 u_int size;
515 u_int io_size;
516 int io_flags;
517 int bmap_flags;
518 int error = 0;
519 int retval = 0;
520 buf_t cbp_head = NULL;
521 buf_t cbp_tail = NULL;
522 int trans_count = 0;
523 u_int pg_count;
524 int pg_offset;
525 u_int max_iosize;
526 u_int max_vectors;
527 int priv;
528 int zero_offset = 0;
529 int async_throttle = 0;
530 mount_t mp;
531
532 mp = vp->v_mount;
533
534 if (mp->mnt_devblocksize > 1) {
535 /*
536 * round the requested size up so that this I/O ends on a
537 * page boundary in case this is a 'write'... if the filesystem
538 * has blocks allocated to back the page beyond the EOF, we want to
539 * make sure to write out the zero's that are sitting beyond the EOF
540 * so that in case the filesystem doesn't explicitly zero this area
541 * if a hole is created via a lseek/write beyond the current EOF,
542 * it will return zeros when it's read back from the disk. If the
543 * physical allocation doesn't extend for the whole page, we'll
544 * only write/read from the disk up to the end of this allocation
545 * via the extent info returned from the VNOP_BLOCKMAP call.
546 */
547 pg_offset = upl_offset & PAGE_MASK;
548
549 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
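		/*
		 * e.g. with PAGE_SIZE 4096, upl_offset 0x1200 and non_rounded_size 0x2d00,
		 * pg_offset is 0x200 and size becomes ((0x2d00 + 0x200 + 0xfff) & ~0xfff) - 0x200,
		 * i.e. 0x2e00, so that upl_offset + size (0x1200 + 0x2e00 = 0x4000) ends
		 * on a page boundary
		 */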
550 } else {
551 /*
552 * anyone advertising a blocksize of 1 byte probably
553 * can't deal with us rounding up the request size
554 * AFP is one such filesystem/device
555 */
556 size = non_rounded_size;
557 }
558 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
559 (int)f_offset, size, upl_offset, flags, 0);
560
561 if (flags & CL_READ) {
562 io_flags = (B_READ);
563 bmap_flags = VNODE_READ;
564
565 max_iosize = mp->mnt_maxreadcnt;
566 max_vectors = mp->mnt_segreadcnt;
567 } else {
568 io_flags = 0;
569 bmap_flags = VNODE_WRITE;
570
571 max_iosize = mp->mnt_maxwritecnt;
572 max_vectors = mp->mnt_segwritecnt;
573 }
574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
575
576 /*
577 * make sure the maximum iosize is a
578 * multiple of the page size
579 */
580 max_iosize &= ~PAGE_MASK;
581
582 if (flags & CL_THROTTLE) {
583 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
584 if (max_iosize > HARD_THROTTLE_MAXSIZE)
585 max_iosize = HARD_THROTTLE_MAXSIZE;
586 async_throttle = HARD_THROTTLE_MAXCNT;
587 } else
588 async_throttle = VNODE_ASYNC_THROTTLE;
589 }
590 if (flags & CL_AGE)
591 io_flags |= B_AGE;
592 if (flags & CL_DUMP)
593 io_flags |= B_NOCACHE;
594 if (flags & (CL_PAGEIN | CL_PAGEOUT))
595 io_flags |= B_PAGEIO;
596 if (flags & CL_COMMIT)
597 io_flags |= B_COMMIT_UPL;
598 if (flags & CL_PRESERVE)
599 io_flags |= B_PHYS;
600 if (flags & CL_KEEPCACHED)
601 io_flags |= B_CACHE;
602
603 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
604 /*
605 * then we are going to end up
606 * with a page that we can't complete (the file size wasn't a multiple
607  * of PAGE_SIZE and we're trying to read to the end of the file),
608 * so we'll go ahead and zero out the portion of the page we can't
609 * read in from the file
610 */
611 zero_offset = upl_offset + non_rounded_size;
612 }
613 while (size) {
614 int pg_resid;
615 daddr64_t blkno;
616 daddr64_t lblkno;
617
618 if (size > max_iosize)
619 io_size = max_iosize;
620 else
621 io_size = size;
622
623 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
624 break;
625 }
626 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
627 real_bp->b_blkno = blkno;
628
629 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
630 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
631
632 if (io_size == 0) {
633 /*
634 * vnop_blockmap didn't return an error... however, it did
635 * return an extent size of 0 which means we can't
636 * make forward progress on this I/O... a hole in the
637 * file would be returned as a blkno of -1 with a non-zero io_size
638 * a real extent is returned with a blkno != -1 and a non-zero io_size
639 */
640 error = EINVAL;
641 break;
642 }
643 if ( !(flags & CL_READ) && blkno == -1) {
644 off_t e_offset;
645
646 /*
647 * we're writing into a 'hole'
648 */
649 if (flags & CL_PAGEOUT) {
650 /*
651 * if we got here via cluster_pageout
652 * then just error the request and return
653 * the 'hole' should already have been covered
654 */
655 error = EINVAL;
656 break;
657 }
658 if ( !(flags & CL_COMMIT)) {
659 /*
660 * currently writes always request the commit to happen
661 * as part of the io completion... however, if the CL_COMMIT
662  * flag isn't specified, then we can't issue the abort_range
663 * since the call site is going to abort or commit the same upl..
664 * in this case we can only return an error
665 */
666 error = EINVAL;
667 break;
668 }
669 /*
670 * we can get here if the cluster code happens to
671 * pick up a page that was dirtied via mmap vs
672 * a 'write' and the page targets a 'hole'...
673 * i.e. the writes to the cluster were sparse
674 * and the file was being written for the first time
675 *
676 * we can also get here if the filesystem supports
677 * 'holes' that are less than PAGE_SIZE.... because
678 * we can't know if the range in the page that covers
679 * the 'hole' has been dirtied via an mmap or not,
680 * we have to assume the worst and try to push the
681 * entire page to storage.
682 *
683 * Try paging out the page individually before
684 * giving up entirely and dumping it (the pageout
685  * path will ensure that the zero extent accounting
686 * has been taken care of before we get back into cluster_io)
687 */
688 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
689
690 e_offset = round_page_64(f_offset + 1);
691
692 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
693 error = EINVAL;
694 break;
695 }
696 io_size = e_offset - f_offset;
697
698 f_offset += io_size;
699 upl_offset += io_size;
700
701 if (size >= io_size)
702 size -= io_size;
703 else
704 size = 0;
705 /*
706 * keep track of how much of the original request
707 * that we've actually completed... non_rounded_size
708 * may go negative due to us rounding the request
709 * to a page size multiple (i.e. size > non_rounded_size)
710 */
711 non_rounded_size -= io_size;
712
713 if (non_rounded_size <= 0) {
714 /*
715 * we've transferred all of the data in the original
716 * request, but we were unable to complete the tail
717 * of the last page because the file didn't have
718 * an allocation to back that portion... this is ok.
719 */
720 size = 0;
721 }
722 continue;
723 }
724 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
725 /*
726 * we have now figured out how much I/O we can do - this is in 'io_size'
727 * pg_offset is the starting point in the first page for the I/O
728 * pg_count is the number of full and partial pages that 'io_size' encompasses
729 */
730 pg_offset = upl_offset & PAGE_MASK;
731
732 if (flags & CL_DEV_MEMORY) {
733 /*
734 * currently, can't deal with reading 'holes' in file
735 */
736 if (blkno == -1) {
737 error = EINVAL;
738 break;
739 }
740 /*
741 * treat physical requests as one 'giant' page
742 */
743 pg_count = 1;
744 } else
745 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
746
747 if ((flags & CL_READ) && blkno == -1) {
748 int bytes_to_zero;
749
750 /*
751 * if we're reading and blkno == -1, then we've got a
752 * 'hole' in the file that we need to deal with by zeroing
753 * out the affected area in the upl
754 */
755 if (zero_offset && io_size == size) {
756 /*
757 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
758  * then 'zero_offset' will be non-zero
759  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
760  * (indicated by the io_size finishing off the I/O request for this UPL)
761  * then we're not going to issue an I/O for the
762 * last page in this upl... we need to zero both the hole and the tail
763 * of the page beyond the EOF, since the delayed zero-fill won't kick in
764 */
765 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
766
767 zero_offset = 0;
768 } else
769 bytes_to_zero = io_size;
770
771 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
772
773 if (cbp_head)
774 /*
775 * if there is a current I/O chain pending
776 * then the first page of the group we just zero'd
777 * will be handled by the I/O completion if the zero
778 * fill started in the middle of the page
779 */
780 pg_count = (io_size - pg_offset) / PAGE_SIZE;
781 else {
782 /*
783 * no pending I/O to pick up that first page
784 * so, we have to make sure it gets committed
785 * here.
786 * set the pg_offset to 0 so that the upl_commit_range
787 * starts with this page
788 */
789 pg_count = (io_size + pg_offset) / PAGE_SIZE;
790 pg_offset = 0;
791 }
792 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
793 /*
794 * if we're done with the request for this UPL
795 * then we have to make sure to commit the last page
796 * even if we only partially zero-filled it
797 */
798 pg_count++;
799
800 if (pg_count) {
801 if (pg_offset)
802 pg_resid = PAGE_SIZE - pg_offset;
803 else
804 pg_resid = 0;
805
806 if (flags & CL_COMMIT)
807 ubc_upl_commit_range(upl,
808 (upl_offset + pg_resid) & ~PAGE_MASK,
809 pg_count * PAGE_SIZE,
810 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
811 }
812 upl_offset += io_size;
813 f_offset += io_size;
814 size -= io_size;
815 /*
816 * keep track of how much of the original request
817 * that we've actually completed... non_rounded_size
818 * may go negative due to us rounding the request
819 * to a page size multiple (i.e. size > non_rounded_size)
820 */
821 non_rounded_size -= io_size;
822
823 if (non_rounded_size <= 0) {
824 /*
825 * we've transferred all of the data in the original
826 * request, but we were unable to complete the tail
827 * of the last page because the file didn't have
828 * an allocation to back that portion... this is ok.
829 */
830 size = 0;
831 }
832 if (cbp_head && pg_count)
833 goto start_io;
834 continue;
835
836 }
837 if (pg_count > max_vectors) {
838 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
839 io_size = PAGE_SIZE - pg_offset;
840 pg_count = 1;
841 } else {
842 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
843 pg_count = max_vectors;
844 }
845 }
846
847 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
848 /*
849 * if we're not targeting a virtual device i.e. a disk image
850 * it's safe to dip into the reserve pool since real devices
851 * can complete this I/O request without requiring additional
852 * bufs from the alloc_io_buf pool
853 */
854 priv = 1;
855 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
856 /*
857 * Throttle the speculative IO
858 */
859 priv = 0;
860 else
861 priv = 1;
862
863 cbp = alloc_io_buf(vp, priv);
864
865 if (flags & CL_PAGEOUT) {
866 u_int i;
867
868 for (i = 0; i < pg_count; i++) {
869 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
870 panic("BUSY bp found in cluster_io");
871 }
872 }
873 if (flags & CL_ASYNC) {
874 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
875 panic("buf_setcallback failed\n");
876 }
877 cbp->b_flags |= io_flags;
878
879 cbp->b_lblkno = lblkno;
880 cbp->b_blkno = blkno;
881 cbp->b_bcount = io_size;
882
883 if (buf_setupl(cbp, upl, upl_offset))
884 panic("buf_setupl failed\n");
885
886 cbp->b_trans_next = (buf_t)NULL;
887
888 if ((cbp->b_iostate = (void *)iostate))
889 /*
890 * caller wants to track the state of this
891 * io... bump the amount issued against this stream
892 */
893 iostate->io_issued += io_size;
894
895 if (flags & CL_READ) {
896 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
897 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
898 }
899 else {
900 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
901 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
902 }
903
904 if (cbp_head) {
905 cbp_tail->b_trans_next = cbp;
906 cbp_tail = cbp;
907 } else {
908 cbp_head = cbp;
909 cbp_tail = cbp;
910 }
911 (buf_t)(cbp->b_trans_head) = cbp_head;
912 trans_count++;
913
914 upl_offset += io_size;
915 f_offset += io_size;
916 size -= io_size;
917 /*
918 * keep track of how much of the original request
919 * that we've actually completed... non_rounded_size
920 * may go negative due to us rounding the request
921 * to a page size multiple (i.e. size > non_rounded_size)
922 */
923 non_rounded_size -= io_size;
924
925 if (non_rounded_size <= 0) {
926 /*
927 * we've transferred all of the data in the original
928 * request, but we were unable to complete the tail
929 * of the last page because the file didn't have
930 * an allocation to back that portion... this is ok.
931 */
932 size = 0;
933 }
934 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
935 /*
936 * if we have no more I/O to issue or
937 * the current I/O we've prepared fully
938 * completes the last page in this request
939 * and it's either an ASYNC request or
940 * we've already accumulated more than 8 I/O's into
941 * this transaction and it's not an I/O directed to
942 * special DEVICE memory
943 * then go ahead and issue the I/O
944 */
945 start_io:
946 if (real_bp) {
947 cbp_head->b_flags |= B_NEED_IODONE;
948 cbp_head->b_real_bp = real_bp;
949 } else
950 cbp_head->b_real_bp = (buf_t)NULL;
951
952 if (size == 0) {
953 /*
954 * we're about to issue the last I/O for this upl
955 * if this was a read to the eof and the eof doesn't
956  * finish on a page boundary, then we need to zero-fill
957 * the rest of the page....
958 */
959 cbp_head->b_validend = zero_offset;
960 } else
961 cbp_head->b_validend = 0;
962
963 if (flags & CL_THROTTLE)
964 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
965
966 for (cbp = cbp_head; cbp;) {
967 buf_t cbp_next;
968
969 if ( !(io_flags & B_READ))
970 vnode_startwrite(vp);
971
972 cbp_next = cbp->b_trans_next;
973
974 (void) VNOP_STRATEGY(cbp);
975 cbp = cbp_next;
976 }
977 if ( !(flags & CL_ASYNC)) {
978 int dummy;
979
980 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
981 buf_biowait(cbp);
982
983 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
984 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
985 error = 0; /* drop the error */
986 else {
987 if (retval == 0)
988 retval = error;
989 error = 0;
990 }
991 }
992 }
993 cbp_head = (buf_t)NULL;
994 cbp_tail = (buf_t)NULL;
995
996 trans_count = 0;
997 }
998 }
999 if (error) {
1000 int abort_size;
1001
1002 io_size = 0;
1003
1004 for (cbp = cbp_head; cbp;) {
1005 buf_t cbp_next;
1006
1007 upl_offset -= cbp->b_bcount;
1008 size += cbp->b_bcount;
1009 io_size += cbp->b_bcount;
1010
1011 cbp_next = cbp->b_trans_next;
1012 free_io_buf(cbp);
1013 cbp = cbp_next;
1014 }
1015 if (iostate) {
1016 int need_wakeup = 0;
1017
1018 /*
1019 * update the error condition for this stream
1020 * since we never really issued the io
1021 * just go ahead and adjust it back
1022 */
1023 lck_mtx_lock(cl_mtxp);
1024
1025 if (iostate->io_error == 0)
1026 iostate->io_error = error;
1027 iostate->io_issued -= io_size;
1028
1029 if (iostate->io_wanted) {
1030 /*
1031 * someone is waiting for the state of
1032 * this io stream to change
1033 */
1034 iostate->io_wanted = 0;
1035 need_wakeup = 1;
1036 }
1037 lck_mtx_unlock(cl_mtxp);
1038
1039 if (need_wakeup)
1040 wakeup((caddr_t)&iostate->io_wanted);
1041 }
1042 pg_offset = upl_offset & PAGE_MASK;
1043 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1044
1045 if (flags & CL_COMMIT) {
1046 int upl_abort_code;
1047
1048 if (flags & CL_PRESERVE) {
1049 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1050 UPL_COMMIT_FREE_ON_EMPTY);
1051 } else {
1052 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1053 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1054 else if (flags & CL_PAGEIN)
1055 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1056 else
1057 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1058
1059 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1060 upl_abort_code);
1061 }
1062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1063 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1064 }
1065 if (real_bp) {
1066 real_bp->b_flags |= B_ERROR;
1067 real_bp->b_error = error;
1068
1069 buf_biodone(real_bp);
1070 }
1071 if (retval == 0)
1072 retval = error;
1073 }
1074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1075 (int)f_offset, size, upl_offset, retval, 0);
1076
1077 return (retval);
1078 }
1079
1080
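/*
 * speculatively read up to MAX_UPL_TRANSFER pages starting at f_offset,
 * clipped to the end of the file, by way of advisory_read... returns the
 * number of pages requested so the caller can advance its read-ahead window
 */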
1081 static int
1082 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1083 {
1084 int pages_in_prefetch;
1085
1086 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1087 (int)f_offset, size, (int)filesize, 0, 0);
1088
1089 if (f_offset >= filesize) {
1090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1091 (int)f_offset, 0, 0, 0, 0);
1092 return(0);
1093 }
1094 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1095 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1096 else
1097 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1098
1099 if ((off_t)size > (filesize - f_offset))
1100 size = filesize - f_offset;
1101 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1102
1103 advisory_read(vp, filesize, f_offset, size);
1104
1105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1106 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1107
1108 return (pages_in_prefetch);
1109 }
1110
1111
1112
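/*
 * sequential read-ahead... if the current extent follows the last read
 * (rap->cl_lastr), the read-ahead window (cl_ralen) is doubled up to
 * MAX_UPL_TRANSFER pages and prefetched just beyond the furthest page
 * already read ahead (cl_maxra)... a non-sequential access resets the window
 */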
1113 static void
1114 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1115 {
1116 daddr64_t r_addr;
1117 off_t f_offset;
1118 int size_of_prefetch;
1119
1120
1121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1122 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1123
1124 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1126 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1127 return;
1128 }
1129 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1130 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1131 rap->cl_ralen = 0;
1132 rap->cl_maxra = 0;
1133
1134 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1135 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1136
1137 return;
1138 }
1139 if (extent->e_addr < rap->cl_maxra) {
1140 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1141
1142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1143 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1144 return;
1145 }
1146 }
1147 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1148 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1149
1150 size_of_prefetch = 0;
1151
1152 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1153
1154 if (size_of_prefetch) {
1155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1156 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1157 return;
1158 }
1159 if (f_offset < filesize) {
1160 daddr64_t read_size;
1161
1162 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1163
1164 read_size = (extent->e_addr + 1) - extent->b_addr;
1165
1166 if (read_size > rap->cl_ralen) {
1167 if (read_size > MAX_UPL_TRANSFER)
1168 rap->cl_ralen = MAX_UPL_TRANSFER;
1169 else
1170 rap->cl_ralen = read_size;
1171 }
1172 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1173
1174 if (size_of_prefetch)
1175 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1176 }
1177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1178 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1179 }
1180
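/*
 * VNOP_PAGEOUT engine... validates the request, clips it to the EOF,
 * aborts any pages beyond the rounded transfer when it owns the commit,
 * and pushes the remainder through cluster_io as a (possibly throttled)
 * pageout
 */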
1181 int
1182 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1183 int size, off_t filesize, int flags)
1184 {
1185 int io_size;
1186 int rounded_size;
1187 off_t max_size;
1188 int local_flags;
1189 struct cl_writebehind *wbp;
1190
1191 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1192 /*
1193 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1194 * then we don't want to enforce this throttle... if we do, we can
1195 * potentially deadlock since we're stalling the pageout thread at a time
1196 * when the disk image might need additional memory (which won't be available
1197 * if the pageout thread can't run)... instead we'll just depend on the throttle
1198 * that the pageout thread now has in place to deal with external files
1199 */
1200 local_flags = CL_PAGEOUT;
1201 else
1202 local_flags = CL_PAGEOUT | CL_THROTTLE;
1203
1204 if ((flags & UPL_IOSYNC) == 0)
1205 local_flags |= CL_ASYNC;
1206 if ((flags & UPL_NOCOMMIT) == 0)
1207 local_flags |= CL_COMMIT;
1208 if ((flags & UPL_KEEPCACHED))
1209 local_flags |= CL_KEEPCACHED;
1210
1211
1212 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1213 (int)f_offset, size, (int)filesize, local_flags, 0);
1214
1215 /*
1216 * If they didn't specify any I/O, then we are done...
1217 * we can't issue an abort because we don't know how
1218 * big the upl really is
1219 */
1220 if (size <= 0)
1221 return (EINVAL);
1222
1223 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1224 if (local_flags & CL_COMMIT)
1225 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1226 return (EROFS);
1227 }
1228 /*
1229  * can't page-out from a negative offset
1230 * or if we're starting beyond the EOF
1231 * or if the file offset isn't page aligned
1232 * or the size requested isn't a multiple of PAGE_SIZE
1233 */
1234 if (f_offset < 0 || f_offset >= filesize ||
1235 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1236 if (local_flags & CL_COMMIT)
1237 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1238 return (EINVAL);
1239 }
1240 max_size = filesize - f_offset;
1241
1242 if (size < max_size)
1243 io_size = size;
1244 else
1245 io_size = max_size;
1246
1247 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1248
1249 if (size > rounded_size) {
1250 if (local_flags & CL_COMMIT)
1251 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1252 UPL_ABORT_FREE_ON_EMPTY);
1253 }
1254 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1255 wbp->cl_hasbeenpaged = 1;
1256
1257 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1258 local_flags, (buf_t)NULL, (struct clios *)NULL));
1259 }
1260
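/*
 * VNOP_PAGEIN engine... validates and clips the request, issues it through
 * cluster_io with CL_READ | CL_PAGEIN, and then updates the read-ahead
 * state if read-ahead is enabled for this vnode
 */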
1261 int
1262 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1263 int size, off_t filesize, int flags)
1264 {
1265 u_int io_size;
1266 int rounded_size;
1267 off_t max_size;
1268 int retval;
1269 int local_flags = 0;
1270
1271 if (upl == NULL || size < 0)
1272 panic("cluster_pagein: NULL upl passed in");
1273
1274 if ((flags & UPL_IOSYNC) == 0)
1275 local_flags |= CL_ASYNC;
1276 if ((flags & UPL_NOCOMMIT) == 0)
1277 local_flags |= CL_COMMIT;
1278
1279
1280 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1281 (int)f_offset, size, (int)filesize, local_flags, 0);
1282
1283 /*
1284 * can't page-in from a negative offset
1285 * or if we're starting beyond the EOF
1286 * or if the file offset isn't page aligned
1287 * or the size requested isn't a multiple of PAGE_SIZE
1288 */
1289 if (f_offset < 0 || f_offset >= filesize ||
1290 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1291 if (local_flags & CL_COMMIT)
1292 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1293 return (EINVAL);
1294 }
1295 max_size = filesize - f_offset;
1296
1297 if (size < max_size)
1298 io_size = size;
1299 else
1300 io_size = max_size;
1301
1302 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1303
1304 if (size > rounded_size && (local_flags & CL_COMMIT))
1305 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1306 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1307
1308 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1309 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1310
1311 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1312 struct cl_readahead *rap;
1313
1314 rap = cluster_get_rap(vp);
1315
1316 if (rap != NULL) {
1317 struct cl_extent extent;
1318
1319 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1320 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1321
1322 if (rounded_size == PAGE_SIZE) {
1323 /*
1324  * we haven't read the last page of the file yet
1325 * so let's try to read ahead if we're in
1326 * a sequential access pattern
1327 */
1328 cluster_rd_ahead(vp, &extent, filesize, rap);
1329 }
1330 rap->cl_lastr = extent.e_addr;
1331
1332 lck_mtx_unlock(&rap->cl_lockr);
1333 }
1334 }
1335 return (retval);
1336 }
1337
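/*
 * push a conventional buf through the cluster layer... the buf's logical
 * block is translated to a file offset via ubc_blktooff and the I/O is
 * issued asynchronously, with cluster_iodone completing the original buf
 */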
1338 int
1339 cluster_bp(buf_t bp)
1340 {
1341 off_t f_offset;
1342 int flags;
1343
1344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1345 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1346
1347 if (bp->b_flags & B_READ)
1348 flags = CL_ASYNC | CL_READ;
1349 else
1350 flags = CL_ASYNC;
1351
1352 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1353
1354 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1355 }
1356
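/*
 * top level write entry point... decides, per uio vector, whether to write
 * through the cache (cluster_write_x), go direct with a page aligned
 * transfer (cluster_nocopy_write), or treat the source as a single
 * physically contiguous region (cluster_phys_write)
 */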
1357 int
1358 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1359 {
1360 int prev_resid;
1361 u_int clip_size;
1362 off_t max_io_size;
1363 int upl_size;
1364 int upl_flags;
1365 upl_t upl;
1366 int retval = 0;
1367 int flags;
1368
1369 flags = xflags;
1370
1371 if (vp->v_flag & VNOCACHE_DATA)
1372 flags |= IO_NOCACHE;
1373
1374 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1375 /*
1376 * go do a write through the cache if one of the following is true....
1377 * NOCACHE is not true
1378 * there is no uio structure or it doesn't target USERSPACE
1379 */
1380 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1381 }
1382
1383 #if LP64_DEBUG
1384 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1385 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1386 }
1387 #endif /* LP64_DEBUG */
1388
1389 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1390 user_size_t iov_len;
1391 user_addr_t iov_base;
1392
1393 /*
1394 * we know we have a resid, so this is safe
1395  * skip over any empty vectors
1396 */
1397 uio_update(uio, (user_size_t)0);
1398
1399 iov_len = uio_curriovlen(uio);
1400 iov_base = uio_curriovbase(uio);
1401
1402 upl_size = PAGE_SIZE;
1403 upl_flags = UPL_QUERY_OBJECT_TYPE;
1404
1405 // LP64todo - fix this!
1406 if ((vm_map_get_upl(current_map(),
1407 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1408 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1409 /*
1410 * the user app must have passed in an invalid address
1411 */
1412 return (EFAULT);
1413 }
1414
1415 /*
1416 * We check every vector target but if it is physically
1417 * contiguous space, we skip the sanity checks.
1418 */
1419 if (upl_flags & UPL_PHYS_CONTIG) {
1420 int zflags;
1421
1422 zflags = flags & ~IO_TAILZEROFILL;
1423 zflags |= IO_HEADZEROFILL;
1424
1425 if (flags & IO_HEADZEROFILL) {
1426 /*
1427 * in case we have additional vectors, we don't want to do this again
1428 */
1429 flags &= ~IO_HEADZEROFILL;
1430
1431 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1432 return(retval);
1433 }
1434 retval = cluster_phys_write(vp, uio, newEOF);
1435
1436 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1437 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1438 }
1439 }
1440 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1441 /*
1442  * we're here because we don't have a physically contiguous target buffer
1443 * go do a write through the cache if one of the following is true....
1444 * the total xfer size is less than a page...
1445 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1446 */
1447 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1448 }
1449 // LP64todo - fix this!
1450 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1451 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1452 /*
1453 * Bring the file offset write up to a pagesize boundary
1454 * this will also bring the base address to a page boundary
1455 * since they both are currently on the same offset within a page
1456 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1457 * so the computed clip_size must always be less than the current uio_resid
1458 */
1459 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1460
1461 /*
1462 * Fake the resid going into the cluster_write_x call
1463 * and restore it on the way out.
1464 */
1465 // LP64todo - fix this
1466 prev_resid = uio_resid(uio);
1467 uio_setresid(uio, clip_size);
1468
1469 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1470
1471 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1472 } else {
1473 /*
1474 * can't get both the file offset and the buffer offset aligned to a page boundary
1475 * so fire an I/O through the cache for this entire vector
1476 */
1477 // LP64todo - fix this
1478 clip_size = iov_len;
1479 // LP64todo - fix this
1480 prev_resid = uio_resid(uio);
1481 uio_setresid(uio, clip_size);
1482
1483 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1484
1485 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1486 }
1487 } else {
1488 /*
1489 * If we come in here, we know the offset into
1490 * the file is on a pagesize boundary and the
1491 * target buffer address is also on a page boundary
1492 */
1493 max_io_size = newEOF - uio->uio_offset;
1494 // LP64todo - fix this
1495 clip_size = uio_resid(uio);
1496 if (iov_len < clip_size)
1497 // LP64todo - fix this!
1498 clip_size = iov_len;
1499 if (max_io_size < clip_size)
1500 clip_size = max_io_size;
1501
1502 if (clip_size < PAGE_SIZE) {
1503 /*
1504 * Take care of tail end of write in this vector
1505 */
1506 // LP64todo - fix this
1507 prev_resid = uio_resid(uio);
1508 uio_setresid(uio, clip_size);
1509
1510 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1511
1512 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1513 } else {
1514 /* round clip_size down to a multiple of pagesize */
1515 clip_size = clip_size & ~(PAGE_MASK);
1516 // LP64todo - fix this
1517 prev_resid = uio_resid(uio);
1518 uio_setresid(uio, clip_size);
1519
1520 retval = cluster_nocopy_write(vp, uio, newEOF);
1521
1522 if ((retval == 0) && uio_resid(uio))
1523 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1524
1525 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1526 }
1527 } /* end else */
1528 } /* end while */
1529
1530 return(retval);
1531 }
1532
1533
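/*
 * direct (uncached) write path... wires the user buffer a chunk at a time
 * with vm_map_get_upl, dumps any overlapping pages already in the cache
 * (UPL_ROP_DUMP), and streams the chunks out asynchronously through
 * cluster_io, throttling on the amount of outstanding I/O tracked in the
 * local clios state
 */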
1534 static int
1535 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1536 {
1537 upl_t upl;
1538 upl_page_info_t *pl;
1539 vm_offset_t upl_offset;
1540 int io_size;
1541 int io_flag;
1542 int upl_size;
1543 int upl_needed_size;
1544 int pages_in_pl;
1545 int upl_flags;
1546 kern_return_t kret;
1547 int i;
1548 int force_data_sync;
1549 int error = 0;
1550 struct clios iostate;
1551 struct cl_writebehind *wbp;
1552
1553
1554 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1555 (int)uio->uio_offset, (int)uio_resid(uio),
1556 (int)newEOF, 0, 0);
1557
1558 /*
1559 * When we enter this routine, we know
1560 * -- the offset into the file is on a pagesize boundary
1561 * -- the resid is a page multiple
1562 * -- the resid will not exceed iov_len
1563 */
1564
1565 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1566
1567 cluster_try_push(wbp, vp, newEOF, 0, 1);
1568
1569 lck_mtx_unlock(&wbp->cl_lockw);
1570 }
1571 iostate.io_completed = 0;
1572 iostate.io_issued = 0;
1573 iostate.io_error = 0;
1574 iostate.io_wanted = 0;
1575
1576 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1577 user_addr_t iov_base;
1578
1579 io_size = uio_resid(uio);
1580
1581 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1582 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1583
1584 iov_base = uio_curriovbase(uio);
1585
1586 // LP64todo - fix this!
1587 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1588
1589 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1590
1591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1592 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1593
1594 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1595 pages_in_pl = 0;
1596 upl_size = upl_needed_size;
1597 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1598 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1599
1600 // LP64todo - fix this!
1601 kret = vm_map_get_upl(current_map(),
1602 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1603 &upl_size,
1604 &upl,
1605 NULL,
1606 &pages_in_pl,
1607 &upl_flags,
1608 force_data_sync);
1609
1610 if (kret != KERN_SUCCESS) {
1611 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1612 0, 0, 0, kret, 0);
1613 /*
1614 * cluster_nocopy_write: failed to get pagelist
1615 *
1616 * we may have already spun some portion of this request
1617 * off as async requests... we need to wait for the I/O
1618 * to complete before returning
1619 */
1620 goto wait_for_writes;
1621 }
1622 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1623 pages_in_pl = upl_size / PAGE_SIZE;
1624
1625 for (i = 0; i < pages_in_pl; i++) {
1626 if (!upl_valid_page(pl, i))
1627 break;
1628 }
1629 if (i == pages_in_pl)
1630 break;
1631
1632 /*
1633 * didn't get all the pages back that we
1634 * needed... release this upl and try again
1635 */
1636 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1637 UPL_ABORT_FREE_ON_EMPTY);
1638 }
1639 if (force_data_sync >= 3) {
1640 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1641 i, pages_in_pl, upl_size, kret, 0);
1642 /*
1643 * for some reason, we couldn't acquire a hold on all
1644 * the pages needed in the user's address space
1645 *
1646 * we may have already spun some portion of this request
1647 * off as async requests... we need to wait for the I/O
1648 * to complete before returning
1649 */
1650 goto wait_for_writes;
1651 }
1652
1653 /*
1654 * Consider the possibility that upl_size wasn't satisfied.
1655 */
1656 if (upl_size != upl_needed_size)
1657 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1658
1659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1660 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1661
1662 if (io_size == 0) {
1663 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1664 UPL_ABORT_FREE_ON_EMPTY);
1665 /*
1666 * we may have already spun some portion of this request
1667 * off as async requests... we need to wait for the I/O
1668 * to complete before returning
1669 */
1670 goto wait_for_writes;
1671 }
1672 /*
1673 * Now look for pages already in the cache
1674 * and throw them away.
1675 * uio->uio_offset is page aligned within the file
1676 * io_size is a multiple of PAGE_SIZE
1677 */
1678 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1679
1680 /*
1681  * we want to push out these writes asynchronously so that we can overlap
1682 * the preparation of the next I/O
1683 * if there are already too many outstanding writes
1684 * wait until some complete before issuing the next
1685 */
1686 lck_mtx_lock(cl_mtxp);
1687
1688 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1689 iostate.io_wanted = 1;
1690 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1691 }
1692 lck_mtx_unlock(cl_mtxp);
1693
1694 if (iostate.io_error) {
1695 /*
1696 * one of the earlier writes we issued ran into a hard error
1697 * don't issue any more writes, cleanup the UPL
1698 * that was just created but not used, then
1699 * go wait for all writes that are part of this stream
1700 * to complete before returning the error to the caller
1701 */
1702 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1703 UPL_ABORT_FREE_ON_EMPTY);
1704
1705 goto wait_for_writes;
1706 }
1707 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1708
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1710 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1711
1712 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1713 io_size, io_flag, (buf_t)NULL, &iostate);
1714
1715 uio_update(uio, (user_size_t)io_size);
1716
1717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1718 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1719
1720 } /* end while */
1721
1722 wait_for_writes:
1723 /*
1724 * make sure all async writes issued as part of this stream
1725 * have completed before we return
1726 */
1727 lck_mtx_lock(cl_mtxp);
1728
1729 while (iostate.io_issued != iostate.io_completed) {
1730 iostate.io_wanted = 1;
1731 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1732 }
1733 lck_mtx_unlock(cl_mtxp);
1734
1735 if (iostate.io_error)
1736 error = iostate.io_error;
1737
1738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1739 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1740
1741 return (error);
1742 }
1743
1744
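/*
 * write from a physically contiguous user buffer... any head or tail piece
 * that isn't aligned to the device block size goes through
 * cluster_align_phys_io, the rest is issued as a single synchronous
 * CL_DEV_MEMORY transfer
 */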
1745 static int
1746 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1747 {
1748 upl_page_info_t *pl;
1749 addr64_t src_paddr;
1750 upl_t upl;
1751 vm_offset_t upl_offset;
1752 int tail_size;
1753 int io_size;
1754 int upl_size;
1755 int upl_needed_size;
1756 int pages_in_pl;
1757 int upl_flags;
1758 kern_return_t kret;
1759 int error = 0;
1760 user_addr_t iov_base;
1761 int devblocksize;
1762 struct cl_writebehind *wbp;
1763
1764 devblocksize = vp->v_mount->mnt_devblocksize;
1765 /*
1766 * When we enter this routine, we know
1767 * -- the resid will not exceed iov_len
1768  * -- the vector target address is physically contiguous
1769 */
1770 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1771
1772 cluster_try_push(wbp, vp, newEOF, 0, 1);
1773
1774 lck_mtx_unlock(&wbp->cl_lockw);
1775 }
1776 #if LP64_DEBUG
1777 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1778 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1779 }
1780 #endif /* LP64_DEBUG */
1781
1782 // LP64todo - fix this!
1783 io_size = (int)uio_curriovlen(uio);
1784 iov_base = uio_curriovbase(uio);
1785
1786 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1787 upl_needed_size = upl_offset + io_size;
1788
1789 pages_in_pl = 0;
1790 upl_size = upl_needed_size;
1791 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1792 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1793
1794 // LP64todo - fix this!
1795 kret = vm_map_get_upl(current_map(),
1796 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1797 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1798
1799 if (kret != KERN_SUCCESS) {
1800 /*
1801 * cluster_phys_write: failed to get pagelist
1802 * note: return kret here
1803 */
1804 return(EINVAL);
1805 }
1806 /*
1807 * Consider the possibility that upl_size wasn't satisfied.
1808 * This is a failure in the physical memory case.
1809 */
1810 if (upl_size < upl_needed_size) {
1811 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1812 return(EINVAL);
1813 }
1814 pl = ubc_upl_pageinfo(upl);
1815
1816 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1817
1818 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1819 int head_size;
1820
1821 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1822
1823 if (head_size > io_size)
1824 head_size = io_size;
1825
1826 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1827
1828 if (error) {
1829 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1830
1831 return(EINVAL);
1832 }
1833 upl_offset += head_size;
1834 src_paddr += head_size;
1835 io_size -= head_size;
1836 }
1837 tail_size = io_size & (devblocksize - 1);
1838 io_size -= tail_size;
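/*
 * any residual that's smaller than a device block is split off as
 * tail_size here and handled via cluster_align_phys_io once the
 * block-aligned portion of the write has completed
 */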
1839
1840 if (io_size) {
1841 /*
1842 * issue a synchronous write to cluster_io
1843 */
1844 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1845 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1846 }
1847 if (error == 0) {
1848 /*
1849 * The cluster_io write completed successfully,
1850 * update the uio structure
1851 */
1852 uio_update(uio, (user_size_t)io_size);
1853
1854 src_paddr += io_size;
1855
1856 if (tail_size)
1857 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1858 }
1859 /*
1860 * just release our hold on the physically contiguous
1861 * region without changing any state
1862 */
1863 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1864
1865 return (error);
1866 }
1867
1868
1869 static int
1870 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1871 {
1872 upl_page_info_t *pl;
1873 upl_t upl;
1874 vm_offset_t upl_offset = 0;
1875 int upl_size;
1876 off_t upl_f_offset;
1877 int pages_in_upl;
1878 int start_offset;
1879 int xfer_resid;
1880 int io_size;
1881 int io_offset;
1882 int bytes_to_zero;
1883 int bytes_to_move;
1884 kern_return_t kret;
1885 int retval = 0;
1886 int io_resid;
1887 long long total_size;
1888 long long zero_cnt;
1889 off_t zero_off;
1890 long long zero_cnt1;
1891 off_t zero_off1;
1892 struct cl_extent cl;
1893 int intersection;
1894 struct cl_writebehind *wbp;
1895
1896 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1897 {
1898 if (wbp->cl_hasbeenpaged) {
1899 /*
1900 * this vnode had pages cleaned to it by
1901 * the pager which indicates that either
1902 * it's not very 'hot', or the system is
1903 * being overwhelmed by a lot of dirty
1904 * data being delayed in the VM cache...
1905 * in either event, we'll push our remaining
1906 * delayed data at this point... this will
1907 * be more efficient than paging out 1 page at
1908 * a time, and will also act as a throttle
1909 * by delaying this client from writing any
1910 * more data until all his delayed data has
1911 * at least been queued to the underlying driver.
1912 */
1913 if (wbp->cl_number || wbp->cl_scmap)
1914 cluster_push_EOF(vp, newEOF);
1915
1916 wbp->cl_hasbeenpaged = 0;
1917 }
1918 }
1919 if (uio) {
1920 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1921 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1922
1923 // LP64todo - fix this
1924 io_resid = uio_resid(uio);
1925 } else {
1926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1927 0, 0, (int)oldEOF, (int)newEOF, 0);
1928
1929 io_resid = 0;
1930 }
1931 zero_cnt = 0;
1932 zero_cnt1 = 0;
1933 zero_off = 0;
1934 zero_off1 = 0;
1935
1936 if (flags & IO_HEADZEROFILL) {
1937 /*
1938 * some filesystems (HFS is one) don't support unallocated holes within a file...
1939 * so we zero fill the intervening space between the old EOF and the offset
1940 * where the next chunk of real data begins.... ftruncate will also use this
1941 * routine to zero fill to the new EOF when growing a file... in this case, the
1942 * uio structure will not be provided
1943 */
1944 if (uio) {
1945 if (headOff < uio->uio_offset) {
1946 zero_cnt = uio->uio_offset - headOff;
1947 zero_off = headOff;
1948 }
1949 } else if (headOff < newEOF) {
1950 zero_cnt = newEOF - headOff;
1951 zero_off = headOff;
1952 }
1953 }
1954 if (flags & IO_TAILZEROFILL) {
1955 if (uio) {
1956 // LP64todo - fix this
1957 zero_off1 = uio->uio_offset + uio_resid(uio);
1958
1959 if (zero_off1 < tailOff)
1960 zero_cnt1 = tailOff - zero_off1;
1961 }
1962 }
1963 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1964 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1965 retval, 0, 0, 0, 0);
1966 return (0);
1967 }
1968
1969 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1970 /*
1971 * for this iteration of the loop, figure out where our starting point is
1972 */
1973 if (zero_cnt) {
1974 start_offset = (int)(zero_off & PAGE_MASK_64);
1975 upl_f_offset = zero_off - start_offset;
1976 } else if (io_resid) {
1977 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1978 upl_f_offset = uio->uio_offset - start_offset;
1979 } else {
1980 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1981 upl_f_offset = zero_off1 - start_offset;
1982 }
1983 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1984 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1985
1986 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1987 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1988
1989 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1990
1991 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1992 /*
1993 * assumption... total_size <= io_resid
1994 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1995 */
1996 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1997 total_size -= start_offset;
1998 xfer_resid = total_size;
1999
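/*
 * try to satisfy as much of this write as possible by copying
 * directly into pages already resident in the ubc... on return,
 * xfer_resid holds the portion that could not be handled that way
 */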
2000 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2001
2002 if (retval)
2003 break;
2004
2005 io_resid -= (total_size - xfer_resid);
2006 total_size = xfer_resid;
2007 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2008 upl_f_offset = uio->uio_offset - start_offset;
2009
2010 if (total_size == 0) {
2011 if (start_offset) {
2012 /*
2013 * the write did not finish on a page boundary
2014 * which will leave upl_f_offset pointing to the
2015 * beginning of the last page written instead of
2016 * the page beyond it... bump it in this case
2017 * so that the cluster code records the last page
2018 * written as dirty
2019 */
2020 upl_f_offset += PAGE_SIZE_64;
2021 }
2022 upl_size = 0;
2023
2024 goto check_cluster;
2025 }
2026 }
2027 /*
2028 * compute the size of the upl needed to encompass
2029 * the requested write... limit each call to cluster_io
2030 * to the maximum UPL size... cluster_io will clip if
2031 * this exceeds the maximum io_size for the device,
2032 * make sure to account for
2033 * a starting offset that's not page aligned
2034 */
2035 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2036
2037 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2038 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2039
2040 pages_in_upl = upl_size / PAGE_SIZE;
2041 io_size = upl_size - start_offset;
2042
2043 if ((long long)io_size > total_size)
2044 io_size = total_size;
2045
2046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2047
2048
2049 /*
2050 * Gather the pages from the buffer cache.
2051 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2052 * that we intend to modify these pages.
2053 */
2054 kret = ubc_create_upl(vp,
2055 upl_f_offset,
2056 upl_size,
2057 &upl,
2058 &pl,
2059 UPL_SET_LITE | UPL_WILL_MODIFY);
2060 if (kret != KERN_SUCCESS)
2061 panic("cluster_write: failed to get pagelist");
2062
2063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2064 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2065
2066 if (start_offset && !upl_valid_page(pl, 0)) {
2067 int read_size;
2068
2069 /*
2070 * we're starting in the middle of the first page of the upl
2071 * and the page isn't currently valid, so we're going to have
2072 * to read it in first... this is a synchronous operation
2073 */
2074 read_size = PAGE_SIZE;
2075
2076 if ((upl_f_offset + read_size) > newEOF)
2077 read_size = newEOF - upl_f_offset;
2078
2079 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2080 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2081 if (retval) {
2082 /*
2083 * we had an error during the read which causes us to abort
2084 * the current cluster_write request... before we do, we need
2085 * to release the rest of the pages in the upl without modifying
2086 * their state and mark the failed page in error
2087 */
2088 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2089
2090 if (upl_size > PAGE_SIZE)
2091 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2092
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2094 (int)upl, 0, 0, retval, 0);
2095 break;
2096 }
2097 }
2098 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2099 /*
2100 * the last offset we're writing to in this upl does not end on a page
2101 * boundary... if it's not beyond the old EOF, then we'll also need to
2102 * pre-read this page in if it isn't already valid
2103 */
2104 upl_offset = upl_size - PAGE_SIZE;
2105
2106 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2107 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2108 int read_size;
2109
2110 read_size = PAGE_SIZE;
2111
2112 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2113 read_size = newEOF - (upl_f_offset + upl_offset);
2114
2115 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2116 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2117 if (retval) {
2118 /*
2119 * we had an error during the read which causes us to abort
2120 * the current cluster_write request... before we do, we
2121 * need to release the rest of the pages in the upl without
2122 * modifying their state and mark the failed page in error
2123 */
2124 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2125
2126 if (upl_size > PAGE_SIZE)
2127 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2128
2129 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2130 (int)upl, 0, 0, retval, 0);
2131 break;
2132 }
2133 }
2134 }
2135 xfer_resid = io_size;
2136 io_offset = start_offset;
2137
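/*
 * zero fill the head range requested via IO_HEADZEROFILL... if
 * IO_NOZEROVALID or IO_NOZERODIRTY is set, pages that are already
 * valid (or valid and dirty, respectively) are left untouched
 */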
2138 while (zero_cnt && xfer_resid) {
2139
2140 if (zero_cnt < (long long)xfer_resid)
2141 bytes_to_zero = zero_cnt;
2142 else
2143 bytes_to_zero = xfer_resid;
2144
2145 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2146 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2147 } else {
2148 int zero_pg_index;
2149
2150 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2151 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2152
2153 if ( !upl_valid_page(pl, zero_pg_index)) {
2154 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2155
2156 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2157 !upl_dirty_page(pl, zero_pg_index)) {
2158 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2159 }
2160 }
2161 xfer_resid -= bytes_to_zero;
2162 zero_cnt -= bytes_to_zero;
2163 zero_off += bytes_to_zero;
2164 io_offset += bytes_to_zero;
2165 }
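/*
 * copy this pass's worth of the caller's data from the uio into
 * the upl via cluster_copy_upl_data
 */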
2166 if (xfer_resid && io_resid) {
2167 bytes_to_move = min(io_resid, xfer_resid);
2168
2169 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2170
2171 if (retval) {
2172
2173 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2174
2175 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2176 (int)upl, 0, 0, retval, 0);
2177 } else {
2178 io_resid -= bytes_to_move;
2179 xfer_resid -= bytes_to_move;
2180 io_offset += bytes_to_move;
2181 }
2182 }
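/*
 * zero fill the tail range requested via IO_TAILZEROFILL, applying
 * the same IO_NOZEROVALID / IO_NOZERODIRTY rules as the head range
 */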
2183 while (xfer_resid && zero_cnt1 && retval == 0) {
2184
2185 if (zero_cnt1 < (long long)xfer_resid)
2186 bytes_to_zero = zero_cnt1;
2187 else
2188 bytes_to_zero = xfer_resid;
2189
2190 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2191 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2192 } else {
2193 int zero_pg_index;
2194
2195 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2196 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2197
2198 if ( !upl_valid_page(pl, zero_pg_index)) {
2199 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2200 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2201 !upl_dirty_page(pl, zero_pg_index)) {
2202 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2203 }
2204 }
2205 xfer_resid -= bytes_to_zero;
2206 zero_cnt1 -= bytes_to_zero;
2207 zero_off1 += bytes_to_zero;
2208 io_offset += bytes_to_zero;
2209 }
2210
2211 if (retval == 0) {
2212 int cl_index;
2213 int can_delay;
2214
2215 io_size += start_offset;
2216
2217 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2218 /*
2219 * if we're extending the file with this write
2220 * we'll zero fill the rest of the page so that
2221 * if the file gets extended again in such a way as to leave a
2222 * hole starting at this EOF, we'll have zero's in the correct spot
2223 */
2224 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2225 }
2226 if (flags & IO_SYNC)
2227 /*
2228 * if the IO_SYNC flag is set then we need to
2229 * bypass any clusters and immediately issue
2230 * the I/O
2231 */
2232 goto issue_io;
2233 check_cluster:
2234 /*
2235 * take the lock to protect our accesses
2236 * of the writebehind and sparse cluster state
2237 */
2238 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2239
2240 /*
2241 * calculate the last logical block number
2242 * that this delayed I/O encompassed
2243 */
2244 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2245
2246 if (wbp->cl_scmap) {
2247
2248 if ( !(flags & IO_NOCACHE)) {
2249 /*
2250 * we've fallen into the sparse
2251 * cluster method of delaying dirty pages
2252 * first, we need to release the upl if we hold one
2253 * since pages in it may be present in the sparse cluster map
2254 * and may span 2 separate buckets there... if they do and
2255 * we happen to have to flush a bucket to make room and it intersects
2256 * this upl, a deadlock may result on page BUSY
2257 */
2258 if (upl_size)
2259 ubc_upl_commit_range(upl, 0, upl_size,
2260 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2261
2262 sparse_cluster_add(wbp, vp, &cl, newEOF);
2263
2264 lck_mtx_unlock(&wbp->cl_lockw);
2265
2266 continue;
2267 }
2268 /*
2269 * must have done cached writes that fell into
2270 * the sparse cluster mechanism... we've switched
2271 * to uncached writes on the file, so go ahead
2272 * and push whatever's in the sparse map
2273 * and switch back to normal clustering
2274 *
2275 * see the comment above concerning a possible deadlock...
2276 */
2277 if (upl_size) {
2278 ubc_upl_commit_range(upl, 0, upl_size,
2279 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2280 /*
2281 * setting upl_size to 0 keeps us from committing a
2282 * second time in the start_new_cluster path
2283 */
2284 upl_size = 0;
2285 }
2286 sparse_cluster_push(wbp, vp, newEOF, 1);
2287
2288 wbp->cl_number = 0;
2289 /*
2290 * no clusters of either type present at this point
2291 * so just go directly to start_new_cluster since
2292 * we know we need to delay this I/O since we've
2293 * already released the pages back into the cache
2294 * to avoid the deadlock with sparse_cluster_push
2295 */
2296 goto start_new_cluster;
2297 }
2298 upl_offset = 0;
2299
2300 if (wbp->cl_number == 0)
2301 /*
2302 * no clusters currently present
2303 */
2304 goto start_new_cluster;
2305
2306 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2307 /*
2308 * check each cluster that we currently hold
2309 * try to merge some or all of this write into
2310 * one or more of the existing clusters... if
2311 * any portion of the write remains, start a
2312 * new cluster
2313 */
2314 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2315 /*
2316 * the current write starts at or after the current cluster
2317 */
2318 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2319 /*
2320 * we have a write that fits entirely
2321 * within the existing cluster limits
2322 */
2323 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2324 /*
2325 * update our idea of where the cluster ends
2326 */
2327 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2328 break;
2329 }
2330 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2331 /*
2332 * we have a write that starts in the middle of the current cluster
2333 * but extends beyond the cluster's limit... we know this because
2334 * of the previous checks
2335 * we'll extend the current cluster to the max
2336 * and update the b_addr for the current write to reflect that
2337 * the head of it was absorbed into this cluster...
2338 * note that we'll always have a leftover tail in this case since
2339 * full absorption would have occurred in the clause above
2340 */
2341 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2342
2343 if (upl_size) {
2344 daddr64_t start_pg_in_upl;
2345
2346 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2347
2348 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2349 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2350
2351 ubc_upl_commit_range(upl, upl_offset, intersection,
2352 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2353 upl_f_offset += intersection;
2354 upl_offset += intersection;
2355 upl_size -= intersection;
2356 }
2357 }
2358 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2359 }
2360 /*
2361 * we come here for the case where the current write starts
2362 * beyond the limit of the existing cluster or we have a leftover
2363 * tail after a partial absorption
2364 *
2365 * in either case, we'll check the remaining clusters before
2366 * starting a new one
2367 */
2368 } else {
2369 /*
2370 * the current write starts in front of the cluster we're currently considering
2371 */
2372 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2373 /*
2374 * we can just merge the new request into
2375 * this cluster and leave it in the cache
2376 * since the resulting cluster is still
2377 * less than the maximum allowable size
2378 */
2379 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2380
2381 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2382 /*
2383 * the current write completely
2384 * envelops the existing cluster and since
2385 * each write is limited to at most MAX_UPL_TRANSFER bytes
2386 * we can just use the start and last blocknos of the write
2387 * to generate the cluster limits
2388 */
2389 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2390 }
2391 break;
2392 }
2393
2394 /*
2395 * if we were to combine this write with the current cluster
2396 * we would exceed the cluster size limit.... so,
2397 * let's see if there's any overlap of the new I/O with
2398 * the cluster we're currently considering... in fact, we'll
2399 * stretch the cluster out to its full limit and see if we
2400 * get an intersection with the current write
2401 *
2402 */
2403 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2404 /*
2405 * the current write extends into the proposed cluster
2406 * clip the length of the current write after first combining its
2407 * tail with the newly shaped cluster
2408 */
2409 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2410
2411 if (upl_size) {
2412 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2413
2414 if (intersection > upl_size)
2415 /*
2416 * because the current write may consist of a number of pages found in the cache
2417 * which are not part of the UPL, we may have an intersection that exceeds
2418 * the size of the UPL that is also part of this write
2419 */
2420 intersection = upl_size;
2421
2422 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2423 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2424 upl_size -= intersection;
2425 }
2426 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2427 }
2428 /*
2429 * if we get here, there was no way to merge
2430 * any portion of this write with this cluster
2431 * or we could only merge part of it which
2432 * will leave a tail...
2433 * we'll check the remaining clusters before starting a new one
2434 */
2435 }
2436 }
2437 if (cl_index < wbp->cl_number)
2438 /*
2439 * we found an existing cluster(s) that we
2440 * could entirely merge this I/O into
2441 */
2442 goto delay_io;
2443
2444 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2445 /*
2446 * we didn't find an existing cluster to
2447 * merge into, but there's room to start
2448 * a new one
2449 */
2450 goto start_new_cluster;
2451
2452 /*
2453 * no existing cluster to merge with and no
2454 * room to start a new one... we'll try
2455 * pushing one of the existing ones... if none of
2456 * them are able to be pushed, we'll switch
2457 * to the sparse cluster mechanism
2458 * cluster_try_push updates cl_number to the
2459 * number of remaining clusters... and
2460 * returns the number of currently unused clusters
2461 */
2462 int ret_cluster_try_push = 0;
2463 /* if writes are not deferred, call cluster push immediately */
2464 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2465 if (flags & IO_NOCACHE)
2466 can_delay = 0;
2467 else
2468 can_delay = 1;
2469
2470 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2471 }
2472
2473 /* execute the following regardless of whether writes are deferred */
2474 if (ret_cluster_try_push == 0) {
2475 /*
2476 * no more room in the normal cluster mechanism
2477 * so let's switch to the more expansive but expensive
2478 * sparse mechanism....
2479 * first, we need to release the upl if we hold one
2480 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2481 * and may span 2 separate buckets there... if they do and
2482 * we happen to have to flush a bucket to make room and it intersects
2483 * this upl, a deadlock may result on page BUSY
2484 */
2485 if (upl_size)
2486 ubc_upl_commit_range(upl, upl_offset, upl_size,
2487 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2488
2489 sparse_cluster_switch(wbp, vp, newEOF);
2490 sparse_cluster_add(wbp, vp, &cl, newEOF);
2491
2492 lck_mtx_unlock(&wbp->cl_lockw);
2493
2494 continue;
2495 }
2496 /*
2497 * we pushed one cluster successfully, so we must be sequentially writing this file
2498 * otherwise, we would have failed and fallen into the sparse cluster support
2499 * so let's take the opportunity to push out additional clusters as long as we
2500 * remain below the throttle... this will give us better I/O locality if we're
2501 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
2502 * however, we don't want to push so much out that the write throttle kicks in and
2503 * hangs this thread up until some of the I/O completes...
2504 */
2505 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2506 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2507 cluster_try_push(wbp, vp, newEOF, 0, 0);
2508 }
2509
2510 start_new_cluster:
2511 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2512 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2513
2514 if (flags & IO_NOCACHE)
2515 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2516 else
2517 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2518 wbp->cl_number++;
2519 delay_io:
2520 if (upl_size)
2521 ubc_upl_commit_range(upl, upl_offset, upl_size,
2522 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2523
2524 lck_mtx_unlock(&wbp->cl_lockw);
2525
2526 continue;
2527 issue_io:
2528 /*
2529 * we don't hold the vnode lock at this point
2530 *
2531 * because we had to ask for a UPL that provides currently non-present pages, the
2532 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2533 * upon committing it... this is not the behavior we want since it's possible for
2534 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2535 * in order to maintain some semblance of coherency with mapped writes
2536 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2537 * so that we correctly deal with a change in state of the hardware modify bit...
2538 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2539 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2540 * responsible for generating the correct sized I/O(s)
2541 */
2542 ubc_upl_commit_range(upl, 0, upl_size,
2543 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2544
2545 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2546
2547 retval = cluster_push_x(vp, &cl, newEOF, flags);
2548 }
2549 }
2550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2551 retval, 0, io_resid, 0, 0);
2552
2553 return (retval);
2554 }
2555
2556 int
2557 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2558 {
2559 int prev_resid;
2560 u_int clip_size;
2561 off_t max_io_size;
2562 int upl_size;
2563 int upl_flags;
2564 upl_t upl;
2565 int retval = 0;
2566 int flags;
2567
2568 flags = xflags;
2569
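/*
 * honor the per-vnode settings... VNOCACHE_DATA forces this read to
 * bypass the cache and VRAOFF turns off read-ahead for it
 */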
2570 if (vp->v_flag & VNOCACHE_DATA)
2571 flags |= IO_NOCACHE;
2572 if (vp->v_flag & VRAOFF)
2573 flags |= IO_RAOFF;
2574
2575 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2576 /*
2577 * go do a read through the cache if one of the following is true....
2578 * NOCACHE is not true
2579 * the uio request doesn't target USERSPACE
2580 */
2581 return (cluster_read_x(vp, uio, filesize, flags));
2582 }
2583
2584 #if LP64_DEBUG
2585 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2586 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2587 }
2588 #endif /* LP64_DEBUG */
2589
2590 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2591 user_size_t iov_len;
2592 user_addr_t iov_base;
2593
2594 /*
2595 * we know we have a resid, so this is safe
2596 * skip over any empty vectors
2597 */
2598 uio_update(uio, (user_size_t)0);
2599
2600 iov_len = uio_curriovlen(uio);
2601 iov_base = uio_curriovbase(uio);
2602
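/*
 * query the memory backing the first page of this vector so we can
 * tell below whether the target buffer is physically contiguous and
 * should take the cluster_phys_read path
 */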
2603 upl_size = PAGE_SIZE;
2604 upl_flags = UPL_QUERY_OBJECT_TYPE;
2605
2606 // LP64todo - fix this!
2607 if ((vm_map_get_upl(current_map(),
2608 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2609 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2610 /*
2611 * the user app must have passed in an invalid address
2612 */
2613 return (EFAULT);
2614 }
2615
2616 /*
2617 * We check every vector target but if it is physically
2618 * contiguous space, we skip the sanity checks.
2619 */
2620 if (upl_flags & UPL_PHYS_CONTIG) {
2621 retval = cluster_phys_read(vp, uio, filesize);
2622 }
2623 else if (uio_resid(uio) < PAGE_SIZE) {
2624 /*
2625 * we're here because we don't have a physically contiguous target buffer
2626 * go do a read through the cache if
2627 * the total xfer size is less than a page...
2628 */
2629 return (cluster_read_x(vp, uio, filesize, flags));
2630 }
2631 // LP64todo - fix this!
2632 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2633 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2634 /*
2635 * Bring the file offset read up to a pagesize boundary
2636 * this will also bring the base address to a page boundary
2637 * since they both are currently on the same offset within a page
2638 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2639 * so the computed clip_size must always be less than the current uio_resid
2640 */
2641 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2642
2643 /*
2644 * Fake the resid going into the cluster_read_x call
2645 * and restore it on the way out.
2646 */
2647 prev_resid = uio_resid(uio);
2648 // LP64todo - fix this
2649 uio_setresid(uio, clip_size);
2650
2651 retval = cluster_read_x(vp, uio, filesize, flags);
2652
2653 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2654 } else {
2655 /*
2656 * can't get both the file offset and the buffer offset aligned to a page boundary
2657 * so fire an I/O through the cache for this entire vector
2658 */
2659 // LP64todo - fix this!
2660 clip_size = iov_len;
2661 prev_resid = uio_resid(uio);
2662 uio_setresid(uio, clip_size);
2663
2664 retval = cluster_read_x(vp, uio, filesize, flags);
2665
2666 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2667 }
2668 } else {
2669 /*
2670 * If we come in here, we know the offset into
2671 * the file is on a pagesize boundary
2672 */
2673 max_io_size = filesize - uio->uio_offset;
2674 // LP64todo - fix this
2675 clip_size = uio_resid(uio);
2676 if (iov_len < clip_size)
2677 clip_size = iov_len;
2678 if (max_io_size < clip_size)
2679 clip_size = (int)max_io_size;
2680
2681 if (clip_size < PAGE_SIZE) {
2682 /*
2683 * Take care of the tail end of the read in this vector.
2684 */
2685 // LP64todo - fix this
2686 prev_resid = uio_resid(uio);
2687 uio_setresid(uio, clip_size);
2688
2689 retval = cluster_read_x(vp, uio, filesize, flags);
2690
2691 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2692 } else {
2693 /* round clip_size down to a multiple of pagesize */
2694 clip_size = clip_size & ~(PAGE_MASK);
2695 // LP64todo - fix this
2696 prev_resid = uio_resid(uio);
2697 uio_setresid(uio, clip_size);
2698
2699 retval = cluster_nocopy_read(vp, uio, filesize);
2700
2701 if ((retval==0) && uio_resid(uio))
2702 retval = cluster_read_x(vp, uio, filesize, flags);
2703
2704 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2705 }
2706 } /* end else */
2707 } /* end while */
2708
2709 return(retval);
2710 }
2711
2712 static int
2713 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2714 {
2715 upl_page_info_t *pl;
2716 upl_t upl;
2717 vm_offset_t upl_offset;
2718 int upl_size;
2719 off_t upl_f_offset;
2720 int start_offset;
2721 int start_pg;
2722 int last_pg;
2723 int uio_last = 0;
2724 int pages_in_upl;
2725 off_t max_size;
2726 off_t last_ioread_offset;
2727 off_t last_request_offset;
2728 u_int size_of_prefetch;
2729 u_int io_size;
2730 kern_return_t kret;
2731 int error = 0;
2732 int retval = 0;
2733 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2734 u_int rd_ahead_enabled = 1;
2735 u_int prefetch_enabled = 1;
2736 struct cl_readahead * rap;
2737 struct clios iostate;
2738 struct cl_extent extent;
2739
2740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2741 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2742
2743 // LP64todo - fix this
2744 last_request_offset = uio->uio_offset + uio_resid(uio);
2745
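/*
 * read-ahead is skipped when the caller asked for it to be off (IO_RAOFF),
 * for uncached reads, or when the request appears confined to a single
 * page... if we're being hard throttled, prefetching is disabled as well
 * and the maximum size of each read we issue is clamped
 */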
2746 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2747 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2748 rd_ahead_enabled = 0;
2749 rap = NULL;
2750 } else {
2751 if (cluster_hard_throttle_on(vp)) {
2752 rd_ahead_enabled = 0;
2753 prefetch_enabled = 0;
2754
2755 max_rd_size = HARD_THROTTLE_MAXSIZE;
2756 }
2757 if ((rap = cluster_get_rap(vp)) == NULL)
2758 rd_ahead_enabled = 0;
2759 }
2760 if (last_request_offset > filesize)
2761 last_request_offset = filesize;
2762 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2763 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2764
2765 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2766 /*
2767 * determine if we already have a read-ahead in the pipe courtesy of the
2768 * last read system call that was issued...
2769 * if so, pick up its extent to determine where we should start
2770 * with respect to any read-ahead that might be necessary to
2771 * garner all the data needed to complete this read system call
2772 */
2773 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2774
2775 if (last_ioread_offset < uio->uio_offset)
2776 last_ioread_offset = (off_t)0;
2777 else if (last_ioread_offset > last_request_offset)
2778 last_ioread_offset = last_request_offset;
2779 } else
2780 last_ioread_offset = (off_t)0;
2781
2782 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2783 /*
2784 * compute the size of the upl needed to encompass
2785 * the requested read... limit each call to cluster_io
2786 * to the maximum UPL size... cluster_io will clip if
2787 * this exceeds the maximum io_size for the device...
2788 * make sure to account for
2789 * a starting offset that's not page aligned
2790 */
2791 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2792 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2793 max_size = filesize - uio->uio_offset;
2794
2795 // LP64todo - fix this!
2796 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2797 io_size = uio_resid(uio);
2798 else
2799 io_size = max_size;
2800
2801 if (!(flags & IO_NOCACHE)) {
2802
2803 while (io_size) {
2804 u_int io_resid;
2805 u_int io_requested;
2806
2807 /*
2808 * if we keep finding the pages we need already in the cache, then
2809 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2810 * to determine that we have all the pages we need... once we miss in
2811 * the cache and have issued an I/O, then we'll assume that we're likely
2812 * to continue to miss in the cache and it's to our advantage to try and prefetch
2813 */
2814 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2815 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2816 /*
2817 * we've already issued I/O for this request and
2818 * there's still work to do and
2819 * our prefetch stream is running dry, so issue a
2820 * pre-fetch I/O... the I/O latency will overlap
2821 * with the copying of the data
2822 */
2823 if (size_of_prefetch > max_rd_size)
2824 size_of_prefetch = max_rd_size;
2825
2826 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2827
2828 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2829
2830 if (last_ioread_offset > last_request_offset)
2831 last_ioread_offset = last_request_offset;
2832 }
2833 }
2834 /*
2835 * limit the size of the copy we're about to do so that
2836 * we can notice that our I/O pipe is running dry and
2837 * get the next I/O issued before it does go dry
2838 */
2839 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2840 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2841 else
2842 io_resid = io_size;
2843
2844 io_requested = io_resid;
2845
2846 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2847
2848 io_size -= (io_requested - io_resid);
2849
2850 if (retval || io_resid)
2851 /*
2852 * if we run into a real error or
2853 * a page that is not in the cache
2854 * we need to leave streaming mode
2855 */
2856 break;
2857
2858 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2859 /*
2860 * we've already finished the I/O for this read request
2861 * let's see if we should do a read-ahead
2862 */
2863 cluster_rd_ahead(vp, &extent, filesize, rap);
2864 }
2865 }
2866 if (retval)
2867 break;
2868 if (io_size == 0) {
2869 if (rap != NULL) {
2870 if (extent.e_addr < rap->cl_lastr)
2871 rap->cl_maxra = 0;
2872 rap->cl_lastr = extent.e_addr;
2873 }
2874 break;
2875 }
2876 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2877 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2878 max_size = filesize - uio->uio_offset;
2879 }
2880 if (io_size > max_rd_size)
2881 io_size = max_rd_size;
2882
2883 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2884
2885 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2886 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2887 pages_in_upl = upl_size / PAGE_SIZE;
2888
2889 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2890 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2891
2892 kret = ubc_create_upl(vp,
2893 upl_f_offset,
2894 upl_size,
2895 &upl,
2896 &pl,
2897 UPL_SET_LITE);
2898 if (kret != KERN_SUCCESS)
2899 panic("cluster_read: failed to get pagelist");
2900
2901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2902 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2903
2904 /*
2905 * scan from the beginning of the upl looking for the first
2906 * non-valid page.... this will become the first page in
2907 * the request we're going to make to 'cluster_io'... if all
2908 * of the pages are valid, we won't call through to 'cluster_io'
2909 */
2910 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2911 if (!upl_valid_page(pl, start_pg))
2912 break;
2913 }
2914
2915 /*
2916 * scan from the starting invalid page looking for a valid
2917 * page before the end of the upl is reached, if we
2918 * find one, then it will be the last page of the request to
2919 * 'cluster_io'
2920 */
2921 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2922 if (upl_valid_page(pl, last_pg))
2923 break;
2924 }
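/*
 * set up the clios state used to track any async I/O we issue against
 * this upl so we can rendezvous with it before copying data out to
 * the caller
 */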
2925 iostate.io_completed = 0;
2926 iostate.io_issued = 0;
2927 iostate.io_error = 0;
2928 iostate.io_wanted = 0;
2929
2930 if (start_pg < last_pg) {
2931 /*
2932 * we found a range of 'invalid' pages that must be filled...
2933 * if the last page in this range is the last page of the file
2934 * we may have to clip the size of it to keep from reading past
2935 * the end of the last physical block associated with the file
2936 */
2937 upl_offset = start_pg * PAGE_SIZE;
2938 io_size = (last_pg - start_pg) * PAGE_SIZE;
2939
2940 if ((upl_f_offset + upl_offset + io_size) > filesize)
2941 io_size = filesize - (upl_f_offset + upl_offset);
2942
2943 /*
2944 * issue an asynchronous read to cluster_io
2945 */
2946
2947 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2948 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2949 }
2950 if (error == 0) {
2951 /*
2952 * if the read completed successfully, or there was no I/O request
2953 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2954 * we'll first add on any 'valid'
2955 * pages that were present in the upl when we acquired it.
2956 */
2957 u_int val_size;
2958
2959 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2960 if (!upl_valid_page(pl, uio_last))
2961 break;
2962 }
2963 /*
2964 * compute size to transfer this round, if uio->uio_resid is
2965 * still non-zero after this attempt, we'll loop around and
2966 * set up for another I/O.
2967 */
2968 val_size = (uio_last * PAGE_SIZE) - start_offset;
2969
2970 if (val_size > max_size)
2971 val_size = max_size;
2972
2973 if (val_size > uio_resid(uio))
2974 // LP64todo - fix this
2975 val_size = uio_resid(uio);
2976
2977 if (last_ioread_offset == 0)
2978 last_ioread_offset = uio->uio_offset + val_size;
2979
2980 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2981 /*
2982 * if there's still I/O left to do for this request, and...
2983 * we're not in hard throttle mode, then issue a
2984 * pre-fetch I/O... the I/O latency will overlap
2985 * with the copying of the data
2986 */
2987 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2988
2989 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2990
2991 if (last_ioread_offset > last_request_offset)
2992 last_ioread_offset = last_request_offset;
2993
2994 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2995 /*
2996 * this transfer will finish this request, so...
2997 * let's try to read ahead if we're in
2998 * a sequential access pattern and we haven't
2999 * explicitly disabled it
3000 */
3001 if (rd_ahead_enabled)
3002 cluster_rd_ahead(vp, &extent, filesize, rap);
3003
3004 if (rap != NULL) {
3005 if (extent.e_addr < rap->cl_lastr)
3006 rap->cl_maxra = 0;
3007 rap->cl_lastr = extent.e_addr;
3008 }
3009 }
3010 lck_mtx_lock(cl_mtxp);
3011
3012 while (iostate.io_issued != iostate.io_completed) {
3013 iostate.io_wanted = 1;
3014 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3015 }
3016 lck_mtx_unlock(cl_mtxp);
3017
3018 if (iostate.io_error)
3019 error = iostate.io_error;
3020 else
3021 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3022 }
3023 if (start_pg < last_pg) {
3024 /*
3025 * compute the range of pages that we actually issued an I/O for
3026 * and either commit them as valid if the I/O succeeded
3027 * or abort them if the I/O failed
3028 */
3029 io_size = (last_pg - start_pg) * PAGE_SIZE;
3030
3031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3032 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3033
3034 if (error || (flags & IO_NOCACHE))
3035 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3036 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3037 else
3038 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3039 UPL_COMMIT_CLEAR_DIRTY |
3040 UPL_COMMIT_FREE_ON_EMPTY |
3041 UPL_COMMIT_INACTIVATE);
3042
3043 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3044 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3045 }
3046 if ((last_pg - start_pg) < pages_in_upl) {
3047 int cur_pg;
3048 int commit_flags;
3049
3050 /*
3051 * the set of pages that we issued an I/O for did not encompass
3052 * the entire upl... so just release these without modifying
3053 * their state
3054 */
3055 if (error)
3056 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3057 else {
3058 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3059 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3060
3061 if (start_pg) {
3062 /*
3063 * we found some already valid pages at the beginning of
3064 * the upl... commit these back to the inactive list with
3065 * reference cleared
3066 */
3067 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3068 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3069 | UPL_COMMIT_INACTIVATE;
3070
3071 if (upl_dirty_page(pl, cur_pg))
3072 commit_flags |= UPL_COMMIT_SET_DIRTY;
3073
3074 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3075 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3076 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3077 else
3078 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3079 PAGE_SIZE, commit_flags);
3080 }
3081 }
3082 if (last_pg < uio_last) {
3083 /*
3084 * we found some already valid pages immediately after the
3085 * pages we issued I/O for... commit these back to the
3086 * inactive list with reference cleared
3087 */
3088 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3089 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3090 | UPL_COMMIT_INACTIVATE;
3091
3092 if (upl_dirty_page(pl, cur_pg))
3093 commit_flags |= UPL_COMMIT_SET_DIRTY;
3094
3095 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3096 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3097 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3098 else
3099 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3100 PAGE_SIZE, commit_flags);
3101 }
3102 }
3103 if (uio_last < pages_in_upl) {
3104 /*
3105 * there were some invalid pages beyond the valid pages
3106 * that we didn't issue an I/O for, just release them
3107 * unchanged
3108 */
3109 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3110 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3111 }
3112
3113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3114 (int)upl, -1, -1, 0, 0);
3115 }
3116 }
3117 if (retval == 0)
3118 retval = error;
3119
3120 if ( uio_resid(uio) ) {
3121 if (cluster_hard_throttle_on(vp)) {
3122 rd_ahead_enabled = 0;
3123 prefetch_enabled = 0;
3124
3125 max_rd_size = HARD_THROTTLE_MAXSIZE;
3126 } else {
3127 if (rap != NULL)
3128 rd_ahead_enabled = 1;
3129 prefetch_enabled = 1;
3130
3131 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3132 }
3133 }
3134 }
3135 if (rap != NULL) {
3136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3137 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3138
3139 lck_mtx_unlock(&rap->cl_lockr);
3140 } else {
3141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3142 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3143 }
3144
3145 return (retval);
3146 }
3147
3148
3149 static int
3150 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3151 {
3152 upl_t upl;
3153 upl_page_info_t *pl;
3154 vm_offset_t upl_offset;
3155 off_t max_io_size;
3156 int io_size;
3157 int upl_size;
3158 int upl_needed_size;
3159 int pages_in_pl;
3160 int upl_flags;
3161 kern_return_t kret;
3162 int i;
3163 int force_data_sync;
3164 int retval = 0;
3165 int no_zero_fill = 0;
3166 int abort_flag = 0;
3167 struct clios iostate;
3168 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3169 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3170
3171
3172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3173 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3174
3175 /*
3176 * When we enter this routine, we know
3177 * -- the offset into the file is on a pagesize boundary
3178 * -- the resid is a page multiple
3179 * -- the resid will not exceed iov_len
3180 */
3181
3182 iostate.io_completed = 0;
3183 iostate.io_issued = 0;
3184 iostate.io_error = 0;
3185 iostate.io_wanted = 0;
3186
3187 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3188 user_addr_t iov_base;
3189
3190 if (cluster_hard_throttle_on(vp)) {
3191 max_rd_size = HARD_THROTTLE_MAXSIZE;
3192 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3193 } else {
3194 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3195 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
3196 }
3197 max_io_size = filesize - uio->uio_offset;
3198
3199 // LP64todo - fix this
3200 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3201 io_size = max_io_size;
3202 else
3203 io_size = uio_resid(uio);
3204
3205 /*
3206 * First look for pages already in the cache
3207 * and move them to user space.
3208 */
3209 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3210
3211 if (retval) {
3212 /*
3213 * we may have already spun some portion of this request
3214 * off as async requests... we need to wait for the I/O
3215 * to complete before returning
3216 */
3217 goto wait_for_reads;
3218 }
3219 /*
3220 * If we are already finished with this read, then return
3221 */
3222 if (io_size == 0) {
3223 /*
3224 * we may have already spun some portion of this request
3225 * off as async requests... we need to wait for the I/O
3226 * to complete before returning
3227 */
3228 goto wait_for_reads;
3229 }
3230 max_io_size = io_size;
3231
3232 if (max_io_size > max_rd_size)
3233 max_io_size = max_rd_size;
3234
3235 io_size = 0;
3236
3237 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3238
3239 if (io_size == 0)
3240 /*
3241 * we may have already spun some portion of this request
3242 * off as async requests... we need to wait for the I/O
3243 * to complete before returning
3244 */
3245 goto wait_for_reads;
3246
3247 iov_base = uio_curriovbase(uio);
3248
3249 // LP64todo - fix this!
3250 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3251 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3252
3253 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3254 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3255
3256 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3257 no_zero_fill = 1;
3258 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3259 } else {
3260 no_zero_fill = 0;
3261 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3262 }
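/*
 * create a UPL over the user's buffer so the read can target it
 * directly... we accept the UPL only when every page in it comes back
 * valid, otherwise we abort it and try again (up to 3 attempts in all),
 * adding UPL_FORCE_DATA_SYNC after the first attempt
 */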
3263 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3264 pages_in_pl = 0;
3265 upl_size = upl_needed_size;
3266 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3267
3268 if (no_zero_fill)
3269 upl_flags |= UPL_NOZEROFILL;
3270 if (force_data_sync)
3271 upl_flags |= UPL_FORCE_DATA_SYNC;
3272
3273 // LP64todo - fix this!
3274 kret = vm_map_create_upl(current_map(),
3275 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3276 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3277
3278 if (kret != KERN_SUCCESS) {
3279 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3280 (int)upl_offset, upl_size, io_size, kret, 0);
3281 /*
3282 * cluster_nocopy_read: failed to get pagelist
3283 *
3284 * we may have already spun some portion of this request
3285 * off as async requests... we need to wait for the I/O
3286 * to complete before returning
3287 */
3288 goto wait_for_reads;
3289 }
3290 pages_in_pl = upl_size / PAGE_SIZE;
3291 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3292
3293 for (i = 0; i < pages_in_pl; i++) {
3294 if (!upl_valid_page(pl, i))
3295 break;
3296 }
3297 if (i == pages_in_pl)
3298 break;
3299
3300 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3301 }
3302 if (force_data_sync >= 3) {
3303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3304 (int)upl_offset, upl_size, io_size, kret, 0);
3305
3306 goto wait_for_reads;
3307 }
3308 /*
3309 * Consider the possibility that upl_size wasn't satisfied.
3310 */
3311 if (upl_size != upl_needed_size)
3312 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3313
3314 if (io_size == 0) {
3315 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3316 goto wait_for_reads;
3317 }
3318 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3319 (int)upl_offset, upl_size, io_size, kret, 0);
3320
3321 /*
3322 * request asynchronously so that we can overlap
3323 * the preparation of the next I/O
3324 * if there are already too many outstanding reads
3325 * wait until some have completed before issuing the next read
3326 */
3327 lck_mtx_lock(cl_mtxp);
3328
3329 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3330 iostate.io_wanted = 1;
3331 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3332 }
3333 lck_mtx_unlock(cl_mtxp);
3334
3335 if (iostate.io_error) {
3336 /*
3337 * one of the earlier reads we issued ran into a hard error
3338 * don't issue any more reads... clean up the UPL
3339 * that was just created but not used, then
3340 * go wait for any other reads to complete before
3341 * returning the error to the caller
3342 */
3343 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3344
3345 goto wait_for_reads;
3346 }
3347 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3348 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3349
3350 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3351 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3352 (buf_t)NULL, &iostate);
3353
3354 /*
3355 * update the uio structure
3356 */
3357 uio_update(uio, (user_size_t)io_size);
3358
3359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3360 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3361
3362 } /* end while */
3363
3364 wait_for_reads:
3365 /*
3366 * make sure all async reads that are part of this stream
3367 * have completed before we return
3368 */
3369 lck_mtx_lock(cl_mtxp);
3370
3371 while (iostate.io_issued != iostate.io_completed) {
3372 iostate.io_wanted = 1;
3373 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3374 }
3375 lck_mtx_unlock(cl_mtxp);
3376
3377 if (iostate.io_error)
3378 retval = iostate.io_error;
3379
3380 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3381 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3382
3383 return (retval);
3384 }
3385
3386
3387 static int
3388 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3389 {
3390 upl_page_info_t *pl;
3391 upl_t upl;
3392 vm_offset_t upl_offset;
3393 addr64_t dst_paddr;
3394 off_t max_size;
3395 int io_size;
3396 user_size_t iov_len;
3397 user_addr_t iov_base;
3398 int tail_size;
3399 int upl_size;
3400 int upl_needed_size;
3401 int pages_in_pl;
3402 int upl_flags;
3403 kern_return_t kret;
3404 struct clios iostate;
3405 int error;
3406 int devblocksize;
3407
3408 devblocksize = vp->v_mount->mnt_devblocksize;
3409 /*
3410 * When we enter this routine, we know
3411 * -- the resid will not exceed iov_len
3412 * -- the target address is physically contiguous
3413 */
3414
3415 #if LP64_DEBUG
3416 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3417 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3418 }
3419 #endif /* LP64_DEBUG */
3420
3421 iov_len = uio_curriovlen(uio);
3422 iov_base = uio_curriovbase(uio);
3423
3424 max_size = filesize - uio->uio_offset;
3425
3426 // LP64todo - fix this!
3427 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3428 io_size = iov_len;
3429 else
3430 io_size = max_size;
3431
3432 // LP64todo - fix this!
3433 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3434 upl_needed_size = upl_offset + io_size;
3435
3436 error = 0;
3437 pages_in_pl = 0;
3438 upl_size = upl_needed_size;
3439 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3440
3441 kret = vm_map_get_upl(current_map(),
3442 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3443 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3444
3445 if (kret != KERN_SUCCESS) {
3446 /*
3447 * cluster_phys_read: failed to get pagelist
3448 */
3449 return(EINVAL);
3450 }
3451 if (upl_size < upl_needed_size) {
3452 /*
3453 * The upl_size wasn't satisfied.
3454 */
3455 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3456
3457 return(EINVAL);
3458 }
3459 pl = ubc_upl_pageinfo(upl);
3460
3461 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3462
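/*
 * as in the write path, any leading fragment that isn't aligned to
 * the device block size is handled with cluster_align_phys_io before
 * the block-aligned portion of the read is issued
 */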
3463 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3464 int head_size;
3465
3466 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3467
3468 if (head_size > io_size)
3469 head_size = io_size;
3470
3471 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3472
3473 if (error) {
3474 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3475
3476 return(EINVAL);
3477 }
3478 upl_offset += head_size;
3479 dst_paddr += head_size;
3480 io_size -= head_size;
3481 }
3482 tail_size = io_size & (devblocksize - 1);
3483 io_size -= tail_size;
3484
3485 iostate.io_completed = 0;
3486 iostate.io_issued = 0;
3487 iostate.io_error = 0;
3488 iostate.io_wanted = 0;
3489
3490 while (io_size && error == 0) {
3491 int xsize;
3492
3493 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3494 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3495 else
3496 xsize = io_size;
3497 /*
3498 * request asynchronously so that we can overlap
3499 * the preparation of the next I/O... we'll do
3500 * the commit after all the I/O has completed
3501 * since it's all issued against the same UPL
3502 * if there are already too many outstanding reads
3503 * wait until some have completed before issuing the next
3504 */
3505 lck_mtx_lock(cl_mtxp);
3506
3507 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3508 iostate.io_wanted = 1;
3509 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3510 }
3511 lck_mtx_unlock(cl_mtxp);
3512
3513 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3514 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3515 (buf_t)NULL, &iostate);
3516 /*
3517 * The cluster_io read was issued successfully,
3518 * update the uio structure
3519 */
3520 if (error == 0) {
3521 uio_update(uio, (user_size_t)xsize);
3522
3523 dst_paddr += xsize;
3524 upl_offset += xsize;
3525 io_size -= xsize;
3526 }
3527 }
3528 /*
3529 * make sure all async reads that are part of this stream
3530 * have completed before we proceed
3531 */
3532 lck_mtx_lock(cl_mtxp);
3533
3534 while (iostate.io_issued != iostate.io_completed) {
3535 iostate.io_wanted = 1;
3536 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3537 }
3538 lck_mtx_unlock(cl_mtxp);
3539
3540 if (iostate.io_error)
3541 error = iostate.io_error;
3542
3543 if (error == 0 && tail_size)
3544 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3545
3546 /*
3547 * just release our hold on the physically contiguous
3548 * region without changing any state
3549 */
3550 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3551
3552 return (error);
3553 }
3554
3555
3556 /*
3557 * generate advisory I/O's in the largest chunks possible
3558 * the completed pages will be released into the VM cache
3559 */
3560 int
3561 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3562 {
3563 upl_page_info_t *pl;
3564 upl_t upl;
3565 vm_offset_t upl_offset;
3566 int upl_size;
3567 off_t upl_f_offset;
3568 int start_offset;
3569 int start_pg;
3570 int last_pg;
3571 int pages_in_upl;
3572 off_t max_size;
3573 int io_size;
3574 kern_return_t kret;
3575 int retval = 0;
3576 int issued_io;
3577 int skip_range;
3578
3579 if ( !UBCINFOEXISTS(vp))
3580 return(EINVAL);
3581
3582 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3583 (int)f_offset, resid, (int)filesize, 0, 0);
3584
3585 while (resid && f_offset < filesize && retval == 0) {
3586 /*
3587 * compute the size of the upl needed to encompass
3588 * the requested read... limit each call to cluster_io
3589 * to the maximum UPL size... cluster_io will clip if
3590 * this exceeds the maximum io_size for the device...
3591 * make sure to account for
3592 * a starting offset that's not page aligned
3593 */
3594 start_offset = (int)(f_offset & PAGE_MASK_64);
3595 upl_f_offset = f_offset - (off_t)start_offset;
3596 max_size = filesize - f_offset;
3597
3598 if (resid < max_size)
3599 io_size = resid;
3600 else
3601 io_size = max_size;
3602
3603 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3604 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3605 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3606
3607 skip_range = 0;
3608 /*
3609 * return the number of contiguously present pages in the cache
3610 * starting at upl_f_offset within the file
3611 */
3612 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3613
3614 if (skip_range) {
3615 /*
3616 * skip over pages already present in the cache
3617 */
3618 io_size = skip_range - start_offset;
3619
3620 f_offset += io_size;
3621 resid -= io_size;
3622
3623 if (skip_range == upl_size)
3624 continue;
3625 /*
3626 * have to issue some real I/O...
3627 * at this point, we know it's starting on a page boundary
3628 * because we've skipped over at least the first page in the request
3629 */
3630 start_offset = 0;
3631 upl_f_offset += skip_range;
3632 upl_size -= skip_range;
3633 }
3634 pages_in_upl = upl_size / PAGE_SIZE;
3635
3636 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3637 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3638
3639 kret = ubc_create_upl(vp,
3640 upl_f_offset,
3641 upl_size,
3642 &upl,
3643 &pl,
3644 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3645 if (kret != KERN_SUCCESS)
3646 return(retval);
3647 issued_io = 0;
3648
3649 /*
3650 * before we start marching forward, we must make sure we end on
3651 * a present page, otherwise we will be working with a freed
3652 * upl
3653 */
3654 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3655 if (upl_page_present(pl, last_pg))
3656 break;
3657 }
3658 pages_in_upl = last_pg + 1;
3659
3660
3661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3662 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3663
3664
3665 for (last_pg = 0; last_pg < pages_in_upl; ) {
3666 /*
3667 * scan from the beginning of the upl looking for the first
3668 * page that is present.... this will become the first page in
3669 * the request we're going to make to 'cluster_io'... if all
3670 * of the pages are absent, we won't call through to 'cluster_io'
3671 */
3672 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3673 if (upl_page_present(pl, start_pg))
3674 break;
3675 }
3676
3677 /*
3678 * scan from the starting present page looking for an absent
3679 * page before the end of the upl is reached, if we
3680 * find one, then it will terminate the range of pages being
3681 * presented to 'cluster_io'
3682 */
3683 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3684 if (!upl_page_present(pl, last_pg))
3685 break;
3686 }
3687
3688 if (last_pg > start_pg) {
3689 /*
3690 * we found a range of pages that must be filled...
3691 * if the last page in this range is the last page of the file,
3692 * we may have to clip the size of it to keep from reading past
3693 * the end of the last physical block associated with the file
3694 */
3695 upl_offset = start_pg * PAGE_SIZE;
3696 io_size = (last_pg - start_pg) * PAGE_SIZE;
3697
3698 if ((upl_f_offset + upl_offset + io_size) > filesize)
3699 io_size = filesize - (upl_f_offset + upl_offset);
3700
3701 /*
3702 * issue an asynchronous read to cluster_io
3703 */
3704 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3705 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3706
3707 issued_io = 1;
3708 }
3709 }
3710 if (issued_io == 0)
3711 ubc_upl_abort(upl, 0);
3712
3713 io_size = upl_size - start_offset;
3714
3715 if (io_size > resid)
3716 io_size = resid;
3717 f_offset += io_size;
3718 resid -= io_size;
3719 }
3720
3721 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3722 (int)f_offset, resid, retval, 0, 0);
3723
3724 return(retval);
3725 }
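/*
 * Illustrative sketch (not part of the original source): one way a
 * filesystem might drive advisory_read() above to pre-fault a region of
 * a file into the cache.  The vnode, file size and request window are
 * hypothetical; only the advisory_read() signature is taken from this file.
 */
#if 0
static void
example_readahead_hint(vnode_t vp, off_t filesize, off_t start, int len)
{
	/*
	 * issue advisory (asynchronous, cached) reads for [start, start + len);
	 * advisory_read clips the request at filesize and skips pages that
	 * are already resident in the cache
	 */
	(void) advisory_read(vp, filesize, start, len);
}
#endif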
3726
3727
3728 int
3729 cluster_push(vnode_t vp, int flags)
3730 {
3731 int retval;
3732 struct cl_writebehind *wbp;
3733
3734 if ( !UBCINFOEXISTS(vp)) {
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3736 return (0);
3737 }
3738 /* return if deferred write is set */
3739 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3740 return (0);
3741 }
3742 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3744 return (0);
3745 }
3746 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3747 lck_mtx_unlock(&wbp->cl_lockw);
3748
3749 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3750 return(0);
3751 }
3752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3753 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3754
3755 if (wbp->cl_scmap) {
3756 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3757
3758 retval = 1;
3759 } else
3760 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3761
3762 lck_mtx_unlock(&wbp->cl_lockw);
3763
3764 if (flags & IO_SYNC)
3765 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3766
3767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3768 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3769
3770 return (retval);
3771 }
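/*
 * Illustrative sketch (not part of the original source): how a filesystem's
 * fsync-style path might drive cluster_push() above.  The vnode and the
 * decision to pass IO_SYNC are hypothetical; only the cluster_push()
 * signature and flag usage are taken from this file.
 */
#if 0
static int
example_flush_write_behind(vnode_t vp, int waitfor)
{
	/*
	 * push any delayed-write clusters held for this vnode; with IO_SYNC
	 * cluster_push also waits for the resulting writes to drain
	 */
	(void) cluster_push(vp, (waitfor == MNT_WAIT) ? IO_SYNC : 0);

	return (0);
}
#endif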
3772
3773
3774 __private_extern__ void
3775 cluster_release(struct ubc_info *ubc)
3776 {
3777 struct cl_writebehind *wbp;
3778 struct cl_readahead *rap;
3779
3780 if ((wbp = ubc->cl_wbehind)) {
3781
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3783
3784 if (wbp->cl_scmap)
3785 vfs_drt_control(&(wbp->cl_scmap), 0);
3786 } else {
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3788 }
3789
3790 rap = ubc->cl_rahead;
3791
3792 if (wbp != NULL) {
3793 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3794 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3795 }
3796 if ((rap = ubc->cl_rahead)) {
3797 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3798 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3799 }
3800 ubc->cl_rahead = NULL;
3801 ubc->cl_wbehind = NULL;
3802
3803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3804 }
3805
3806
3807 static void
3808 cluster_push_EOF(vnode_t vp, off_t EOF)
3809 {
3810 struct cl_writebehind *wbp;
3811
3812 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3813
3814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3815 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3816
3817 if (wbp->cl_scmap)
3818 sparse_cluster_push(wbp, vp, EOF, 1);
3819 else
3820 cluster_try_push(wbp, vp, EOF, 0, 1);
3821
3822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3823 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3824
3825 lck_mtx_unlock(&wbp->cl_lockw);
3826 }
3827
3828
3829 static int
3830 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3831 {
3832 int cl_index;
3833 int cl_index1;
3834 int min_index;
3835 int cl_len;
3836 int cl_pushed = 0;
3837 struct cl_wextent l_clusters[MAX_CLUSTERS];
3838
3839 /*
3840 * the write behind context exists and has
3841 * already been locked...
3842 *
3843 * make a local 'sorted' copy of the clusters
3844 * and clear wbp->cl_number so that new clusters can
3845 * be developed
3846 */
3847 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3848 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3849 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3850 continue;
3851 if (min_index == -1)
3852 min_index = cl_index1;
3853 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3854 min_index = cl_index1;
3855 }
3856 if (min_index == -1)
3857 break;
3858 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3859 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3860 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3861
3862 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3863 }
3864 wbp->cl_number = 0;
3865
3866 cl_len = cl_index;
3867
3868 if (can_delay && cl_len == MAX_CLUSTERS) {
3869 int i;
3870
3871 /*
3872 * determine if we appear to be writing the file sequentially...
3873 * if not, by returning without having pushed any clusters
3874 * we will cause this vnode to be pushed into the sparse cluster mechanism
3875 * used for managing more random I/O patterns
3876 *
3877 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3878 * that's why we're in try_push with can_delay true...
3879 *
3880 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3881 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
3882 * so we can just make a simple pass through, up to, but not including the last one...
3883 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3884 * are sequential
3885 *
3886 * we let the last one be partial as long as it was adjacent to the previous one...
3887 * we need to do this to deal with multi-threaded servers that might write an I/O or two out
3888 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3889 */
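/*
 * worked example (illustrative page numbers only): if MAX_UPL_TRANSFER
 * were 256 pages, the sorted clusters [0,256) [256,512) [512,768) ...
 * all pass the test below, since each of the first MAX_CLUSTERS - 1
 * spans exactly MAX_UPL_TRANSFER pages and each e_addr equals the next
 * cluster's b_addr... a gap such as [0,256) [300,420) fails the
 * adjacency test and we jump to 'dont_try' without pushing anything
 */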
3890 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3891 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3892 goto dont_try;
3893 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3894 goto dont_try;
3895 }
3896 }
3897 /*
3898 * drop the lock while we're firing off the I/Os...
3899 * this is safe since I'm working off of a private sorted copy
3900 * of the clusters, and I'm going to re-evaluate the public
3901 * state after I retake the lock
3902 */
3903 lck_mtx_unlock(&wbp->cl_lockw);
3904
3905 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3906 int flags;
3907 struct cl_extent cl;
3908
3909 /*
3910 * try to push each cluster in turn...
3911 */
3912 if (l_clusters[cl_index].io_nocache)
3913 flags = IO_NOCACHE;
3914 else
3915 flags = 0;
3916 cl.b_addr = l_clusters[cl_index].b_addr;
3917 cl.e_addr = l_clusters[cl_index].e_addr;
3918
3919 cluster_push_x(vp, &cl, EOF, flags);
3920
3921 l_clusters[cl_index].b_addr = 0;
3922 l_clusters[cl_index].e_addr = 0;
3923
3924 cl_pushed++;
3925
3926 if (push_all == 0)
3927 break;
3928 }
3929 lck_mtx_lock(&wbp->cl_lockw);
3930
3931 dont_try:
3932 if (cl_len > cl_pushed) {
3933 /*
3934 * we didn't push all of the clusters, so
3935 * lets try to merge them back in to the vnode
3936 */
3937 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3938 /*
3939 * we picked up some new clusters while we were trying to
3940 * push the old ones... this can happen because I've dropped
3941 * the write behind lock... the sum of the
3942 * leftovers plus the new cluster count exceeds our ability
3943 * to represent them, so switch to the sparse cluster mechanism
3944 *
3945 * collect the active public clusters...
3946 */
3947 sparse_cluster_switch(wbp, vp, EOF);
3948
3949 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3950 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3951 continue;
3952 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3953 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3954 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3955
3956 cl_index1++;
3957 }
3958 /*
3959 * update the cluster count
3960 */
3961 wbp->cl_number = cl_index1;
3962
3963 /*
3964 * and collect the original clusters that were moved into the
3965 * local storage for sorting purposes
3966 */
3967 sparse_cluster_switch(wbp, vp, EOF);
3968
3969 } else {
3970 /*
3971 * we've got room to merge the leftovers back in
3972 * just append them starting at the next 'hole'
3973 * represented by wbp->cl_number
3974 */
3975 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3976 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3977 continue;
3978
3979 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3980 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3981 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3982
3983 cl_index1++;
3984 }
3985 /*
3986 * update the cluster count
3987 */
3988 wbp->cl_number = cl_index1;
3989 }
3990 }
3991 return(MAX_CLUSTERS - wbp->cl_number);
3992 }
3993
3994
3995
3996 static int
3997 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
3998 {
3999 upl_page_info_t *pl;
4000 upl_t upl;
4001 vm_offset_t upl_offset;
4002 int upl_size;
4003 off_t upl_f_offset;
4004 int pages_in_upl;
4005 int start_pg;
4006 int last_pg;
4007 int io_size;
4008 int io_flags;
4009 int upl_flags;
4010 int size;
4011 int error = 0;
4012 int retval;
4013 kern_return_t kret;
4014
4015
4016 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4017 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4018
4019 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4021
4022 return (0);
4023 }
4024 upl_size = pages_in_upl * PAGE_SIZE;
4025 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4026
4027 if (upl_f_offset + upl_size >= EOF) {
4028
4029 if (upl_f_offset >= EOF) {
4030 /*
4031 * must have truncated the file and missed
4032 * clearing a dangling cluster (i.e. it's completely
4033 * beyond the new EOF)
4034 */
4035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4036
4037 return(0);
4038 }
4039 size = EOF - upl_f_offset;
4040
4041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4042 pages_in_upl = upl_size / PAGE_SIZE;
4043 } else
4044 size = upl_size;
4045
4046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4047
4048 /*
4049 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4050 *
4051 * - only pages that are currently dirty are returned... these are the ones we need to clean
4052 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4053 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4054 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4055 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4056 *
4057 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4058 */
4059
4060 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4061 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4062 else
4063 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4064
4065 kret = ubc_create_upl(vp,
4066 upl_f_offset,
4067 upl_size,
4068 &upl,
4069 &pl,
4070 upl_flags);
4071 if (kret != KERN_SUCCESS)
4072 panic("cluster_push: failed to get pagelist");
4073
4074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4075
4076 /*
4077 * since we only asked for the dirty pages back
4078 * it's possible that we may only get a few or even none, so...
4079 * before we start marching forward, we must make sure we know
4080 * where the last present page is in the UPL, otherwise we could
4081 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4082 * employed by commit_range and abort_range.
4083 */
4084 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4085 if (upl_page_present(pl, last_pg))
4086 break;
4087 }
4088 pages_in_upl = last_pg + 1;
4089
4090 if (pages_in_upl == 0) {
4091 ubc_upl_abort(upl, 0);
4092
4093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4094 return(0);
4095 }
4096
4097 for (last_pg = 0; last_pg < pages_in_upl; ) {
4098 /*
4099 * find the next dirty page in the UPL
4100 * this will become the first page in the
4101 * next I/O to generate
4102 */
4103 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4104 if (upl_dirty_page(pl, start_pg))
4105 break;
4106 if (upl_page_present(pl, start_pg))
4107 /*
4108 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4109 * just release these unchanged since we're not going
4110 * to steal them or change their state
4111 */
4112 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4113 }
4114 if (start_pg >= pages_in_upl)
4115 /*
4116 * done... no more dirty pages to push
4117 */
4118 break;
4119 if (start_pg > last_pg)
4120 /*
4121 * skipped over some non-dirty pages
4122 */
4123 size -= ((start_pg - last_pg) * PAGE_SIZE);
4124
4125 /*
4126 * find a range of dirty pages to write
4127 */
4128 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4129 if (!upl_dirty_page(pl, last_pg))
4130 break;
4131 }
4132 upl_offset = start_pg * PAGE_SIZE;
4133
4134 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4135
4136 io_flags = CL_THROTTLE | CL_COMMIT;
4137
4138 if ( !(flags & IO_SYNC))
4139 io_flags |= CL_ASYNC;
4140
4141 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4142 io_flags, (buf_t)NULL, (struct clios *)NULL);
4143
4144 if (error == 0 && retval)
4145 error = retval;
4146
4147 size -= io_size;
4148 }
4149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4150
4151 return(error);
4152 }
4153
4154
4155 /*
4156 * sparse_cluster_switch is called with the write behind lock held
4157 */
4158 static void
4159 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4160 {
4161 int cl_index;
4162
4163 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4164
4165 if (wbp->cl_scmap == NULL)
4166 wbp->cl_scdirty = 0;
4167
4168 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4169 int flags;
4170 struct cl_extent cl;
4171
4172 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4173
4174 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4175 if (flags & UPL_POP_DIRTY) {
4176 cl.e_addr = cl.b_addr + 1;
4177
4178 sparse_cluster_add(wbp, vp, &cl, EOF);
4179 }
4180 }
4181 }
4182 }
4183 wbp->cl_number = 0;
4184
4185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4186 }
4187
4188
4189 /*
4190 * sparse_cluster_push is called with the write behind lock held
4191 */
4192 static void
4193 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4194 {
4195 struct cl_extent cl;
4196 off_t offset;
4197 u_int length;
4198
4199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4200
4201 if (push_all)
4202 vfs_drt_control(&(wbp->cl_scmap), 1);
4203
4204 for (;;) {
4205 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4206 break;
4207
4208 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4209 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4210
4211 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4212
4213 cluster_push_x(vp, &cl, EOF, 0);
4214
4215 if (push_all == 0)
4216 break;
4217 }
4218 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4219 }
4220
4221
4222 /*
4223 * sparse_cluster_add is called with the write behind lock held
4224 */
4225 static void
4226 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4227 {
4228 u_int new_dirty;
4229 u_int length;
4230 off_t offset;
4231
4232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4233
4234 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4235 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4236
4237 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4238 /*
4239 * no room left in the map
4240 * only a partial update was done
4241 * push out some pages and try again
4242 */
4243 wbp->cl_scdirty += new_dirty;
4244
4245 sparse_cluster_push(wbp, vp, EOF, 0);
4246
4247 offset += (new_dirty * PAGE_SIZE_64);
4248 length -= (new_dirty * PAGE_SIZE);
4249 }
4250 wbp->cl_scdirty += new_dirty;
4251
4252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4253 }
4254
4255
4256 static int
4257 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4258 {
4259 upl_page_info_t *pl;
4260 upl_t upl;
4261 addr64_t ubc_paddr;
4262 kern_return_t kret;
4263 int error = 0;
4264 int did_read = 0;
4265 int abort_flags;
4266 int upl_flags;
4267
4268 upl_flags = UPL_SET_LITE;
4269 if (! (flags & CL_READ)) {
4270 /*
4271 * "write" operation: let the UPL subsystem know
4272 * that we intend to modify the buffer cache pages
4273 * we're gathering.
4274 */
4275 upl_flags |= UPL_WILL_MODIFY;
4276 }
4277
4278 kret = ubc_create_upl(vp,
4279 uio->uio_offset & ~PAGE_MASK_64,
4280 PAGE_SIZE,
4281 &upl,
4282 &pl,
4283 upl_flags);
4284
4285 if (kret != KERN_SUCCESS)
4286 return(EINVAL);
4287
4288 if (!upl_valid_page(pl, 0)) {
4289 /*
4290 * issue a synchronous read to cluster_io
4291 */
4292 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4293 CL_READ, (buf_t)NULL, (struct clios *)NULL);
4294 if (error) {
4295 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4296
4297 return(error);
4298 }
4299 did_read = 1;
4300 }
4301 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4302
4303 /*
4304 * NOTE: There is no prototype for the following in BSD. It, and the definitions
4305 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4306 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4307 * way to do so without exporting them to kexts as well.
4308 */
4309 if (flags & CL_READ)
4310 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4311 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
4312 else
4313 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4314 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
4315
4316 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4317 /*
4318 * issue a synchronous write to cluster_io
4319 */
4320 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4321 0, (buf_t)NULL, (struct clios *)NULL);
4322 }
4323 if (error == 0)
4324 uio_update(uio, (user_size_t)xsize);
4325
4326 if (did_read)
4327 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4328 else
4329 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4330
4331 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4332
4333 return (error);
4334 }
4335
4336
4337
4338 int
4339 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4340 {
4341 int pg_offset;
4342 int pg_index;
4343 int csize;
4344 int segflg;
4345 int retval = 0;
4346 upl_page_info_t *pl;
4347
4348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4349 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4350
4351 segflg = uio->uio_segflg;
4352
4353 switch(segflg) {
4354
4355 case UIO_USERSPACE32:
4356 case UIO_USERISPACE32:
4357 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4358 break;
4359
4360 case UIO_USERSPACE:
4361 case UIO_USERISPACE:
4362 uio->uio_segflg = UIO_PHYS_USERSPACE;
4363 break;
4364
4365 case UIO_USERSPACE64:
4366 case UIO_USERISPACE64:
4367 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4368 break;
4369
4370 case UIO_SYSSPACE32:
4371 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4372 break;
4373
4374 case UIO_SYSSPACE:
4375 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4376 break;
4377
4378 case UIO_SYSSPACE64:
4379 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4380 break;
4381 }
4382 pl = ubc_upl_pageinfo(upl);
4383
4384 pg_index = upl_offset / PAGE_SIZE;
4385 pg_offset = upl_offset & PAGE_MASK;
4386 csize = min(PAGE_SIZE - pg_offset, xsize);
4387
4388 while (xsize && retval == 0) {
4389 addr64_t paddr;
4390
4391 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4392
4393 retval = uiomove64(paddr, csize, uio);
4394
4395 pg_index += 1;
4396 pg_offset = 0;
4397 xsize -= csize;
4398 csize = min(PAGE_SIZE, xsize);
4399 }
4400 uio->uio_segflg = segflg;
4401
4402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4403 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4404
4405 return (retval);
4406 }
4407
4408
4409 int
4410 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4411 {
4412 int segflg;
4413 int io_size;
4414 int xsize;
4415 int start_offset;
4416 int retval = 0;
4417 memory_object_control_t control;
4418
4419
4420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4421 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4422
4423 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4424 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4426 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4427
4428 return(0);
4429 }
4430 segflg = uio->uio_segflg;
4431
4432 switch(segflg) {
4433
4434 case UIO_USERSPACE32:
4435 case UIO_USERISPACE32:
4436 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4437 break;
4438
4439 case UIO_USERSPACE64:
4440 case UIO_USERISPACE64:
4441 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4442 break;
4443
4444 case UIO_SYSSPACE32:
4445 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4446 break;
4447
4448 case UIO_SYSSPACE64:
4449 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4450 break;
4451
4452 case UIO_USERSPACE:
4453 case UIO_USERISPACE:
4454 uio->uio_segflg = UIO_PHYS_USERSPACE;
4455 break;
4456
4457 case UIO_SYSSPACE:
4458 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4459 break;
4460 }
4461
4462 if ( (io_size = *io_resid) ) {
4463 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4464 xsize = uio_resid(uio);
4465
4466 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4467 uio, start_offset, io_size, mark_dirty);
4468 xsize -= uio_resid(uio);
4469 io_size -= xsize;
4470 }
4471 uio->uio_segflg = segflg;
4472 *io_resid = io_size;
4473
4474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4475 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4476
4477 return(retval);
4478 }
4479
4480
4481 int
4482 is_file_clean(vnode_t vp, off_t filesize)
4483 {
4484 off_t f_offset;
4485 int flags;
4486 int total_dirty = 0;
4487
4488 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4489 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4490 if (flags & UPL_POP_DIRTY) {
4491 total_dirty++;
4492 }
4493 }
4494 }
4495 if (total_dirty)
4496 return(EINVAL);
4497
4498 return (0);
4499 }
4500
4501
4502
4503 /*
4504 * Dirty region tracking/clustering mechanism.
4505 *
4506 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4507 * dirty regions within a larger space (file). It is primarily intended to
4508 * support clustering in large files with many dirty areas.
4509 *
4510 * The implementation assumes that the dirty regions are pages.
4511 *
4512 * To represent dirty pages within the file, we store bit vectors in a
4513 * variable-size circular hash.
4514 */
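/*
 * Illustrative sketch (not part of the original source): the calling
 * pattern the sparse cluster code above uses, shown in isolation.
 * 'scmap', 'offset', 'length' and 'new_dirty' are hypothetical locals;
 * the vfs_drt_* signatures are the ones defined below.
 */
#if 0
static void
example_drt_usage(void)
{
	void	*scmap = NULL;		/* opaque map, allocated on first mark */
	off_t	offset;
	u_int	length;
	int	new_dirty;

	/* record a page-aligned dirty region (the map grows/compacts as needed) */
	(void) vfs_drt_mark_pages(&scmap, (off_t)0x100000, 3 * PAGE_SIZE, &new_dirty);

	/* pull back contiguous runs of dirty pages until the map drains */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* ... push pages in [offset, offset + length) ... */
	}
	/*
	 * scmap is freed and NULLed by the final vfs_drt_get_cluster failure;
	 * vfs_drt_control(&scmap, 0) would release it explicitly instead
	 */
}
#endif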
4515
4516 /*
4517 * Bitvector size. This determines the number of pages we group in a
4518 * single hashtable entry. Each hashtable entry is aligned to this
4519 * size within the file.
4520 */
4521 #define DRT_BITVECTOR_PAGES 256
4522
4523 /*
4524 * File offset handling.
4525 *
4526 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4527 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4528 */
4529 #define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4530 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
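/*
 * worked example: with DRT_BITVECTOR_PAGES == 256 and 4K pages, each
 * bucket covers 1MB of the file, so DRT_ALIGN_ADDRESS(0x012A3456) yields
 * 0x01200000 (the low 20 bits are cleared by DRT_ADDRESS_MASK)
 */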
4531
4532 /*
4533 * Hashtable address field handling.
4534 *
4535 * The low-order bits of the hashtable address are used to conserve
4536 * space.
4537 *
4538 * DRT_HASH_COUNT_MASK must be large enough to store the range
4539 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4540 * to indicate that the bucket is actually unoccupied.
4541 */
4542 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4543 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
4544 do { \
4545 (scm)->scm_hashtable[(i)].dhe_control = \
4546 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4547 } while (0)
4548 #define DRT_HASH_COUNT_MASK 0x1ff
4549 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4550 #define DRT_HASH_SET_COUNT(scm, i, c) \
4551 do { \
4552 (scm)->scm_hashtable[(i)].dhe_control = \
4553 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4554 } while (0)
4555 #define DRT_HASH_CLEAR(scm, i) \
4556 do { \
4557 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4558 } while (0)
4559 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4560 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
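/*
 * a bucket is marked vacant by setting its count field to DRT_HASH_COUNT_MASK
 * (0x1ff == 511), a value that can never be a legitimate page count since a
 * bucket covers at most DRT_BITVECTOR_PAGES (256) pages
 */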
4561 #define DRT_HASH_COPY(oscm, oi, scm, i) \
4562 do { \
4563 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4564 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4565 } while(0)
4566
4567
4568 /*
4569 * Hash table moduli.
4570 *
4571 * Since the hashtable entry's size is dependent on the size of
4572 * the bitvector, and since the hashtable size is constrained to
4573 * both being prime and fitting within the desired allocation
4574 * size, these values need to be manually determined.
4575 *
4576 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4577 *
4578 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4579 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4580 */
4581 #define DRT_HASH_SMALL_MODULUS 23
4582 #define DRT_HASH_LARGE_MODULUS 401
4583
4584 #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4585 #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
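/*
 * sizing check (informational): 23 * 40 == 920 bytes of hashtable in the
 * 1024-byte small allocation and 401 * 40 == 16040 bytes in the 16384-byte
 * large allocation; the 'spare' figures above are the allocation size minus
 * the hashtable alone and do not account for the clustermap header fields
 */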
4586
4587 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4588
4589 /*
4590 * Hashtable bitvector handling.
4591 *
4592 * Bitvector fields are 32 bits long.
4593 */
4594
4595 #define DRT_HASH_SET_BIT(scm, i, bit) \
4596 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4597
4598 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4599 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4600
4601 #define DRT_HASH_TEST_BIT(scm, i, bit) \
4602 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4603
4604 #define DRT_BITVECTOR_CLEAR(scm, i) \
4605 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4606
4607 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4608 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4609 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4610 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
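/*
 * worked example: page 37 within a bucket is tracked by
 * dhe_bitvector[37 / 32] == dhe_bitvector[1], bit (37 % 32) == 5
 */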
4611
4612
4613
4614 /*
4615 * Hashtable entry.
4616 */
4617 struct vfs_drt_hashentry {
4618 u_int64_t dhe_control;
4619 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4620 };
4621
4622 /*
4623 * Dirty Region Tracking structure.
4624 *
4625 * The hashtable is allocated entirely inside the DRT structure.
4626 *
4627 * The hash is a simple circular prime modulus arrangement, the structure
4628 * is resized from small to large if it overflows.
4629 */
4630
4631 struct vfs_drt_clustermap {
4632 u_int32_t scm_magic; /* sanity/detection */
4633 #define DRT_SCM_MAGIC 0x12020003
4634 u_int32_t scm_modulus; /* current ring size */
4635 u_int32_t scm_buckets; /* number of occupied buckets */
4636 u_int32_t scm_lastclean; /* last entry we cleaned */
4637 u_int32_t scm_iskips; /* number of slot skips */
4638
4639 struct vfs_drt_hashentry scm_hashtable[0];
4640 };
4641
4642
4643 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4644 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
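/*
 * worked example: buckets are keyed by the 1MB-aligned byte offset, so with
 * the small modulus an offset of 25MB (25 * 1048576 == 26214400) hashes to
 * slot 26214400 % 23 == 12, and DRT_HASH_NEXT wraps slot 22 back to slot 0
 */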
4645
4646 /*
4647 * Debugging codes and arguments.
4648 */
4649 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4650 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4651 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4652 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4653 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4654 * dirty */
4655 /* 0, setcount */
4656 /* 1 (clean, no map) */
4657 /* 2 (map alloc fail) */
4658 /* 3, resid (partial) */
4659 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4660 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4661 * lastclean, iskips */
4662
4663
4664 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4665 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4666 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4667 u_int64_t offset, int *indexp);
4668 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4669 u_int64_t offset,
4670 int *indexp,
4671 int recursed);
4672 static kern_return_t vfs_drt_do_mark_pages(
4673 void **cmapp,
4674 u_int64_t offset,
4675 u_int length,
4676 int *setcountp,
4677 int dirty);
4678 static void vfs_drt_trace(
4679 struct vfs_drt_clustermap *cmap,
4680 int code,
4681 int arg1,
4682 int arg2,
4683 int arg3,
4684 int arg4);
4685
4686
4687 /*
4688 * Allocate and initialise a sparse cluster map.
4689 *
4690 * Will allocate a new map, resize or compact an existing map.
4691 *
4692 * XXX we should probably have at least one intermediate map size,
4693 * as the 1:16 ratio seems a bit drastic.
4694 */
4695 static kern_return_t
4696 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4697 {
4698 struct vfs_drt_clustermap *cmap, *ocmap;
4699 kern_return_t kret;
4700 u_int64_t offset;
4701 int nsize, i, active_buckets, index, copycount;
4702
4703 ocmap = NULL;
4704 if (cmapp != NULL)
4705 ocmap = *cmapp;
4706
4707 /*
4708 * Decide on the size of the new map.
4709 */
4710 if (ocmap == NULL) {
4711 nsize = DRT_HASH_SMALL_MODULUS;
4712 } else {
4713 /* count the number of active buckets in the old map */
4714 active_buckets = 0;
4715 for (i = 0; i < ocmap->scm_modulus; i++) {
4716 if (!DRT_HASH_VACANT(ocmap, i) &&
4717 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4718 active_buckets++;
4719 }
4720 /*
4721 * If we're currently using the small allocation, check to
4722 * see whether we should grow to the large one.
4723 */
4724 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4725 /* if the ring is nearly full */
4726 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4727 nsize = DRT_HASH_LARGE_MODULUS;
4728 } else {
4729 nsize = DRT_HASH_SMALL_MODULUS;
4730 }
4731 } else {
4732 /* already using the large modulus */
4733 nsize = DRT_HASH_LARGE_MODULUS;
4734 /*
4735 * If the ring is completely full, there's
4736 * nothing useful for us to do. Behave as
4737 * though we had compacted into the new
4738 * array and return.
4739 */
4740 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4741 return(KERN_SUCCESS);
4742 }
4743 }
4744
4745 /*
4746 * Allocate and initialise the new map.
4747 */
4748
4749 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4750 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4751 if (kret != KERN_SUCCESS)
4752 return(kret);
4753 cmap->scm_magic = DRT_SCM_MAGIC;
4754 cmap->scm_modulus = nsize;
4755 cmap->scm_buckets = 0;
4756 cmap->scm_lastclean = 0;
4757 cmap->scm_iskips = 0;
4758 for (i = 0; i < cmap->scm_modulus; i++) {
4759 DRT_HASH_CLEAR(cmap, i);
4760 DRT_HASH_VACATE(cmap, i);
4761 DRT_BITVECTOR_CLEAR(cmap, i);
4762 }
4763
4764 /*
4765 * If there's an old map, re-hash entries from it into the new map.
4766 */
4767 copycount = 0;
4768 if (ocmap != NULL) {
4769 for (i = 0; i < ocmap->scm_modulus; i++) {
4770 /* skip empty buckets */
4771 if (DRT_HASH_VACANT(ocmap, i) ||
4772 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4773 continue;
4774 /* get new index */
4775 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4776 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4777 if (kret != KERN_SUCCESS) {
4778 /* XXX need to bail out gracefully here */
4779 panic("vfs_drt: new cluster map mysteriously too small");
4780 }
4781 /* copy */
4782 DRT_HASH_COPY(ocmap, i, cmap, index);
4783 copycount++;
4784 }
4785 }
4786
4787 /* log what we've done */
4788 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4789
4790 /*
4791 * It's important to ensure that *cmapp always points to
4792 * a valid map, so we must overwrite it before freeing
4793 * the old map.
4794 */
4795 *cmapp = cmap;
4796 if (ocmap != NULL) {
4797 /* emit stats into trace buffer */
4798 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4799 ocmap->scm_modulus,
4800 ocmap->scm_buckets,
4801 ocmap->scm_lastclean,
4802 ocmap->scm_iskips);
4803
4804 vfs_drt_free_map(ocmap);
4805 }
4806 return(KERN_SUCCESS);
4807 }
4808
4809
4810 /*
4811 * Free a sparse cluster map.
4812 */
4813 static kern_return_t
4814 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4815 {
4816 kmem_free(kernel_map, (vm_offset_t)cmap,
4817 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4818 return(KERN_SUCCESS);
4819 }
4820
4821
4822 /*
4823 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4824 */
4825 static kern_return_t
4826 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4827 {
4828 int index, i;
4829
4830 offset = DRT_ALIGN_ADDRESS(offset);
4831 index = DRT_HASH(cmap, offset);
4832
4833 /* traverse the hashtable */
4834 for (i = 0; i < cmap->scm_modulus; i++) {
4835
4836 /*
4837 * If the slot is vacant, we can stop.
4838 */
4839 if (DRT_HASH_VACANT(cmap, index))
4840 break;
4841
4842 /*
4843 * If the address matches our offset, we have success.
4844 */
4845 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4846 *indexp = index;
4847 return(KERN_SUCCESS);
4848 }
4849
4850 /*
4851 * Move to the next slot, try again.
4852 */
4853 index = DRT_HASH_NEXT(cmap, index);
4854 }
4855 /*
4856 * It's not there.
4857 */
4858 return(KERN_FAILURE);
4859 }
4860
4861 /*
4862 * Find the hashtable slot for the supplied offset. If we haven't allocated
4863 * one yet, allocate one and populate the address field. Note that it will
4864 * have a zero page count and thus will still technically be free, so
4865 * in the case where we are called to clean pages, the slot will remain free.
4866 */
4867 static kern_return_t
4868 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4869 {
4870 struct vfs_drt_clustermap *cmap;
4871 kern_return_t kret;
4872 int index, i;
4873
4874 cmap = *cmapp;
4875
4876 /* look for an existing entry */
4877 kret = vfs_drt_search_index(cmap, offset, indexp);
4878 if (kret == KERN_SUCCESS)
4879 return(kret);
4880
4881 /* need to allocate an entry */
4882 offset = DRT_ALIGN_ADDRESS(offset);
4883 index = DRT_HASH(cmap, offset);
4884
4885 /* scan from the index forwards looking for a vacant slot */
4886 for (i = 0; i < cmap->scm_modulus; i++) {
4887 /* slot vacant? */
4888 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4889 cmap->scm_buckets++;
4890 if (index < cmap->scm_lastclean)
4891 cmap->scm_lastclean = index;
4892 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4893 DRT_HASH_SET_COUNT(cmap, index, 0);
4894 DRT_BITVECTOR_CLEAR(cmap, index);
4895 *indexp = index;
4896 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4897 return(KERN_SUCCESS);
4898 }
4899 cmap->scm_iskips += i;
4900 index = DRT_HASH_NEXT(cmap, index);
4901 }
4902
4903 /*
4904 * We haven't found a vacant slot, so the map is full. If we're not
4905 * already recursed, try reallocating/compacting it.
4906 */
4907 if (recursed)
4908 return(KERN_FAILURE);
4909 kret = vfs_drt_alloc_map(cmapp);
4910 if (kret == KERN_SUCCESS) {
4911 /* now try to insert again */
4912 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4913 }
4914 return(kret);
4915 }
4916
4917 /*
4918 * Implementation of set dirty/clean.
4919 *
4920 * In the 'clean' case, not finding a map is OK.
4921 */
4922 static kern_return_t
4923 vfs_drt_do_mark_pages(
4924 void **private,
4925 u_int64_t offset,
4926 u_int length,
4927 int *setcountp,
4928 int dirty)
4929 {
4930 struct vfs_drt_clustermap *cmap, **cmapp;
4931 kern_return_t kret;
4932 int i, index, pgoff, pgcount, setcount, ecount;
4933
4934 cmapp = (struct vfs_drt_clustermap **)private;
4935 cmap = *cmapp;
4936
4937 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4938
4939 if (setcountp != NULL)
4940 *setcountp = 0;
4941
4942 /* allocate a cluster map if we don't already have one */
4943 if (cmap == NULL) {
4944 /* no cluster map, nothing to clean */
4945 if (!dirty) {
4946 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4947 return(KERN_SUCCESS);
4948 }
4949 kret = vfs_drt_alloc_map(cmapp);
4950 if (kret != KERN_SUCCESS) {
4951 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4952 return(kret);
4953 }
4954 }
4955 setcount = 0;
4956
4957 /*
4958 * Iterate over the length of the region.
4959 */
4960 while (length > 0) {
4961 /*
4962 * Get the hashtable index for this offset.
4963 *
4964 * XXX this will add blank entries if we are clearing a range
4965 * that hasn't been dirtied.
4966 */
4967 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4968 cmap = *cmapp; /* may have changed! */
4969 /* this may be a partial-success return */
4970 if (kret != KERN_SUCCESS) {
4971 if (setcountp != NULL)
4972 *setcountp = setcount;
4973 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4974
4975 return(kret);
4976 }
4977
4978 /*
4979 * Work out how many pages we're modifying in this
4980 * hashtable entry.
4981 */
4982 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4983 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4984
4985 /*
4986 * Iterate over pages, dirty/clearing as we go.
4987 */
4988 ecount = DRT_HASH_GET_COUNT(cmap, index);
4989 for (i = 0; i < pgcount; i++) {
4990 if (dirty) {
4991 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4992 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4993 ecount++;
4994 setcount++;
4995 }
4996 } else {
4997 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4998 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4999 ecount--;
5000 setcount++;
5001 }
5002 }
5003 }
5004 DRT_HASH_SET_COUNT(cmap, index, ecount);
5005
5006 offset += pgcount * PAGE_SIZE;
5007 length -= pgcount * PAGE_SIZE;
5008 }
5009 if (setcountp != NULL)
5010 *setcountp = setcount;
5011
5012 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5013
5014 return(KERN_SUCCESS);
5015 }
5016
5017 /*
5018 * Mark a set of pages as dirty/clean.
5019 *
5020 * This is a public interface.
5021 *
5022 * cmapp
5023 * Pointer to storage suitable for holding a pointer. Note that
5024 * this must either be NULL or a value set by this function.
5025 *
5026 * size
5027 * Current file size in bytes (not part of the current interface; see the XXX note below).
5028 *
5029 * offset
5030 * Offset of the first page to be marked as dirty, in bytes. Must be
5031 * page-aligned.
5032 *
5033 * length
5034 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5035 *
5036 * setcountp
5037 * Number of pages newly marked dirty by this call (optional).
5038 *
5039 * Returns KERN_SUCCESS if all the pages were successfully marked.
5040 */
5041 static kern_return_t
5042 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5043 {
5044 /* XXX size unused, drop from interface */
5045 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5046 }
5047
5048 #if 0
5049 static kern_return_t
5050 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5051 {
5052 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5053 }
5054 #endif
5055
5056 /*
5057 * Get a cluster of dirty pages.
5058 *
5059 * This is a public interface.
5060 *
5061 * cmapp
5062 * Pointer to storage managed by drt_mark_pages. Note that this must
5063 * be NULL or a value set by drt_mark_pages.
5064 *
5065 * offsetp
5066 * Returns the byte offset into the file of the first page in the cluster.
5067 *
5068 * lengthp
5069 * Returns the length in bytes of the cluster of dirty pages.
5070 *
5071 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
5072 * are no dirty pages meeting the minimum size criteria. Private storage will
5073 * be released if there are no more dirty pages left in the map.
5074 *
5075 */
5076 static kern_return_t
5077 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5078 {
5079 struct vfs_drt_clustermap *cmap;
5080 u_int64_t offset;
5081 u_int length;
5082 int index, i, j, fs, ls;
5083
5084 /* sanity */
5085 if ((cmapp == NULL) || (*cmapp == NULL))
5086 return(KERN_FAILURE);
5087 cmap = *cmapp;
5088
5089 /* walk the hashtable */
5090 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5091 index = DRT_HASH(cmap, offset);
5092
5093 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5094 continue;
5095
5096 /* scan the bitfield for a string of bits */
5097 fs = -1;
5098
5099 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5100 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5101 fs = i;
5102 break;
5103 }
5104 }
5105 if (fs == -1) {
5106 /* didn't find any bits set */
5107 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5108 }
5109 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5110 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5111 break;
5112 }
5113
5114 /* compute offset and length, mark pages clean */
5115 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5116 length = ls * PAGE_SIZE;
5117 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5118 cmap->scm_lastclean = index;
5119
5120 /* return successful */
5121 *offsetp = (off_t)offset;
5122 *lengthp = length;
5123
5124 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5125 return(KERN_SUCCESS);
5126 }
5127 /*
5128 * We didn't find anything... the hashtable is empty, so
5129 * emit stats into trace buffer and
5130 * then free it
5131 */
5132 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5133 cmap->scm_modulus,
5134 cmap->scm_buckets,
5135 cmap->scm_lastclean,
5136 cmap->scm_iskips);
5137
5138 vfs_drt_free_map(cmap);
5139 *cmapp = NULL;
5140
5141 return(KERN_FAILURE);
5142 }
5143
5144
5145 static kern_return_t
5146 vfs_drt_control(void **cmapp, int op_type)
5147 {
5148 struct vfs_drt_clustermap *cmap;
5149
5150 /* sanity */
5151 if ((cmapp == NULL) || (*cmapp == NULL))
5152 return(KERN_FAILURE);
5153 cmap = *cmapp;
5154
5155 switch (op_type) {
5156 case 0:
5157 /* emit stats into trace buffer */
5158 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5159 cmap->scm_modulus,
5160 cmap->scm_buckets,
5161 cmap->scm_lastclean,
5162 cmap->scm_iskips);
5163
5164 vfs_drt_free_map(cmap);
5165 *cmapp = NULL;
5166 break;
5167
5168 case 1:
5169 cmap->scm_lastclean = 0;
5170 break;
5171 }
5172 return(KERN_SUCCESS);
5173 }
5174
5175
5176
5177 /*
5178 * Emit a summary of the state of the clustermap into the trace buffer
5179 * along with some caller-provided data.
5180 */
5181 #if KDEBUG
5182 static void
5183 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5184 {
5185 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5186 }
5187 #else
5188 static void
5189 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5190 __unused int arg1, __unused int arg2, __unused int arg3,
5191 __unused int arg4)
5192 {
5193 }
5194 #endif
5195
5196 #if 0
5197 /*
5198 * Perform basic sanity check on the hash entry summary count
5199 * vs. the actual bits set in the entry.
5200 */
5201 static void
5202 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5203 {
5204 int index, i;
5205 int bits_on;
5206
5207 for (index = 0; index < cmap->scm_modulus; index++) {
5208 if (DRT_HASH_VACANT(cmap, index))
5209 continue;
5210
5211 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5212 if (DRT_HASH_TEST_BIT(cmap, index, i))
5213 bits_on++;
5214 }
5215 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5216 panic("bits_on = %d, index = %d\n", bits_on, index);
5217 }
5218 }
5219 #endif