bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*
  24  * Copyright (c) 1993
  25  *      The Regents of the University of California.  All rights reserved.
  26  *
  27  * Redistribution and use in source and binary forms, with or without
  28  * modification, are permitted provided that the following conditions
  29  * are met:
  30  * 1. Redistributions of source code must retain the above copyright
  31  *    notice, this list of conditions and the following disclaimer.
  32  * 2. Redistributions in binary form must reproduce the above copyright
  33  *    notice, this list of conditions and the following disclaimer in the
  34  *    documentation and/or other materials provided with the distribution.
  35  * 3. All advertising materials mentioning features or use of this software
  36  *    must display the following acknowledgement:
  37  *      This product includes software developed by the University of
  38  *      California, Berkeley and its contributors.
  39  * 4. Neither the name of the University nor the names of its contributors
  40  *    may be used to endorse or promote products derived from this software
  41  *    without specific prior written permission.
  42  *
  43  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  45  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  46  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  47  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  48  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  49  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  50  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  51  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  52  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  53  * SUCH DAMAGE.
  54  *
  55  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  56  */
  57
  58 #include <sys/param.h>
  59 #include <sys/proc_internal.h>
  60 #include <sys/buf_internal.h>
  61 #include <sys/mount_internal.h>
  62 #include <sys/vnode_internal.h>
  63 #include <sys/trace.h>
  64 #include <sys/malloc.h>
  65 #include <sys/time.h>
  66 #include <sys/kernel.h>
  67 #include <sys/resourcevar.h>
  68 #include <sys/uio_internal.h>
  69 #include <libkern/libkern.h>
  70 #include <machine/machine_routines.h>
  71
  72 #include <sys/ubc_internal.h>
  73
  74 #include <mach/mach_types.h>
  75 #include <mach/memory_object_types.h>
  76 #include <mach/vm_map.h>
  77 #include <mach/upl.h>
  78
  79 #include <vm/vm_kern.h>
  80 #include <vm/vm_map.h>
  81 #include <vm/vm_pageout.h>
  82
  83 #include <sys/kdebug.h>
  84
  85
  86 #define CL_READ      0x01
  87 #define CL_ASYNC     0x02
  88 #define CL_COMMIT    0x04
  89 #define CL_PAGEOUT   0x10
  90 #define CL_AGE       0x20
  91 #define CL_DUMP      0x40
  92 #define CL_NOZERO    0x80
  93 #define CL_PAGEIN    0x100
  94 #define CL_DEV_MEMORY 0x200
  95 #define CL_PRESERVE   0x400
  96 #define CL_THROTTLE   0x800
  97 #define CL_KEEPCACHED 0x1000
  98
  99
 100 struct clios {
 101         u_int  io_completed;       /* amount of io that has currently completed */
 102         u_int  io_issued;          /* amount of io that was successfully issued */
 103         int    io_error;           /* error code of first error encountered */
 104         int    io_wanted;          /* someone is sleeping waiting for a change in state */
 105 };
 106
 107 static lck_grp_t        *cl_mtx_grp;
 108 static lck_attr_t       *cl_mtx_attr;
 109 static lck_grp_attr_t   *cl_mtx_grp_attr;
 110 static lck_mtx_t        *cl_mtxp;
 111
 112
 113 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 114                       int flags, buf_t real_bp, struct clios *iostate);
 115 static int cluster_iodone(buf_t bp, void *dummy);
 116 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
 117 static int cluster_hard_throttle_on(vnode_t vp);
 118
 119 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
 120 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
 121                            off_t headOff, off_t tailOff, int flags);
 122 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
 123 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
 124 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
 125 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
 126 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
 127
 128 static void     cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
 129
 130 static int      cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
 131 static void     cluster_push_EOF(vnode_t vp, off_t EOF);
 132
 133 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
 134
 135 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
 136 static void     sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
 137 static void     sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
 138
 139 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
 140 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 141 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 142
 143 int     is_file_clean(vnode_t, off_t);
 144
 145 /*
 146  * throttle the number of async writes that
 147  * can be outstanding on a single vnode
 148  * before we issue a synchronous write
 149  */
 150 #define HARD_THROTTLE_MAXCNT    0
 151 #define HARD_THROTTLE_MAXSIZE   (64 * 1024)
 152
 153 int hard_throttle_on_root = 0;
 154 struct timeval priority_IO_timestamp_for_root;
 155
 156
 157 void
 158 cluster_init(void) {
 159         /*
 160          * allocate lock group attribute and group
 161          */
 162         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 163         //lck_grp_attr_setstat(cl_mtx_grp_attr);
 164         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 165
 166         /*
 167          * allocate the lock attribute
 168          */
 169         cl_mtx_attr = lck_attr_alloc_init();
 170         //lck_attr_setdebug(clf_mtx_attr);
 171
 172         /*
 173          * allocate and initialize mutex's used to protect updates and waits
 174          * on the cluster_io context
 175          */
 176         cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 177
 178         if (cl_mtxp == NULL)
 179                 panic("cluster_init: failed to allocate cl_mtxp");
 180 }
 181
 182
 183
 184 #define CLW_ALLOCATE            0x01
 185 #define CLW_RETURNLOCKED        0x02
 186 /*
 187  * if the read ahead context doesn't yet exist,
 188  * allocate and initialize it...
 189  * the vnode lock serializes multiple callers
 190  * during the actual assignment... first one
 191  * to grab the lock wins... the other callers
 192  * will release the now unnecessary storage
 193  *
 194  * once the context is present, try to grab (but don't block on)
 195  * the lock associated with it... if someone
 196  * else currently owns it, than the read
 197  * will run without read-ahead.  this allows
 198  * multiple readers to run in parallel and
 199  * since there's only 1 read ahead context,
 200  * there's no real loss in only allowing 1
 201  * reader to have read-ahead enabled.
 202  */
 203 static struct cl_readahead *
 204 cluster_get_rap(vnode_t vp)
 205 {
 206         struct ubc_info         *ubc;
 207         struct cl_readahead     *rap;
 208
 209         ubc = vp->v_ubcinfo;
 210
 211         if ((rap = ubc->cl_rahead) == NULL) {
 212                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 213
 214                 bzero(rap, sizeof *rap);
 215                 rap->cl_lastr = -1;
 216                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 217
 218                 vnode_lock(vp);
 219
 220                 if (ubc->cl_rahead == NULL)
 221                         ubc->cl_rahead = rap;
 222                 else {
 223                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 224                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 225                                 rap = ubc->cl_rahead;
 226                 }
 227                 vnode_unlock(vp);
 228         }
 229         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 230                 return(rap);
 231
 232         return ((struct cl_readahead *)NULL);
 233 }
 234
 235
 236 /*
 237  * if the write behind context doesn't yet exist,
 238  * and CLW_ALLOCATE is specified, allocate and initialize it...
 239  * the vnode lock serializes multiple callers
 240  * during the actual assignment... first one
 241  * to grab the lock wins... the other callers
 242  * will release the now unnecessary storage
 243  *
 244  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 245  * the lock associated with the write behind context before
 246  * returning
 247  */
 248
 249 static struct cl_writebehind *
 250 cluster_get_wbp(vnode_t vp, int flags)
 251 {
 252         struct ubc_info *ubc;
 253         struct cl_writebehind *wbp;
 254
 255         ubc = vp->v_ubcinfo;
 256
 257         if ((wbp = ubc->cl_wbehind) == NULL) {
 258
 259                 if ( !(flags & CLW_ALLOCATE))
 260                         return ((struct cl_writebehind *)NULL);
 261
 262                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 263
 264                 bzero(wbp, sizeof *wbp);
 265                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 266
 267                 vnode_lock(vp);
 268
 269                 if (ubc->cl_wbehind == NULL)
 270                         ubc->cl_wbehind = wbp;
 271                 else {
 272                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 273                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 274                                 wbp = ubc->cl_wbehind;
 275                 }
 276                 vnode_unlock(vp);
 277         }
 278         if (flags & CLW_RETURNLOCKED)
 279                 lck_mtx_lock(&wbp->cl_lockw);
 280
 281         return (wbp);
 282 }
 283
 284
 285 static int
 286 cluster_hard_throttle_on(vnode_t vp)
 287 {
 288         static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
 289
 290         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 291                 struct timeval elapsed;
 292
 293                 if (hard_throttle_on_root)
 294                         return(1);
 295
 296                 microuptime(&elapsed);
 297                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 298
 299                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 300                         return(1);
 301         }
 302         return(0);
 303 }
 304
 305
 306 static int
 307 cluster_iodone(buf_t bp, __unused void *dummy)
 308 {
 309         int     b_flags;
 310         int     error;
 311         int     total_size;
 312         int     total_resid;
 313         int     upl_offset;
 314         int     zero_offset;
 315         upl_t   upl;
 316         buf_t   cbp;
 317         buf_t   cbp_head;
 318         buf_t   cbp_next;
 319         buf_t   real_bp;
 320         struct  clios *iostate;
 321         int     commit_size;
 322         int     pg_offset;
 323
 324         cbp_head = (buf_t)(bp->b_trans_head);
 325
 326         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 327                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 328
 329         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 330                 /*
 331                  * all I/O requests that are part of this transaction
 332                  * have to complete before we can process it
 333                  */
 334                 if ( !(cbp->b_flags & B_DONE)) {
 335
 336                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 337                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
 338
 339                         return 0;
 340                 }
 341         }
 342         error       = 0;
 343         total_size  = 0;
 344         total_resid = 0;
 345
 346         cbp        = cbp_head;
 347         upl_offset = cbp->b_uploffset;
 348         upl        = cbp->b_upl;
 349         b_flags    = cbp->b_flags;
 350         real_bp    = cbp->b_real_bp;
 351         zero_offset= cbp->b_validend;
 352         iostate    = (struct clios *)cbp->b_iostate;
 353
 354         if (real_bp)
 355                 real_bp->b_dev = cbp->b_dev;
 356
 357         while (cbp) {
 358                 if ((cbp->b_flags & B_ERROR) && error == 0)
 359                         error = cbp->b_error;
 360
 361                 total_resid += cbp->b_resid;
 362                 total_size  += cbp->b_bcount;
 363
 364                 cbp_next = cbp->b_trans_next;
 365
 366                 free_io_buf(cbp);
 367
 368                 cbp = cbp_next;
 369         }
 370         if (zero_offset)
 371                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 372
 373         if (iostate) {
 374                 int need_wakeup = 0;
 375
 376                 /*
 377                  * someone has issued multiple I/Os asynchrounsly
 378                  * and is waiting for them to complete (streaming)
 379                  */
 380                 lck_mtx_lock(cl_mtxp);
 381
 382                 if (error && iostate->io_error == 0)
 383                         iostate->io_error = error;
 384
 385                 iostate->io_completed += total_size;
 386
 387                 if (iostate->io_wanted) {
 388                         /*
 389                          * someone is waiting for the state of
 390                          * this io stream to change
 391                          */
 392                         iostate->io_wanted = 0;
 393                         need_wakeup = 1;
 394                 }
 395                 lck_mtx_unlock(cl_mtxp);
 396
 397                 if (need_wakeup)
 398                         wakeup((caddr_t)&iostate->io_wanted);
 399         }
 400         if ((b_flags & B_NEED_IODONE) && real_bp) {
 401                 if (error) {
 402                         real_bp->b_flags |= B_ERROR;
 403                         real_bp->b_error = error;
 404                 }
 405                 real_bp->b_resid = total_resid;
 406
 407                 buf_biodone(real_bp);
 408         }
 409         if (error == 0 && total_resid)
 410                 error = EIO;
 411
 412         if (b_flags & B_COMMIT_UPL) {
 413                 pg_offset   = upl_offset & PAGE_MASK;
 414                 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 415
 416                 if (error || (b_flags & B_NOCACHE)) {
 417                         int upl_abort_code;
 418                         int page_in  = 0;
 419                         int page_out = 0;
 420
 421                         if (b_flags & B_PAGEIO) {
 422                                 if (b_flags & B_READ)
 423                                         page_in  = 1;
 424                                 else
 425                                         page_out = 1;
 426                         }
 427                         if (b_flags & B_CACHE)          /* leave pages in the cache unchanged on error */
 428                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 429                         else if (page_out && (error != ENXIO)) /* transient error */
 430                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 431                         else if (page_in)
 432                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 433                         else
 434                                 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 435
 436                         ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
 437                                                     upl_abort_code);
 438
 439                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 440                                      (int)upl, upl_offset - pg_offset, commit_size,
 441                                      0x80000000|upl_abort_code, 0);
 442
 443                 } else {
 444                         int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
 445
 446                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
 447                                 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
 448
 449                         if (b_flags & B_AGE)
 450                                 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
 451
 452                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
 453                                         upl_commit_flags);
 454
 455                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 456                                      (int)upl, upl_offset - pg_offset, commit_size,
 457                                      upl_commit_flags, 0);
 458                 }
 459         } else {
 460                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 461                              (int)upl, upl_offset, 0, error, 0);
 462         }
 463
 464         return (error);
 465 }
 466
 467
 468 void
 469 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
 470 {
 471         upl_page_info_t *pl;
 472
 473         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 474                      upl_offset, size, (int)bp, 0, 0);
 475
 476         if (bp == NULL || bp->b_datap == 0) {
 477
 478                 pl = ubc_upl_pageinfo(upl);
 479
 480                 while (size) {
 481                         int           page_offset;
 482                         int           page_index;
 483                         addr64_t      zero_addr;
 484                         int           zero_cnt;
 485
 486                         page_index  = upl_offset / PAGE_SIZE;
 487                         page_offset = upl_offset & PAGE_MASK;
 488
 489                         zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
 490                         zero_cnt  = min(PAGE_SIZE - page_offset, size);
 491
 492                         bzero_phys(zero_addr, zero_cnt);
 493
 494                         size       -= zero_cnt;
 495                         upl_offset += zero_cnt;
 496                 }
 497         } else
 498                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 499
 500         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 501                      upl_offset, size, 0, 0, 0);
 502 }
 503
 504
 505 static int
 506 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 507            int flags, buf_t real_bp, struct clios *iostate)
 508 {
 509         buf_t   cbp;
 510         u_int   size;
 511         u_int   io_size;
 512         int     io_flags;
 513         int     bmap_flags;
 514         int     error = 0;
 515         int     retval = 0;
 516         buf_t   cbp_head = NULL;
 517         buf_t   cbp_tail = NULL;
 518         int     trans_count = 0;
 519         u_int   pg_count;
 520         int     pg_offset;
 521         u_int   max_iosize;
 522         u_int   max_vectors;
 523         int     priv;
 524         int     zero_offset = 0;
 525         int     async_throttle = 0;
 526         mount_t mp;
 527
 528         mp = vp->v_mount;
 529
 530         if (mp->mnt_devblocksize > 1) {
 531                 /*
 532                  * round the requested size up so that this I/O ends on a
 533                  * page boundary in case this is a 'write'... if the filesystem
 534                  * has blocks allocated to back the page beyond the EOF, we want to
 535                  * make sure to write out the zero's that are sitting beyond the EOF
 536                  * so that in case the filesystem doesn't explicitly zero this area
 537                  * if a hole is created via a lseek/write beyond the current EOF,
 538                  * it will return zeros when it's read back from the disk.  If the
 539                  * physical allocation doesn't extend for the whole page, we'll
 540                  * only write/read from the disk up to the end of this allocation
 541                  * via the extent info returned from the VNOP_BLOCKMAP call.
 542                  */
 543                 pg_offset = upl_offset & PAGE_MASK;
 544
 545                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
 546         } else {
 547                 /*
 548                  * anyone advertising a blocksize of 1 byte probably
 549                  * can't deal with us rounding up the request size
 550                  * AFP is one such filesystem/device
 551                  */
 552                 size = non_rounded_size;
 553         }
 554         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
 555                      (int)f_offset, size, upl_offset, flags, 0);
 556
 557         if (flags & CL_READ) {
 558                 io_flags = (B_READ);
 559                 bmap_flags = VNODE_READ;
 560
 561                 max_iosize  = mp->mnt_maxreadcnt;
 562                 max_vectors = mp->mnt_segreadcnt;
 563         } else {
 564                 io_flags = 0;
 565                 bmap_flags = VNODE_WRITE;
 566
 567                 max_iosize  = mp->mnt_maxwritecnt;
 568                 max_vectors = mp->mnt_segwritecnt;
 569         }
 570         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
 571
 572         /*
 573          * make sure the maximum iosize is a
 574          * multiple of the page size
 575          */
 576         max_iosize  &= ~PAGE_MASK;
 577
 578         if (flags & CL_THROTTLE) {
 579                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 580                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 581                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 582                         async_throttle = HARD_THROTTLE_MAXCNT;
 583                 } else
 584                         async_throttle = VNODE_ASYNC_THROTTLE;
 585         }
 586         if (flags & CL_AGE)
 587                 io_flags |= B_AGE;
 588         if (flags & CL_DUMP)
 589                 io_flags |= B_NOCACHE;
 590         if (flags & (CL_PAGEIN | CL_PAGEOUT))
 591                 io_flags |= B_PAGEIO;
 592         if (flags & CL_COMMIT)
 593                 io_flags |= B_COMMIT_UPL;
 594         if (flags & CL_PRESERVE)
 595                 io_flags |= B_PHYS;
 596         if (flags & CL_KEEPCACHED)
 597                 io_flags |= B_CACHE;
 598
 599         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 600                 /*
 601                  * then we are going to end up
 602                  * with a page that we can't complete (the file size wasn't a multiple
 603                  * of PAGE_SIZE and we're trying to read to the end of the file
 604                  * so we'll go ahead and zero out the portion of the page we can't
 605                  * read in from the file
 606                  */
 607                 zero_offset = upl_offset + non_rounded_size;
 608         }
 609         while (size) {
 610                 int     pg_resid;
 611                 daddr64_t blkno;
 612                 daddr64_t lblkno;
 613
 614                 if (size > max_iosize)
 615                         io_size = max_iosize;
 616                 else
 617                         io_size = size;
 618
 619                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
 620                         break;
 621                 }
 622                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
 623                         real_bp->b_blkno = blkno;
 624
 625                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 626                              (int)f_offset, (int)blkno, io_size, zero_offset, 0);
 627
 628                 if (io_size == 0) {
 629                         /*
 630                          * vnop_blockmap didn't return an error... however, it did
 631                          * return an extent size of 0 which means we can't
 632                          * make forward progress on this I/O... a hole in the
 633                          * file would be returned as a blkno of -1 with a non-zero io_size
 634                          * a real extent is returned with a blkno != -1 and a non-zero io_size
 635                          */
 636                         error = EINVAL;
 637                         break;
 638                 }
 639                 if ( !(flags & CL_READ) && blkno == -1) {
 640                         off_t e_offset;
 641
 642                         /*
 643                          * we're writing into a 'hole'
 644                          */
 645                         if (flags & CL_PAGEOUT) {
 646                                 /*
 647                                  * if we got here via cluster_pageout
 648                                  * then just error the request and return
 649                                  * the 'hole' should already have been covered
 650                                  */
 651                                 error = EINVAL;
 652                                 break;
 653                         }
 654                         if ( !(flags & CL_COMMIT)) {
 655                                 /*
 656                                  * currently writes always request the commit to happen
 657                                  * as part of the io completion... however, if the CL_COMMIT
 658                                  * flag isn't specified, than we can't issue the abort_range
 659                                  * since the call site is going to abort or commit the same upl..
 660                                  * in this case we can only return an error
 661                                  */
 662                                 error = EINVAL;
 663                                 break;
 664                         }
 665                         /*
 666                          * we can get here if the cluster code happens to
 667                          * pick up a page that was dirtied via mmap vs
 668                          * a 'write' and the page targets a 'hole'...
 669                          * i.e. the writes to the cluster were sparse
 670                          * and the file was being written for the first time
 671                          *
 672                          * we can also get here if the filesystem supports
 673                          * 'holes' that are less than PAGE_SIZE.... because
 674                          * we can't know if the range in the page that covers
 675                          * the 'hole' has been dirtied via an mmap or not,
 676                          * we have to assume the worst and try to push the
 677                          * entire page to storage.
 678                          *
 679                          * Try paging out the page individually before
 680                          * giving up entirely and dumping it (the pageout
 681                          * path will insure that the zero extent accounting
 682                          * has been taken care of before we get back into cluster_io)
 683                          */
 684                         ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
 685
 686                         e_offset = round_page_64(f_offset + 1);
 687
 688                         if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
 689                                 error = EINVAL;
 690                                 break;
 691                         }
 692                         io_size = e_offset - f_offset;
 693
 694                         f_offset   += io_size;
 695                         upl_offset += io_size;
 696
 697                         if (size >= io_size)
 698                                 size -= io_size;
 699                         else
 700                                 size = 0;
 701                         /*
 702                          * keep track of how much of the original request
 703                          * that we've actually completed... non_rounded_size
 704                          * may go negative due to us rounding the request
 705                          * to a page size multiple (i.e.  size > non_rounded_size)
 706                          */
 707                         non_rounded_size -= io_size;
 708
 709                         if (non_rounded_size <= 0) {
 710                                 /*
 711                                  * we've transferred all of the data in the original
 712                                  * request, but we were unable to complete the tail
 713                                  * of the last page because the file didn't have
 714                                  * an allocation to back that portion... this is ok.
 715                                  */
 716                                 size = 0;
 717                         }
 718                         continue;
 719                 }
 720                 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
 721                 /*
 722                  * we have now figured out how much I/O we can do - this is in 'io_size'
 723                  * pg_offset is the starting point in the first page for the I/O
 724                  * pg_count is the number of full and partial pages that 'io_size' encompasses
 725                  */
 726                 pg_offset = upl_offset & PAGE_MASK;
 727
 728                 if (flags & CL_DEV_MEMORY) {
 729                         /*
 730                          * currently, can't deal with reading 'holes' in file
 731                          */
 732                         if (blkno == -1) {
 733                                 error = EINVAL;
 734                                 break;
 735                         }
 736                         /*
 737                          * treat physical requests as one 'giant' page
 738                          */
 739                         pg_count = 1;
 740                 } else
 741                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
 742
 743                 if ((flags & CL_READ) && blkno == -1) {
 744                         int bytes_to_zero;
 745
 746                         /*
 747                          * if we're reading and blkno == -1, then we've got a
 748                          * 'hole' in the file that we need to deal with by zeroing
 749                          * out the affected area in the upl
 750                          */
 751                         if (zero_offset && io_size == size) {
 752                                 /*
 753                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 754                                  * than 'zero_offset' will be non-zero
 755                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
 756                                  * (indicated by the io_size finishing off the I/O request for this UPL)
 757                                  * than we're not going to issue an I/O for the
 758                                  * last page in this upl... we need to zero both the hole and the tail
 759                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
 760                                  */
 761                                 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
 762
 763                                 zero_offset = 0;
 764                         } else
 765                                 bytes_to_zero = io_size;
 766
 767                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
 768
 769                         if (cbp_head)
 770                                 /*
 771                                  * if there is a current I/O chain pending
 772                                  * then the first page of the group we just zero'd
 773                                  * will be handled by the I/O completion if the zero
 774                                  * fill started in the middle of the page
 775                                  */
 776                                 pg_count = (io_size - pg_offset) / PAGE_SIZE;
 777                         else {
 778                                 /*
 779                                  * no pending I/O to pick up that first page
 780                                  * so, we have to make sure it gets committed
 781                                  * here.
 782                                  * set the pg_offset to 0 so that the upl_commit_range
 783                                  * starts with this page
 784                                  */
 785                                 pg_count = (io_size + pg_offset) / PAGE_SIZE;
 786                                 pg_offset = 0;
 787                         }
 788                         if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
 789                                 /*
 790                                  * if we're done with the request for this UPL
 791                                  * then we have to make sure to commit the last page
 792                                  * even if we only partially zero-filled it
 793                                  */
 794                                 pg_count++;
 795
 796                         if (pg_count) {
 797                                 if (pg_offset)
 798                                         pg_resid = PAGE_SIZE - pg_offset;
 799                                 else
 800                                         pg_resid = 0;
 801
 802                                 if (flags & CL_COMMIT)
 803                                         ubc_upl_commit_range(upl,
 804                                                         (upl_offset + pg_resid) & ~PAGE_MASK,
 805                                                         pg_count * PAGE_SIZE,
 806                                                         UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
 807                         }
 808                         upl_offset += io_size;
 809                         f_offset   += io_size;
 810                         size       -= io_size;
 811                         /*
 812                          * keep track of how much of the original request
 813                          * that we've actually completed... non_rounded_size
 814                          * may go negative due to us rounding the request
 815                          * to a page size multiple (i.e.  size > non_rounded_size)
 816                          */
 817                         non_rounded_size -= io_size;
 818
 819                         if (non_rounded_size <= 0) {
 820                                 /*
 821                                  * we've transferred all of the data in the original
 822                                  * request, but we were unable to complete the tail
 823                                  * of the last page because the file didn't have
 824                                  * an allocation to back that portion... this is ok.
 825                                  */
 826                                 size = 0;
 827                         }
 828                         if (cbp_head && pg_count)
 829                                 goto start_io;
 830                         continue;
 831
 832                 }
 833                 if (pg_count > max_vectors) {
 834                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
 835                                 io_size = PAGE_SIZE - pg_offset;
 836                                 pg_count = 1;
 837                         } else {
 838                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
 839                                 pg_count = max_vectors;
 840                         }
 841                 }
 842
 843                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
 844                         /*
 845                          * if we're not targeting a virtual device i.e. a disk image
 846                          * it's safe to dip into the reserve pool since real devices
 847                          * can complete this I/O request without requiring additional
 848                          * bufs from the alloc_io_buf pool
 849                          */
 850                         priv = 1;
 851                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
 852                         /*
 853                          * Throttle the speculative IO
 854                          */
 855                         priv = 0;
 856                 else
 857                         priv = 1;
 858
 859                 cbp = alloc_io_buf(vp, priv);
 860
 861                 if (flags & CL_PAGEOUT) {
 862                         u_int i;
 863
 864                         for (i = 0; i < pg_count; i++) {
 865                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
 866                                         panic("BUSY bp found in cluster_io");
 867                         }
 868                 }
 869                 if (flags & CL_ASYNC) {
 870                         if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
 871                                 panic("buf_setcallback failed\n");
 872                 }
 873                 cbp->b_flags |= io_flags;
 874
 875                 cbp->b_lblkno = lblkno;
 876                 cbp->b_blkno  = blkno;
 877                 cbp->b_bcount = io_size;
 878
 879                 if (buf_setupl(cbp, upl, upl_offset))
 880                         panic("buf_setupl failed\n");
 881
 882                 cbp->b_trans_next = (buf_t)NULL;
 883
 884                 if ((cbp->b_iostate = (void *)iostate))
 885                         /*
 886                          * caller wants to track the state of this
 887                          * io... bump the amount issued against this stream
 888                          */
 889                         iostate->io_issued += io_size;
 890
 891                 if (flags & CL_READ) {
 892                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
 893                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 894                 }
 895                 else {
 896                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
 897                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
 898                 }
 899
 900                 if (cbp_head) {
 901                         cbp_tail->b_trans_next = cbp;
 902                         cbp_tail = cbp;
 903                 } else {
 904                         cbp_head = cbp;
 905                         cbp_tail = cbp;
 906                 }
 907                 (buf_t)(cbp->b_trans_head) = cbp_head;
 908                 trans_count++;
 909
 910                 upl_offset += io_size;
 911                 f_offset   += io_size;
 912                 size       -= io_size;
 913                 /*
 914                  * keep track of how much of the original request
 915                  * that we've actually completed... non_rounded_size
 916                  * may go negative due to us rounding the request
 917                  * to a page size multiple (i.e.  size > non_rounded_size)
 918                  */
 919                 non_rounded_size -= io_size;
 920
 921                 if (non_rounded_size <= 0) {
 922                         /*
 923                          * we've transferred all of the data in the original
 924                          * request, but we were unable to complete the tail
 925                          * of the last page because the file didn't have
 926                          * an allocation to back that portion... this is ok.
 927                          */
 928                         size = 0;
 929                 }
 930                 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
 931                         /*
 932                          * if we have no more I/O to issue or
 933                          * the current I/O we've prepared fully
 934                          * completes the last page in this request
 935                          * and it's either an ASYNC request or
 936                          * we've already accumulated more than 8 I/O's into
 937                          * this transaction and it's not an I/O directed to
 938                          * special DEVICE memory
 939                          * then go ahead and issue the I/O
 940                          */
 941 start_io:
 942                         if (real_bp) {
 943                                 cbp_head->b_flags |= B_NEED_IODONE;
 944                                 cbp_head->b_real_bp = real_bp;
 945                         } else
 946                                 cbp_head->b_real_bp = (buf_t)NULL;
 947
 948                         if (size == 0) {
 949                                 /*
 950                                  * we're about to issue the last I/O for this upl
 951                                  * if this was a read to the eof and the eof doesn't
 952                                  * finish on a page boundary, than we need to zero-fill
 953                                  * the rest of the page....
 954                                  */
 955                                 cbp_head->b_validend = zero_offset;
 956                         } else
 957                                 cbp_head->b_validend = 0;
 958
 959                         if (flags & CL_THROTTLE)
 960                                 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
 961
 962                         for (cbp = cbp_head; cbp;) {
 963                                 buf_t   cbp_next;
 964
 965                                 if ( !(io_flags & B_READ))
 966                                         vnode_startwrite(vp);
 967
 968                                 cbp_next = cbp->b_trans_next;
 969
 970                                 (void) VNOP_STRATEGY(cbp);
 971                                 cbp = cbp_next;
 972                         }
 973                         if ( !(flags & CL_ASYNC)) {
 974                                 int dummy;
 975
 976                                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 977                                         buf_biowait(cbp);
 978
 979                                 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
 980                                         if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
 981                                                 error = 0;      /* drop the error */
 982                                         else {
 983                                                 if (retval == 0)
 984                                                         retval = error;
 985                                                 error = 0;
 986                                         }
 987                                 }
 988                         }
 989                         cbp_head = (buf_t)NULL;
 990                         cbp_tail = (buf_t)NULL;
 991
 992                         trans_count = 0;
 993                 }
 994         }
 995         if (error) {
 996                 int abort_size;
 997
 998                 io_size = 0;
 999
1000                 for (cbp = cbp_head; cbp;) {
1001                         buf_t   cbp_next;
1002
1003                         upl_offset -= cbp->b_bcount;
1004                         size       += cbp->b_bcount;
1005                         io_size    += cbp->b_bcount;
1006
1007                         cbp_next = cbp->b_trans_next;
1008                         free_io_buf(cbp);
1009                         cbp = cbp_next;
1010                 }
1011                 if (iostate) {
1012                         int need_wakeup = 0;
1013
1014                         /*
1015                          * update the error condition for this stream
1016                          * since we never really issued the io
1017                          * just go ahead and adjust it back
1018                          */
1019                         lck_mtx_lock(cl_mtxp);
1020
1021                         if (iostate->io_error == 0)
1022                                 iostate->io_error = error;
1023                         iostate->io_issued -= io_size;
1024
1025                         if (iostate->io_wanted) {
1026                                 /*
1027                                  * someone is waiting for the state of
1028                                  * this io stream to change
1029                                  */
1030                                 iostate->io_wanted = 0;
1031                                 need_wakeup = 0;
1032                         }
1033                         lck_mtx_unlock(cl_mtxp);
1034
1035                         if (need_wakeup)
1036                                 wakeup((caddr_t)&iostate->io_wanted);
1037                 }
1038                 pg_offset  = upl_offset & PAGE_MASK;
1039                 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1040
1041                 if (flags & CL_COMMIT) {
1042                         int upl_abort_code;
1043
1044                         if (flags & CL_PRESERVE) {
1045                                 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1046                                                      UPL_COMMIT_FREE_ON_EMPTY);
1047                         } else {
1048                                 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1049                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1050                                 else if (flags & CL_PAGEIN)
1051                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1052                                 else
1053                                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1054
1055                                 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1056                                                 upl_abort_code);
1057                         }
1058                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1059                                      (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1060                 }
1061                 if (real_bp) {
1062                         real_bp->b_flags |= B_ERROR;
1063                         real_bp->b_error  = error;
1064
1065                         buf_biodone(real_bp);
1066                 }
1067                 if (retval == 0)
1068                         retval = error;
1069         }
1070         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1071                      (int)f_offset, size, upl_offset, retval, 0);
1072
1073         return (retval);
1074 }
1075
1076
1077 static int
1078 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1079 {
1080         int           pages_in_prefetch;
1081
1082         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1083                      (int)f_offset, size, (int)filesize, 0, 0);
1084
1085         if (f_offset >= filesize) {
1086                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1087                              (int)f_offset, 0, 0, 0, 0);
1088                 return(0);
1089         }
1090         if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1091                 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1092         else
1093                 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1094
1095         if ((off_t)size > (filesize - f_offset))
1096                 size = filesize - f_offset;
1097         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1098
1099         advisory_read(vp, filesize, f_offset, size);
1100
1101         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1102                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1103
1104         return (pages_in_prefetch);
1105 }
1106
1107
1108
1109 static void
1110 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1111 {
1112         daddr64_t       r_addr;
1113         off_t           f_offset;
1114         int             size_of_prefetch;
1115
1116
1117         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1118                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1119
1120         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1121                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1122                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1123                 return;
1124         }
1125         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1126                                    (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1127                 rap->cl_ralen = 0;
1128                 rap->cl_maxra = 0;
1129
1130                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1131                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1132
1133                 return;
1134         }
1135         if (extent->e_addr < rap->cl_maxra) {
1136                 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1137
1138                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1139                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1140                         return;
1141                 }
1142         }
1143         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1144         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1145
1146         size_of_prefetch = 0;
1147
1148         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1149
1150         if (size_of_prefetch) {
1151                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1152                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1153                 return;
1154         }
1155         if (f_offset < filesize) {
1156                 daddr64_t read_size;
1157
1158                 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1159
1160                 read_size = (extent->e_addr + 1) - extent->b_addr;
1161
1162                 if (read_size > rap->cl_ralen) {
1163                         if (read_size > MAX_UPL_TRANSFER)
1164                                 rap->cl_ralen = MAX_UPL_TRANSFER;
1165                         else
1166                                 rap->cl_ralen = read_size;
1167                 }
1168                 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1169
1170                 if (size_of_prefetch)
1171                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1172         }
1173         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1174                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1175 }
1176
1177 int
1178 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1179                 int size, off_t filesize, int flags)
1180 {
1181         int           io_size;
1182         int           rounded_size;
1183         off_t         max_size;
1184         int           local_flags;
1185         struct cl_writebehind *wbp;
1186
1187         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1188                 /*
1189                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1190                  * then we don't want to enforce this throttle... if we do, we can
1191                  * potentially deadlock since we're stalling the pageout thread at a time
1192                  * when the disk image might need additional memory (which won't be available
1193                  * if the pageout thread can't run)... instead we'll just depend on the throttle
1194                  * that the pageout thread now has in place to deal with external files
1195                  */
1196                 local_flags = CL_PAGEOUT;
1197         else
1198                 local_flags = CL_PAGEOUT | CL_THROTTLE;
1199
1200         if ((flags & UPL_IOSYNC) == 0)
1201                 local_flags |= CL_ASYNC;
1202         if ((flags & UPL_NOCOMMIT) == 0)
1203                 local_flags |= CL_COMMIT;
1204         if ((flags & UPL_KEEPCACHED))
1205                 local_flags |= CL_KEEPCACHED;
1206
1207
1208         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1209                      (int)f_offset, size, (int)filesize, local_flags, 0);
1210
1211         /*
1212          * If they didn't specify any I/O, then we are done...
1213          * we can't issue an abort because we don't know how
1214          * big the upl really is
1215          */
1216         if (size <= 0)
1217                 return (EINVAL);
1218
1219         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1220                 if (local_flags & CL_COMMIT)
1221                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1222                 return (EROFS);
1223         }
1224         /*
1225          * can't page-in from a negative offset
1226          * or if we're starting beyond the EOF
1227          * or if the file offset isn't page aligned
1228          * or the size requested isn't a multiple of PAGE_SIZE
1229          */
1230         if (f_offset < 0 || f_offset >= filesize ||
1231            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1232                 if (local_flags & CL_COMMIT)
1233                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1234                 return (EINVAL);
1235         }
1236         max_size = filesize - f_offset;
1237
1238         if (size < max_size)
1239                 io_size = size;
1240         else
1241                 io_size = max_size;
1242
1243         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1244
1245         if (size > rounded_size) {
1246                 if (local_flags & CL_COMMIT)
1247                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1248                                         UPL_ABORT_FREE_ON_EMPTY);
1249         }
1250         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1251                 wbp->cl_hasbeenpaged = 1;
1252
1253         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1254                            local_flags, (buf_t)NULL, (struct clios *)NULL));
1255 }
1256
1257 int
1258 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1259                int size, off_t filesize, int flags)
1260 {
1261         u_int         io_size;
1262         int           rounded_size;
1263         off_t         max_size;
1264         int           retval;
1265         int           local_flags = 0;
1266
1267         if (upl == NULL || size < 0)
1268                 panic("cluster_pagein: NULL upl passed in");
1269
1270         if ((flags & UPL_IOSYNC) == 0)
1271                 local_flags |= CL_ASYNC;
1272         if ((flags & UPL_NOCOMMIT) == 0)
1273                 local_flags |= CL_COMMIT;
1274
1275
1276         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1277                      (int)f_offset, size, (int)filesize, local_flags, 0);
1278
1279         /*
1280          * can't page-in from a negative offset
1281          * or if we're starting beyond the EOF
1282          * or if the file offset isn't page aligned
1283          * or the size requested isn't a multiple of PAGE_SIZE
1284          */
1285         if (f_offset < 0 || f_offset >= filesize ||
1286            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1287                 if (local_flags & CL_COMMIT)
1288                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1289                 return (EINVAL);
1290         }
1291         max_size = filesize - f_offset;
1292
1293         if (size < max_size)
1294                 io_size = size;
1295         else
1296                 io_size = max_size;
1297
1298         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1299
1300         if (size > rounded_size && (local_flags & CL_COMMIT))
1301                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1302                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1303
1304         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1305                            local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1306
1307         if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1308                 struct cl_readahead *rap;
1309
1310                 rap = cluster_get_rap(vp);
1311
1312                 if (rap != NULL) {
1313                         struct cl_extent extent;
1314
1315                         extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1316                         extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1317
1318                         if (rounded_size == PAGE_SIZE) {
1319                                 /*
1320                                  * we haven't read the last page in of the file yet
1321                                  * so let's try to read ahead if we're in
1322                                  * a sequential access pattern
1323                                  */
1324                                 cluster_rd_ahead(vp, &extent, filesize, rap);
1325                         }
1326                         rap->cl_lastr = extent.e_addr;
1327
1328                         lck_mtx_unlock(&rap->cl_lockr);
1329                 }
1330         }
1331         return (retval);
1332 }
1333
1334 int
1335 cluster_bp(buf_t bp)
1336 {
1337         off_t  f_offset;
1338         int    flags;
1339
1340         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1341                      (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1342
1343         if (bp->b_flags & B_READ)
1344                 flags = CL_ASYNC | CL_READ;
1345         else
1346                 flags = CL_ASYNC;
1347
1348         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1349
1350         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1351 }
1352
1353 int
1354 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1355 {
1356         int           prev_resid;
1357         u_int         clip_size;
1358         off_t         max_io_size;
1359         int           upl_size;
1360         int           upl_flags;
1361         upl_t         upl;
1362         int           retval = 0;
1363         int           flags;
1364
1365         flags = xflags;
1366
1367         if (vp->v_flag & VNOCACHE_DATA)
1368                 flags |= IO_NOCACHE;
1369
1370         if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1371                 /*
1372                  * go do a write through the cache if one of the following is true....
1373                  *   NOCACHE is not true
1374                  *   there is no uio structure or it doesn't target USERSPACE
1375                  */
1376                 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1377         }
1378
1379 #if LP64_DEBUG
1380         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1381                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1382         }
1383 #endif /* LP64_DEBUG */
1384
1385         while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1386                 user_size_t     iov_len;
1387                 user_addr_t     iov_base;
1388
1389                 /*
1390                  * we know we have a resid, so this is safe
1391                  * skip over any emtpy vectors
1392                  */
1393                 uio_update(uio, (user_size_t)0);
1394
1395                 iov_len  = uio_curriovlen(uio);
1396                 iov_base = uio_curriovbase(uio);
1397
1398                 upl_size  = PAGE_SIZE;
1399                 upl_flags = UPL_QUERY_OBJECT_TYPE;
1400
1401                 // LP64todo - fix this!
1402                 if ((vm_map_get_upl(current_map(),
1403                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1404                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1405                         /*
1406                          * the user app must have passed in an invalid address
1407                          */
1408                         return (EFAULT);
1409                 }
1410
1411                 /*
1412                  * We check every vector target but if it is physically
1413                  * contiguous space, we skip the sanity checks.
1414                  */
1415                 if (upl_flags & UPL_PHYS_CONTIG) {
1416                         int zflags;
1417
1418                         zflags = flags & ~IO_TAILZEROFILL;
1419                         zflags |= IO_HEADZEROFILL;
1420
1421                         if (flags & IO_HEADZEROFILL) {
1422                                 /*
1423                                  * in case we have additional vectors, we don't want to do this again
1424                                  */
1425                                 flags &= ~IO_HEADZEROFILL;
1426
1427                                 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1428                                         return(retval);
1429                         }
1430                         retval = cluster_phys_write(vp, uio, newEOF);
1431
1432                         if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1433                                 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1434                         }
1435                 }
1436                 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1437                         /*
1438                          * we're here because we're don't have a physically contiguous target buffer
1439                          * go do a write through the cache if one of the following is true....
1440                          *   the total xfer size is less than a page...
1441                          *   we're being asked to ZEROFILL either the head or the tail of the I/O...
1442                          */
1443                         return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1444                 }
1445                 // LP64todo - fix this!
1446                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1447                         if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1448                                 /*
1449                                  * Bring the file offset write up to a pagesize boundary
1450                                  * this will also bring the base address to a page boundary
1451                                  * since they both are currently on the same offset within a page
1452                                  * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1453                                  * so the computed clip_size must always be less than the current uio_resid
1454                                  */
1455                                 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1456
1457                                 /*
1458                                  * Fake the resid going into the cluster_write_x call
1459                                  * and restore it on the way out.
1460                                  */
1461                                 // LP64todo - fix this
1462                                 prev_resid = uio_resid(uio);
1463                                 uio_setresid(uio, clip_size);
1464
1465                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1466
1467                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1468                         } else {
1469                                 /*
1470                                  * can't get both the file offset and the buffer offset aligned to a page boundary
1471                                  * so fire an I/O through the cache for this entire vector
1472                                  */
1473                                 // LP64todo - fix this
1474                                 clip_size = iov_len;
1475                                 // LP64todo - fix this
1476                                 prev_resid = uio_resid(uio);
1477                                 uio_setresid(uio, clip_size);
1478
1479                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1480
1481                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1482                         }
1483                 } else {
1484                         /*
1485                          * If we come in here, we know the offset into
1486                          * the file is on a pagesize boundary and the
1487                          * target buffer address is also on a page boundary
1488                          */
1489                         max_io_size = newEOF - uio->uio_offset;
1490                         // LP64todo - fix this
1491                         clip_size = uio_resid(uio);
1492                         if (iov_len < clip_size)
1493                                 // LP64todo - fix this!
1494                                 clip_size = iov_len;
1495                         if (max_io_size < clip_size)
1496                                 clip_size = max_io_size;
1497
1498                         if (clip_size < PAGE_SIZE) {
1499                                 /*
1500                                  * Take care of tail end of write in this vector
1501                                  */
1502                                 // LP64todo - fix this
1503                                 prev_resid = uio_resid(uio);
1504                                 uio_setresid(uio, clip_size);
1505
1506                                 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1507
1508                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1509                         } else {
1510                                 /* round clip_size down to a multiple of pagesize */
1511                                 clip_size = clip_size & ~(PAGE_MASK);
1512                                 // LP64todo - fix this
1513                                 prev_resid = uio_resid(uio);
1514                                 uio_setresid(uio, clip_size);
1515
1516                                 retval = cluster_nocopy_write(vp, uio, newEOF);
1517
1518                                 if ((retval == 0) && uio_resid(uio))
1519                                         retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1520
1521                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1522                         }
1523                 } /* end else */
1524         } /* end while */
1525
1526         return(retval);
1527 }
1528
1529
1530 static int
1531 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1532 {
1533         upl_t            upl;
1534         upl_page_info_t  *pl;
1535         vm_offset_t      upl_offset;
1536         int              io_size;
1537         int              io_flag;
1538         int              upl_size;
1539         int              upl_needed_size;
1540         int              pages_in_pl;
1541         int              upl_flags;
1542         kern_return_t    kret;
1543         int              i;
1544         int              force_data_sync;
1545         int              error  = 0;
1546         struct clios     iostate;
1547         struct cl_writebehind *wbp;
1548
1549
1550         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1551                      (int)uio->uio_offset, (int)uio_resid(uio),
1552                      (int)newEOF, 0, 0);
1553
1554         /*
1555          * When we enter this routine, we know
1556          *  -- the offset into the file is on a pagesize boundary
1557          *  -- the resid is a page multiple
1558          *  -- the resid will not exceed iov_len
1559          */
1560
1561         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1562
1563                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1564
1565                 lck_mtx_unlock(&wbp->cl_lockw);
1566         }
1567         iostate.io_completed = 0;
1568         iostate.io_issued = 0;
1569         iostate.io_error = 0;
1570         iostate.io_wanted = 0;
1571
1572         while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1573                 user_addr_t     iov_base;
1574
1575                 io_size = uio_resid(uio);
1576
1577                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1578                         io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1579
1580                 iov_base = uio_curriovbase(uio);
1581
1582                 // LP64todo - fix this!
1583                 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1584
1585                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1586
1587                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1588                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1589
1590                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1591                         pages_in_pl = 0;
1592                         upl_size = upl_needed_size;
1593                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1594                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1595
1596                         // LP64todo - fix this!
1597                         kret = vm_map_get_upl(current_map(),
1598                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1599                                               &upl_size,
1600                                               &upl,
1601                                               NULL,
1602                                               &pages_in_pl,
1603                                               &upl_flags,
1604                                               force_data_sync);
1605
1606                         if (kret != KERN_SUCCESS) {
1607                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1608                                              0, 0, 0, kret, 0);
1609                                 /*
1610                                  * cluster_nocopy_write: failed to get pagelist
1611                                  *
1612                                  * we may have already spun some portion of this request
1613                                  * off as async requests... we need to wait for the I/O
1614                                  * to complete before returning
1615                                  */
1616                                 goto wait_for_writes;
1617                         }
1618                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1619                         pages_in_pl = upl_size / PAGE_SIZE;
1620
1621                         for (i = 0; i < pages_in_pl; i++) {
1622                                 if (!upl_valid_page(pl, i))
1623                                         break;
1624                         }
1625                         if (i == pages_in_pl)
1626                                 break;
1627
1628                         /*
1629                          * didn't get all the pages back that we
1630                          * needed... release this upl and try again
1631                          */
1632                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1633                                             UPL_ABORT_FREE_ON_EMPTY);
1634                 }
1635                 if (force_data_sync >= 3) {
1636                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1637                                      i, pages_in_pl, upl_size, kret, 0);
1638                         /*
1639                          * for some reason, we couldn't acquire a hold on all
1640                          * the pages needed in the user's address space
1641                          *
1642                          * we may have already spun some portion of this request
1643                          * off as async requests... we need to wait for the I/O
1644                          * to complete before returning
1645                          */
1646                         goto wait_for_writes;
1647                 }
1648
1649                 /*
1650                  * Consider the possibility that upl_size wasn't satisfied.
1651                  */
1652                 if (upl_size != upl_needed_size)
1653                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1654
1655                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1656                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1657
1658                 if (io_size == 0) {
1659                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1660                                             UPL_ABORT_FREE_ON_EMPTY);
1661                         /*
1662                          * we may have already spun some portion of this request
1663                          * off as async requests... we need to wait for the I/O
1664                          * to complete before returning
1665                          */
1666                         goto wait_for_writes;
1667                 }
1668                 /*
1669                  * Now look for pages already in the cache
1670                  * and throw them away.
1671                  * uio->uio_offset is page aligned within the file
1672                  * io_size is a multiple of PAGE_SIZE
1673                  */
1674                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1675
1676                 /*
1677                  * we want push out these writes asynchronously so that we can overlap
1678                  * the preparation of the next I/O
1679                  * if there are already too many outstanding writes
1680                  * wait until some complete before issuing the next
1681                  */
1682                 lck_mtx_lock(cl_mtxp);
1683
1684                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1685                         iostate.io_wanted = 1;
1686                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1687                 }
1688                 lck_mtx_unlock(cl_mtxp);
1689
1690                 if (iostate.io_error) {
1691                         /*
1692                          * one of the earlier writes we issued ran into a hard error
1693                          * don't issue any more writes, cleanup the UPL
1694                          * that was just created but not used, then
1695                          * go wait for all writes that are part of this stream
1696                          * to complete before returning the error to the caller
1697                          */
1698                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1699                                             UPL_ABORT_FREE_ON_EMPTY);
1700
1701                         goto wait_for_writes;
1702                 }
1703                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1704
1705                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1706                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1707
1708                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1709                                    io_size, io_flag, (buf_t)NULL, &iostate);
1710
1711                 uio_update(uio, (user_size_t)io_size);
1712
1713                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1714                              (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1715
1716         } /* end while */
1717
1718 wait_for_writes:
1719         /*
1720          * make sure all async writes issued as part of this stream
1721          * have completed before we return
1722          */
1723         lck_mtx_lock(cl_mtxp);
1724
1725         while (iostate.io_issued != iostate.io_completed) {
1726                 iostate.io_wanted = 1;
1727                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1728         }
1729         lck_mtx_unlock(cl_mtxp);
1730
1731         if (iostate.io_error)
1732                 error = iostate.io_error;
1733
1734         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1735                      (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1736
1737         return (error);
1738 }
1739
1740
1741 static int
1742 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1743 {
1744         upl_page_info_t *pl;
1745         addr64_t         src_paddr;
1746         upl_t            upl;
1747         vm_offset_t      upl_offset;
1748         int              tail_size;
1749         int              io_size;
1750         int              upl_size;
1751         int              upl_needed_size;
1752         int              pages_in_pl;
1753         int              upl_flags;
1754         kern_return_t    kret;
1755         int              error  = 0;
1756         user_addr_t      iov_base;
1757         int              devblocksize;
1758         struct cl_writebehind *wbp;
1759
1760         devblocksize = vp->v_mount->mnt_devblocksize;
1761         /*
1762          * When we enter this routine, we know
1763          *  -- the resid will not exceed iov_len
1764          *  -- the vector target address is physcially contiguous
1765          */
1766         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1767
1768                 cluster_try_push(wbp, vp, newEOF, 0, 1);
1769
1770                 lck_mtx_unlock(&wbp->cl_lockw);
1771         }
1772 #if LP64_DEBUG
1773         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1774                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1775         }
1776 #endif /* LP64_DEBUG */
1777
1778         // LP64todo - fix this!
1779         io_size = (int)uio_curriovlen(uio);
1780         iov_base = uio_curriovbase(uio);
1781
1782         upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1783         upl_needed_size = upl_offset + io_size;
1784
1785         pages_in_pl = 0;
1786         upl_size = upl_needed_size;
1787         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1788                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1789
1790         // LP64todo - fix this!
1791         kret = vm_map_get_upl(current_map(),
1792                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1793                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1794
1795         if (kret != KERN_SUCCESS) {
1796                 /*
1797                  * cluster_phys_write: failed to get pagelist
1798                  * note: return kret here
1799                  */
1800               return(EINVAL);
1801         }
1802         /*
1803          * Consider the possibility that upl_size wasn't satisfied.
1804          * This is a failure in the physical memory case.
1805          */
1806         if (upl_size < upl_needed_size) {
1807                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1808                 return(EINVAL);
1809         }
1810         pl = ubc_upl_pageinfo(upl);
1811
1812         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1813
1814         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1815                 int   head_size;
1816
1817                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1818
1819                 if (head_size > io_size)
1820                         head_size = io_size;
1821
1822                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1823
1824                 if (error) {
1825                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1826
1827                         return(EINVAL);
1828                 }
1829                 upl_offset += head_size;
1830                 src_paddr  += head_size;
1831                 io_size    -= head_size;
1832         }
1833         tail_size = io_size & (devblocksize - 1);
1834         io_size  -= tail_size;
1835
1836         if (io_size) {
1837                 /*
1838                  * issue a synchronous write to cluster_io
1839                  */
1840                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1841                                    io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1842         }
1843         if (error == 0) {
1844                 /*
1845                  * The cluster_io write completed successfully,
1846                  * update the uio structure
1847                  */
1848                 uio_update(uio, (user_size_t)io_size);
1849
1850                 src_paddr += io_size;
1851
1852                 if (tail_size)
1853                         error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1854         }
1855         /*
1856          * just release our hold on the physically contiguous
1857          * region without changing any state
1858          */
1859         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1860
1861         return (error);
1862 }
1863
1864
1865 static int
1866 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1867 {
1868         upl_page_info_t *pl;
1869         upl_t            upl;
1870         vm_offset_t      upl_offset = 0;
1871         int              upl_size;
1872         off_t            upl_f_offset;
1873         int              pages_in_upl;
1874         int              start_offset;
1875         int              xfer_resid;
1876         int              io_size;
1877         int              io_offset;
1878         int              bytes_to_zero;
1879         int              bytes_to_move;
1880         kern_return_t    kret;
1881         int              retval = 0;
1882         int              io_resid;
1883         long long        total_size;
1884         long long        zero_cnt;
1885         off_t            zero_off;
1886         long long        zero_cnt1;
1887         off_t            zero_off1;
1888         struct cl_extent cl;
1889         int              intersection;
1890         struct cl_writebehind *wbp;
1891
1892         if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1893         {
1894                 if (wbp->cl_hasbeenpaged) {
1895                         /*
1896                          * this vnode had pages cleaned to it by
1897                          * the pager which indicates that either
1898                          * it's not very 'hot', or the system is
1899                          * being overwhelmed by a lot of dirty
1900                          * data being delayed in the VM cache...
1901                          * in either event, we'll push our remaining
1902                          * delayed data at this point...  this will
1903                          * be more efficient than paging out 1 page at
1904                          * a time, and will also act as a throttle
1905                          * by delaying this client from writing any
1906                          * more data until all his delayed data has
1907                          * at least been queued to the uderlying driver.
1908                          */
1909                         if (wbp->cl_number || wbp->cl_scmap)
1910                                 cluster_push_EOF(vp, newEOF);
1911
1912                         wbp->cl_hasbeenpaged = 0;
1913                 }
1914         }
1915         if (uio) {
1916                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1917                              (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1918
1919                 // LP64todo - fix this
1920                 io_resid = uio_resid(uio);
1921         } else {
1922                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1923                              0, 0, (int)oldEOF, (int)newEOF, 0);
1924
1925                 io_resid = 0;
1926         }
1927         zero_cnt  = 0;
1928         zero_cnt1 = 0;
1929         zero_off  = 0;
1930         zero_off1 = 0;
1931
1932         if (flags & IO_HEADZEROFILL) {
1933                 /*
1934                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1935                  * so we zero fill the intervening space between the old EOF and the offset
1936                  * where the next chunk of real data begins.... ftruncate will also use this
1937                  * routine to zero fill to the new EOF when growing a file... in this case, the
1938                  * uio structure will not be provided
1939                  */
1940                 if (uio) {
1941                         if (headOff < uio->uio_offset) {
1942                                 zero_cnt = uio->uio_offset - headOff;
1943                                 zero_off = headOff;
1944                         }
1945                 } else if (headOff < newEOF) {
1946                         zero_cnt = newEOF - headOff;
1947                         zero_off = headOff;
1948                 }
1949         }
1950         if (flags & IO_TAILZEROFILL) {
1951                 if (uio) {
1952                         // LP64todo - fix this
1953                         zero_off1 = uio->uio_offset + uio_resid(uio);
1954
1955                         if (zero_off1 < tailOff)
1956                                 zero_cnt1 = tailOff - zero_off1;
1957                 }
1958         }
1959         if (zero_cnt == 0 && uio == (struct uio *) 0) {
1960                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1961                              retval, 0, 0, 0, 0);
1962                 return (0);
1963         }
1964
1965         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1966                 /*
1967                  * for this iteration of the loop, figure out where our starting point is
1968                  */
1969                 if (zero_cnt) {
1970                         start_offset = (int)(zero_off & PAGE_MASK_64);
1971                         upl_f_offset = zero_off - start_offset;
1972                 } else if (io_resid) {
1973                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1974                         upl_f_offset = uio->uio_offset - start_offset;
1975                 } else {
1976                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1977                         upl_f_offset = zero_off1 - start_offset;
1978                 }
1979                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1980                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1981
1982                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1983                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1984
1985                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1986
1987                 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1988                         /*
1989                          * assumption... total_size <= io_resid
1990                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1991                          */
1992                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1993                                 total_size -= start_offset;
1994                         xfer_resid = total_size;
1995
1996                         retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1997
1998                         if (retval)
1999                                 break;
2000
2001                         io_resid   -= (total_size - xfer_resid);
2002                         total_size   = xfer_resid;
2003                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2004                         upl_f_offset = uio->uio_offset - start_offset;
2005
2006                         if (total_size == 0) {
2007                                 if (start_offset) {
2008                                         /*
2009                                          * the write did not finish on a page boundary
2010                                          * which will leave upl_f_offset pointing to the
2011                                          * beginning of the last page written instead of
2012                                          * the page beyond it... bump it in this case
2013                                          * so that the cluster code records the last page
2014                                          * written as dirty
2015                                          */
2016                                         upl_f_offset += PAGE_SIZE_64;
2017                                 }
2018                                 upl_size = 0;
2019
2020                                 goto check_cluster;
2021                         }
2022                 }
2023                 /*
2024                  * compute the size of the upl needed to encompass
2025                  * the requested write... limit each call to cluster_io
2026                  * to the maximum UPL size... cluster_io will clip if
2027                  * this exceeds the maximum io_size for the device,
2028                  * make sure to account for
2029                  * a starting offset that's not page aligned
2030                  */
2031                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2032
2033                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2034                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2035
2036                 pages_in_upl = upl_size / PAGE_SIZE;
2037                 io_size      = upl_size - start_offset;
2038
2039                 if ((long long)io_size > total_size)
2040                         io_size = total_size;
2041
2042                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2043
2044
2045                 /*
2046                  * Gather the pages from the buffer cache.
2047                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2048                  * that we intend to modify these pages.
2049                  */
2050                 kret = ubc_create_upl(vp,
2051                                       upl_f_offset,
2052                                       upl_size,
2053                                       &upl,
2054                                       &pl,
2055                                       UPL_SET_LITE | UPL_WILL_MODIFY);
2056                 if (kret != KERN_SUCCESS)
2057                         panic("cluster_write: failed to get pagelist");
2058
2059                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2060                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2061
2062                 if (start_offset && !upl_valid_page(pl, 0)) {
2063                         int   read_size;
2064
2065                         /*
2066                          * we're starting in the middle of the first page of the upl
2067                          * and the page isn't currently valid, so we're going to have
2068                          * to read it in first... this is a synchronous operation
2069                          */
2070                         read_size = PAGE_SIZE;
2071
2072                         if ((upl_f_offset + read_size) > newEOF)
2073                                 read_size = newEOF - upl_f_offset;
2074
2075                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2076                                             CL_READ, (buf_t)NULL, (struct clios *)NULL);
2077                         if (retval) {
2078                                 /*
2079                                  * we had an error during the read which causes us to abort
2080                                  * the current cluster_write request... before we do, we need
2081                                  * to release the rest of the pages in the upl without modifying
2082                                  * there state and mark the failed page in error
2083                                  */
2084                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2085
2086                                 if (upl_size > PAGE_SIZE)
2087                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2088
2089                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2090                                              (int)upl, 0, 0, retval, 0);
2091                                 break;
2092                         }
2093                 }
2094                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2095                         /*
2096                          * the last offset we're writing to in this upl does not end on a page
2097                          * boundary... if it's not beyond the old EOF, then we'll also need to
2098                          * pre-read this page in if it isn't already valid
2099                          */
2100                         upl_offset = upl_size - PAGE_SIZE;
2101
2102                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2103                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2104                                 int   read_size;
2105
2106                                 read_size = PAGE_SIZE;
2107
2108                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2109                                         read_size = newEOF - (upl_f_offset + upl_offset);
2110
2111                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2112                                                     CL_READ, (buf_t)NULL, (struct clios *)NULL);
2113                                 if (retval) {
2114                                         /*
2115                                          * we had an error during the read which causes us to abort
2116                                          * the current cluster_write request... before we do, we
2117                                          * need to release the rest of the pages in the upl without
2118                                          * modifying there state and mark the failed page in error
2119                                          */
2120                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2121
2122                                         if (upl_size > PAGE_SIZE)
2123                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2124
2125                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2126                                                      (int)upl, 0, 0, retval, 0);
2127                                         break;
2128                                 }
2129                         }
2130                 }
2131                 xfer_resid = io_size;
2132                 io_offset = start_offset;
2133
2134                 while (zero_cnt && xfer_resid) {
2135
2136                         if (zero_cnt < (long long)xfer_resid)
2137                                 bytes_to_zero = zero_cnt;
2138                         else
2139                                 bytes_to_zero = xfer_resid;
2140
2141                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2142                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2143                         } else {
2144                                 int zero_pg_index;
2145
2146                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2147                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2148
2149                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2150                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2151
2152                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2153                                            !upl_dirty_page(pl, zero_pg_index)) {
2154                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2155                                 }
2156                         }
2157                         xfer_resid -= bytes_to_zero;
2158                         zero_cnt   -= bytes_to_zero;
2159                         zero_off   += bytes_to_zero;
2160                         io_offset  += bytes_to_zero;
2161                 }
2162                 if (xfer_resid && io_resid) {
2163                         bytes_to_move = min(io_resid, xfer_resid);
2164
2165                         retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2166
2167                         if (retval) {
2168
2169                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2170
2171                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2172                                              (int)upl, 0, 0, retval, 0);
2173                         } else {
2174                                 io_resid  -= bytes_to_move;
2175                                 xfer_resid -= bytes_to_move;
2176                                 io_offset  += bytes_to_move;
2177                         }
2178                 }
2179                 while (xfer_resid && zero_cnt1 && retval == 0) {
2180
2181                         if (zero_cnt1 < (long long)xfer_resid)
2182                                 bytes_to_zero = zero_cnt1;
2183                         else
2184                                 bytes_to_zero = xfer_resid;
2185
2186                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2187                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2188                         } else {
2189                                 int zero_pg_index;
2190
2191                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2192                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2193
2194                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2195                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2196                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2197                                            !upl_dirty_page(pl, zero_pg_index)) {
2198                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2199                                 }
2200                         }
2201                         xfer_resid -= bytes_to_zero;
2202                         zero_cnt1  -= bytes_to_zero;
2203                         zero_off1  += bytes_to_zero;
2204                         io_offset  += bytes_to_zero;
2205                 }
2206
2207                 if (retval == 0) {
2208                         int cl_index;
2209                         int can_delay;
2210
2211                         io_size += start_offset;
2212
2213                         if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2214                                 /*
2215                                  * if we're extending the file with this write
2216                                  * we'll zero fill the rest of the page so that
2217                                  * if the file gets extended again in such a way as to leave a
2218                                  * hole starting at this EOF, we'll have zero's in the correct spot
2219                                  */
2220                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2221                         }
2222                         if (flags & IO_SYNC)
2223                                 /*
2224                                  * if the IO_SYNC flag is set than we need to
2225                                  * bypass any clusters and immediately issue
2226                                  * the I/O
2227                                  */
2228                                 goto issue_io;
2229 check_cluster:
2230                         /*
2231                          * take the lock to protect our accesses
2232                          * of the writebehind and sparse cluster state
2233                          */
2234                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2235
2236                         /*
2237                          * calculate the last logical block number
2238                          * that this delayed I/O encompassed
2239                          */
2240                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2241
2242                         if (wbp->cl_scmap) {
2243
2244                                 if ( !(flags & IO_NOCACHE)) {
2245                                         /*
2246                                          * we've fallen into the sparse
2247                                          * cluster method of delaying dirty pages
2248                                          * first, we need to release the upl if we hold one
2249                                          * since pages in it may be present in the sparse cluster map
2250                                          * and may span 2 separate buckets there... if they do and
2251                                          * we happen to have to flush a bucket to make room and it intersects
2252                                          * this upl, a deadlock may result on page BUSY
2253                                          */
2254                                         if (upl_size)
2255                                                 ubc_upl_commit_range(upl, 0, upl_size,
2256                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2257
2258                                         sparse_cluster_add(wbp, vp, &cl, newEOF);
2259
2260                                         lck_mtx_unlock(&wbp->cl_lockw);
2261
2262                                         continue;
2263                                 }
2264                                 /*
2265                                  * must have done cached writes that fell into
2266                                  * the sparse cluster mechanism... we've switched
2267                                  * to uncached writes on the file, so go ahead
2268                                  * and push whatever's in the sparse map
2269                                  * and switch back to normal clustering
2270                                  *
2271                                  * see the comment above concerning a possible deadlock...
2272                                  */
2273                                 if (upl_size) {
2274                                         ubc_upl_commit_range(upl, 0, upl_size,
2275                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2276                                         /*
2277                                          * setting upl_size to 0 keeps us from committing a
2278                                          * second time in the start_new_cluster path
2279                                          */
2280                                         upl_size = 0;
2281                                 }
2282                                 sparse_cluster_push(wbp, vp, newEOF, 1);
2283
2284                                 wbp->cl_number = 0;
2285                                 /*
2286                                  * no clusters of either type present at this point
2287                                  * so just go directly to start_new_cluster since
2288                                  * we know we need to delay this I/O since we've
2289                                  * already released the pages back into the cache
2290                                  * to avoid the deadlock with sparse_cluster_push
2291                                  */
2292                                 goto start_new_cluster;
2293                         }
2294                         upl_offset = 0;
2295
2296                         if (wbp->cl_number == 0)
2297                                 /*
2298                                  * no clusters currently present
2299                                  */
2300                                 goto start_new_cluster;
2301
2302                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2303                                 /*
2304                                  * check each cluster that we currently hold
2305                                  * try to merge some or all of this write into
2306                                  * one or more of the existing clusters... if
2307                                  * any portion of the write remains, start a
2308                                  * new cluster
2309                                  */
2310                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2311                                         /*
2312                                          * the current write starts at or after the current cluster
2313                                          */
2314                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2315                                                 /*
2316                                                  * we have a write that fits entirely
2317                                                  * within the existing cluster limits
2318                                                  */
2319                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2320                                                         /*
2321                                                          * update our idea of where the cluster ends
2322                                                          */
2323                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2324                                                 break;
2325                                         }
2326                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2327                                                 /*
2328                                                  * we have a write that starts in the middle of the current cluster
2329                                                  * but extends beyond the cluster's limit... we know this because
2330                                                  * of the previous checks
2331                                                  * we'll extend the current cluster to the max
2332                                                  * and update the b_addr for the current write to reflect that
2333                                                  * the head of it was absorbed into this cluster...
2334                                                  * note that we'll always have a leftover tail in this case since
2335                                                  * full absorbtion would have occurred in the clause above
2336                                                  */
2337                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2338
2339                                                 if (upl_size) {
2340                                                         daddr64_t start_pg_in_upl;
2341
2342                                                         start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2343
2344                                                         if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2345                                                                 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2346
2347                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2348                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2349                                                                 upl_f_offset += intersection;
2350                                                                 upl_offset   += intersection;
2351                                                                 upl_size     -= intersection;
2352                                                         }
2353                                                 }
2354                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2355                                         }
2356                                         /*
2357                                          * we come here for the case where the current write starts
2358                                          * beyond the limit of the existing cluster or we have a leftover
2359                                          * tail after a partial absorbtion
2360                                          *
2361                                          * in either case, we'll check the remaining clusters before
2362                                          * starting a new one
2363                                          */
2364                                 } else {
2365                                         /*
2366                                          * the current write starts in front of the cluster we're currently considering
2367                                          */
2368                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2369                                                 /*
2370                                                  * we can just merge the new request into
2371                                                  * this cluster and leave it in the cache
2372                                                  * since the resulting cluster is still
2373                                                  * less than the maximum allowable size
2374                                                  */
2375                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2376
2377                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2378                                                         /*
2379                                                          * the current write completely
2380                                                          * envelops the existing cluster and since
2381                                                          * each write is limited to at most MAX_UPL_TRANSFER bytes
2382                                                          * we can just use the start and last blocknos of the write
2383                                                          * to generate the cluster limits
2384                                                          */
2385                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2386                                                 }
2387                                                 break;
2388                                         }
2389
2390                                         /*
2391                                          * if we were to combine this write with the current cluster
2392                                          * we would exceed the cluster size limit.... so,
2393                                          * let's see if there's any overlap of the new I/O with
2394                                          * the cluster we're currently considering... in fact, we'll
2395                                          * stretch the cluster out to it's full limit and see if we
2396                                          * get an intersection with the current write
2397                                          *
2398                                          */
2399                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2400                                                 /*
2401                                                  * the current write extends into the proposed cluster
2402                                                  * clip the length of the current write after first combining it's
2403                                                  * tail with the newly shaped cluster
2404                                                  */
2405                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2406
2407                                                 if (upl_size) {
2408                                                         intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2409
2410                                                         if (intersection > upl_size)
2411                                                                 /*
2412                                                                  * because the current write may consist of a number of pages found in the cache
2413                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2414                                                                  * the size of the UPL that is also part of this write
2415                                                                  */
2416                                                                 intersection = upl_size;
2417
2418                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2419                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2420                                                         upl_size -= intersection;
2421                                                 }
2422                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2423                                         }
2424                                         /*
2425                                          * if we get here, there was no way to merge
2426                                          * any portion of this write with this cluster
2427                                          * or we could only merge part of it which
2428                                          * will leave a tail...
2429                                          * we'll check the remaining clusters before starting a new one
2430                                          */
2431                                 }
2432                         }
2433                         if (cl_index < wbp->cl_number)
2434                                 /*
2435                                  * we found an existing cluster(s) that we
2436                                  * could entirely merge this I/O into
2437                                  */
2438                                 goto delay_io;
2439
2440                         if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2441                                 /*
2442                                  * we didn't find an existing cluster to
2443                                  * merge into, but there's room to start
2444                                  * a new one
2445                                  */
2446                                 goto start_new_cluster;
2447
2448                         /*
2449                          * no exisitng cluster to merge with and no
2450                          * room to start a new one... we'll try
2451                          * pushing one of the existing ones... if none of
2452                          * them are able to be pushed, we'll switch
2453                          * to the sparse cluster mechanism
2454                          * cluster_try_push updates cl_number to the
2455                          * number of remaining clusters... and
2456                          * returns the number of currently unused clusters
2457                          */
2458                         int ret_cluster_try_push = 0;
2459                         /* if writes are not deferred, call cluster push immediately */
2460                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2461                                 if (flags & IO_NOCACHE)
2462                                         can_delay = 0;
2463                                 else
2464                                         can_delay = 1;
2465
2466                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2467                         }
2468
2469                         /* execute following regardless writes are deferred or not */
2470                         if (ret_cluster_try_push == 0) {
2471                                 /*
2472                                  * no more room in the normal cluster mechanism
2473                                  * so let's switch to the more expansive but expensive
2474                                  * sparse mechanism....
2475                                  * first, we need to release the upl if we hold one
2476                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2477                                  * and may span 2 separate buckets there... if they do and
2478                                  * we happen to have to flush a bucket to make room and it intersects
2479                                  * this upl, a deadlock may result on page BUSY
2480                                  */
2481                                 if (upl_size)
2482                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2483                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2484
2485                                 sparse_cluster_switch(wbp, vp, newEOF);
2486                                 sparse_cluster_add(wbp, vp, &cl, newEOF);
2487
2488                                 lck_mtx_unlock(&wbp->cl_lockw);
2489
2490                                 continue;
2491                         }
2492                         /*
2493                          * we pushed one cluster successfully, so we must be sequentially writing this file
2494                          * otherwise, we would have failed and fallen into the sparse cluster support
2495                          * so let's take the opportunity to push out additional clusters as long as we
2496                          * remain below the throttle... this will give us better I/O locality if we're
2497                          * in a copy loop (i.e.  we won't jump back and forth between the read and write points
2498                          * however, we don't want to push so much out that the write throttle kicks in and
2499                          * hangs this thread up until some of the I/O completes...
2500                          */
2501                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2502                                 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2503                                         cluster_try_push(wbp, vp, newEOF, 0, 0);
2504                         }
2505
2506 start_new_cluster:
2507                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2508                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2509
2510                         if (flags & IO_NOCACHE)
2511                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2512                         else
2513                                 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2514                         wbp->cl_number++;
2515 delay_io:
2516                         if (upl_size)
2517                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2518                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2519
2520                         lck_mtx_unlock(&wbp->cl_lockw);
2521
2522                         continue;
2523 issue_io:
2524                         /*
2525                          * we don't hold the vnode lock at this point
2526                          *
2527                          * because we had to ask for a UPL that provides currenty non-present pages, the
2528                          * UPL has been automatically set to clear the dirty flags (both software and hardware)
2529                          * upon committing it... this is not the behavior we want since it's possible for
2530                          * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2531                          * in order to maintain some semblance of coherency with mapped writes
2532                          * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2533                          * so that we correctly deal with a change in state of the hardware modify bit...
2534                          * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2535                          * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2536                          * responsible for generating the correct sized I/O(s)
2537                          */
2538                         ubc_upl_commit_range(upl, 0, upl_size,
2539                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2540
2541                         cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2542
2543                         retval = cluster_push_x(vp, &cl, newEOF, flags);
2544                 }
2545         }
2546         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2547                      retval, 0, io_resid, 0, 0);
2548
2549         return (retval);
2550 }
2551
2552 int
2553 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2554 {
2555         int           prev_resid;
2556         u_int         clip_size;
2557         off_t         max_io_size;
2558         int           upl_size;
2559         int           upl_flags;
2560         upl_t         upl;
2561         int           retval = 0;
2562         int           flags;
2563
2564         flags = xflags;
2565
2566         if (vp->v_flag & VNOCACHE_DATA)
2567                 flags |= IO_NOCACHE;
2568         if (vp->v_flag & VRAOFF)
2569                 flags |= IO_RAOFF;
2570
2571         if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2572                 /*
2573                  * go do a read through the cache if one of the following is true....
2574                  *   NOCACHE is not true
2575                  *   the uio request doesn't target USERSPACE
2576                  */
2577                 return (cluster_read_x(vp, uio, filesize, flags));
2578         }
2579
2580 #if LP64_DEBUG
2581         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2582                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2583         }
2584 #endif /* LP64_DEBUG */
2585
2586         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2587                 user_size_t     iov_len;
2588                 user_addr_t     iov_base;
2589
2590                 /*
2591                  * we know we have a resid, so this is safe
2592                  * skip over any emtpy vectors
2593                  */
2594                 uio_update(uio, (user_size_t)0);
2595
2596                 iov_len  = uio_curriovlen(uio);
2597                 iov_base = uio_curriovbase(uio);
2598
2599                 upl_size  = PAGE_SIZE;
2600                 upl_flags = UPL_QUERY_OBJECT_TYPE;
2601
2602                 // LP64todo - fix this!
2603                 if ((vm_map_get_upl(current_map(),
2604                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2605                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2606                         /*
2607                          * the user app must have passed in an invalid address
2608                          */
2609                         return (EFAULT);
2610                 }
2611
2612                 /*
2613                  * We check every vector target but if it is physically
2614                  * contiguous space, we skip the sanity checks.
2615                  */
2616                 if (upl_flags & UPL_PHYS_CONTIG) {
2617                         retval = cluster_phys_read(vp, uio, filesize);
2618                 }
2619                 else if (uio_resid(uio) < PAGE_SIZE) {
2620                         /*
2621                          * we're here because we're don't have a physically contiguous target buffer
2622                          * go do a read through the cache if
2623                          *   the total xfer size is less than a page...
2624                          */
2625                         return (cluster_read_x(vp, uio, filesize, flags));
2626                 }
2627                 // LP64todo - fix this!
2628                 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2629                        if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2630                                /*
2631                                 * Bring the file offset read up to a pagesize boundary
2632                                 * this will also bring the base address to a page boundary
2633                                 * since they both are currently on the same offset within a page
2634                                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2635                                 * so the computed clip_size must always be less than the current uio_resid
2636                                 */
2637                                clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2638
2639                                /*
2640                                 * Fake the resid going into the cluster_read_x call
2641                                 * and restore it on the way out.
2642                                 */
2643                                prev_resid = uio_resid(uio);
2644                                // LP64todo - fix this
2645                                uio_setresid(uio, clip_size);
2646
2647                                retval = cluster_read_x(vp, uio, filesize, flags);
2648
2649                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2650                        } else {
2651                                /*
2652                                 * can't get both the file offset and the buffer offset aligned to a page boundary
2653                                 * so fire an I/O through the cache for this entire vector
2654                                 */
2655                                // LP64todo - fix this!
2656                                clip_size = iov_len;
2657                                prev_resid = uio_resid(uio);
2658                                uio_setresid(uio, clip_size);
2659
2660                                retval = cluster_read_x(vp, uio, filesize, flags);
2661
2662                                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2663                        }
2664                 } else {
2665                         /*
2666                          * If we come in here, we know the offset into
2667                          * the file is on a pagesize boundary
2668                          */
2669                         max_io_size = filesize - uio->uio_offset;
2670                         // LP64todo - fix this
2671                         clip_size = uio_resid(uio);
2672                         if (iov_len < clip_size)
2673                                 clip_size = iov_len;
2674                         if (max_io_size < clip_size)
2675                                 clip_size = (int)max_io_size;
2676
2677                         if (clip_size < PAGE_SIZE) {
2678                                 /*
2679                                  * Take care of the tail end of the read in this vector.
2680                                  */
2681                                 // LP64todo - fix this
2682                                 prev_resid = uio_resid(uio);
2683                                 uio_setresid(uio, clip_size);
2684
2685                                 retval = cluster_read_x(vp, uio, filesize, flags);
2686
2687                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2688                         } else {
2689                                 /* round clip_size down to a multiple of pagesize */
2690                                 clip_size = clip_size & ~(PAGE_MASK);
2691                                 // LP64todo - fix this
2692                                 prev_resid = uio_resid(uio);
2693                                 uio_setresid(uio, clip_size);
2694
2695                                 retval = cluster_nocopy_read(vp, uio, filesize);
2696
2697                                 if ((retval==0) && uio_resid(uio))
2698                                         retval = cluster_read_x(vp, uio, filesize, flags);
2699
2700                                 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2701                         }
2702                 } /* end else */
2703         } /* end while */
2704
2705         return(retval);
2706 }
2707
2708 static int
2709 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2710 {
2711         upl_page_info_t *pl;
2712         upl_t            upl;
2713         vm_offset_t      upl_offset;
2714         int              upl_size;
2715         off_t            upl_f_offset;
2716         int              start_offset;
2717         int              start_pg;
2718         int              last_pg;
2719         int              uio_last = 0;
2720         int              pages_in_upl;
2721         off_t            max_size;
2722         off_t            last_ioread_offset;
2723         off_t            last_request_offset;
2724         u_int            size_of_prefetch;
2725         u_int            io_size;
2726         kern_return_t    kret;
2727         int              error  = 0;
2728         int              retval = 0;
2729         u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2730         u_int            rd_ahead_enabled = 1;
2731         u_int            prefetch_enabled = 1;
2732         struct cl_readahead *   rap;
2733         struct clios            iostate;
2734         struct cl_extent        extent;
2735
2736         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2737                      (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2738
2739         // LP64todo - fix this
2740         last_request_offset = uio->uio_offset + uio_resid(uio);
2741
2742         if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2743                 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2744                 rd_ahead_enabled = 0;
2745                 rap = NULL;
2746         } else {
2747                 if (cluster_hard_throttle_on(vp)) {
2748                         rd_ahead_enabled = 0;
2749                         prefetch_enabled = 0;
2750
2751                         max_rd_size = HARD_THROTTLE_MAXSIZE;
2752                 }
2753                 if ((rap = cluster_get_rap(vp)) == NULL)
2754                         rd_ahead_enabled = 0;
2755         }
2756         if (last_request_offset > filesize)
2757                 last_request_offset = filesize;
2758         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2759         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2760
2761         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2762                 /*
2763                  * determine if we already have a read-ahead in the pipe courtesy of the
2764                  * last read systemcall that was issued...
2765                  * if so, pick up it's extent to determine where we should start
2766                  * with respect to any read-ahead that might be necessary to
2767                  * garner all the data needed to complete this read systemcall
2768                  */
2769                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2770
2771                 if (last_ioread_offset < uio->uio_offset)
2772                         last_ioread_offset = (off_t)0;
2773                 else if (last_ioread_offset > last_request_offset)
2774                         last_ioread_offset = last_request_offset;
2775         } else
2776                 last_ioread_offset = (off_t)0;
2777
2778         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2779                 /*
2780                  * compute the size of the upl needed to encompass
2781                  * the requested read... limit each call to cluster_io
2782                  * to the maximum UPL size... cluster_io will clip if
2783                  * this exceeds the maximum io_size for the device,
2784                  * make sure to account for
2785                  * a starting offset that's not page aligned
2786                  */
2787                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2788                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2789                 max_size     = filesize - uio->uio_offset;
2790
2791         // LP64todo - fix this!
2792                 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2793                         io_size = uio_resid(uio);
2794                 else
2795                         io_size = max_size;
2796
2797                 if (!(flags & IO_NOCACHE)) {
2798
2799                         while (io_size) {
2800                                 u_int io_resid;
2801                                 u_int io_requested;
2802
2803                                 /*
2804                                  * if we keep finding the pages we need already in the cache, then
2805                                  * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2806                                  * to determine that we have all the pages we need... once we miss in
2807                                  * the cache and have issued an I/O, than we'll assume that we're likely
2808                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
2809                                  */
2810                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2811                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2812                                                 /*
2813                                                  * we've already issued I/O for this request and
2814                                                  * there's still work to do and
2815                                                  * our prefetch stream is running dry, so issue a
2816                                                  * pre-fetch I/O... the I/O latency will overlap
2817                                                  * with the copying of the data
2818                                                  */
2819                                                 if (size_of_prefetch > max_rd_size)
2820                                                         size_of_prefetch = max_rd_size;
2821
2822                                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2823
2824                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2825
2826                                                 if (last_ioread_offset > last_request_offset)
2827                                                         last_ioread_offset = last_request_offset;
2828                                         }
2829                                 }
2830                                 /*
2831                                  * limit the size of the copy we're about to do so that
2832                                  * we can notice that our I/O pipe is running dry and
2833                                  * get the next I/O issued before it does go dry
2834                                  */
2835                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2836                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2837                                 else
2838                                         io_resid = io_size;
2839
2840                                 io_requested = io_resid;
2841
2842                                 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2843
2844                                 io_size -= (io_requested - io_resid);
2845
2846                                 if (retval || io_resid)
2847                                         /*
2848                                          * if we run into a real error or
2849                                          * a page that is not in the cache
2850                                          * we need to leave streaming mode
2851                                          */
2852                                         break;
2853
2854                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2855                                         /*
2856                                          * we're already finished the I/O for this read request
2857                                          * let's see if we should do a read-ahead
2858                                          */
2859                                         cluster_rd_ahead(vp, &extent, filesize, rap);
2860                                 }
2861                         }
2862                         if (retval)
2863                                 break;
2864                         if (io_size == 0) {
2865                                 if (rap != NULL) {
2866                                         if (extent.e_addr < rap->cl_lastr)
2867                                                 rap->cl_maxra = 0;
2868                                         rap->cl_lastr = extent.e_addr;
2869                                 }
2870                                 break;
2871                         }
2872                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2873                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
2874                         max_size     = filesize - uio->uio_offset;
2875                 }
2876                 if (io_size > max_rd_size)
2877                         io_size = max_rd_size;
2878
2879                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2880
2881                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2882                         upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2883                 pages_in_upl = upl_size / PAGE_SIZE;
2884
2885                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2886                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2887
2888                 kret = ubc_create_upl(vp,
2889                                       upl_f_offset,
2890                                       upl_size,
2891                                       &upl,
2892                                       &pl,
2893                                       UPL_SET_LITE);
2894                 if (kret != KERN_SUCCESS)
2895                         panic("cluster_read: failed to get pagelist");
2896
2897                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2898                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2899
2900                 /*
2901                  * scan from the beginning of the upl looking for the first
2902                  * non-valid page.... this will become the first page in
2903                  * the request we're going to make to 'cluster_io'... if all
2904                  * of the pages are valid, we won't call through to 'cluster_io'
2905                  */
2906                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2907                         if (!upl_valid_page(pl, start_pg))
2908                                 break;
2909                 }
2910
2911                 /*
2912                  * scan from the starting invalid page looking for a valid
2913                  * page before the end of the upl is reached, if we
2914                  * find one, then it will be the last page of the request to
2915                  * 'cluster_io'
2916                  */
2917                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2918                         if (upl_valid_page(pl, last_pg))
2919                                 break;
2920                 }
2921                 iostate.io_completed = 0;
2922                 iostate.io_issued = 0;
2923                 iostate.io_error = 0;
2924                 iostate.io_wanted = 0;
2925
2926                 if (start_pg < last_pg) {
2927                         /*
2928                          * we found a range of 'invalid' pages that must be filled
2929                          * if the last page in this range is the last page of the file
2930                          * we may have to clip the size of it to keep from reading past
2931                          * the end of the last physical block associated with the file
2932                          */
2933                         upl_offset = start_pg * PAGE_SIZE;
2934                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2935
2936                         if ((upl_f_offset + upl_offset + io_size) > filesize)
2937                                 io_size = filesize - (upl_f_offset + upl_offset);
2938
2939                         /*
2940                          * issue an asynchronous read to cluster_io
2941                          */
2942
2943                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2944                                            io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2945                 }
2946                 if (error == 0) {
2947                         /*
2948                          * if the read completed successfully, or there was no I/O request
2949                          * issued, than copy the data into user land via 'cluster_upl_copy_data'
2950                          * we'll first add on any 'valid'
2951                          * pages that were present in the upl when we acquired it.
2952                          */
2953                         u_int  val_size;
2954
2955                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2956                                 if (!upl_valid_page(pl, uio_last))
2957                                         break;
2958                         }
2959                         /*
2960                          * compute size to transfer this round,  if uio->uio_resid is
2961                          * still non-zero after this attempt, we'll loop around and
2962                          * set up for another I/O.
2963                          */
2964                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2965
2966                         if (val_size > max_size)
2967                                 val_size = max_size;
2968
2969                         if (val_size > uio_resid(uio))
2970         // LP64todo - fix this
2971                                 val_size = uio_resid(uio);
2972
2973                         if (last_ioread_offset == 0)
2974                                 last_ioread_offset = uio->uio_offset + val_size;
2975
2976                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2977                                 /*
2978                                  * if there's still I/O left to do for this request, and...
2979                                  * we're not in hard throttle mode, then issue a
2980                                  * pre-fetch I/O... the I/O latency will overlap
2981                                  * with the copying of the data
2982                                  */
2983                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2984
2985                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2986
2987                                 if (last_ioread_offset > last_request_offset)
2988                                         last_ioread_offset = last_request_offset;
2989
2990                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
2991                                 /*
2992                                  * this transfer will finish this request, so...
2993                                  * let's try to read ahead if we're in
2994                                  * a sequential access pattern and we haven't
2995                                  * explicitly disabled it
2996                                  */
2997                                 if (rd_ahead_enabled)
2998                                         cluster_rd_ahead(vp, &extent, filesize, rap);
2999
3000                                 if (rap != NULL) {
3001                                         if (extent.e_addr < rap->cl_lastr)
3002                                                 rap->cl_maxra = 0;
3003                                         rap->cl_lastr = extent.e_addr;
3004                                 }
3005                         }
3006                         lck_mtx_lock(cl_mtxp);
3007
3008                         while (iostate.io_issued != iostate.io_completed) {
3009                                 iostate.io_wanted = 1;
3010                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3011                         }
3012                         lck_mtx_unlock(cl_mtxp);
3013
3014                         if (iostate.io_error)
3015                                 error = iostate.io_error;
3016                         else
3017                                 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3018                 }
3019                 if (start_pg < last_pg) {
3020                         /*
3021                          * compute the range of pages that we actually issued an I/O for
3022                          * and either commit them as valid if the I/O succeeded
3023                          * or abort them if the I/O failed
3024                          */
3025                         io_size = (last_pg - start_pg) * PAGE_SIZE;
3026
3027                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3028                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3029
3030                         if (error || (flags & IO_NOCACHE))
3031                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3032                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3033                         else
3034                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3035                                                      UPL_COMMIT_CLEAR_DIRTY |
3036                                                      UPL_COMMIT_FREE_ON_EMPTY |
3037                                                      UPL_COMMIT_INACTIVATE);
3038
3039                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3040                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3041                 }
3042                 if ((last_pg - start_pg) < pages_in_upl) {
3043                         int cur_pg;
3044                         int commit_flags;
3045
3046                         /*
3047                          * the set of pages that we issued an I/O for did not encompass
3048                          * the entire upl... so just release these without modifying
3049                          * their state
3050                          */
3051                         if (error)
3052                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3053                         else {
3054                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3055                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3056
3057                                 if (start_pg) {
3058                                         /*
3059                                          * we found some already valid pages at the beginning of
3060                                          * the upl commit these back to the inactive list with
3061                                          * reference cleared
3062                                          */
3063                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3064                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3065                                                                    | UPL_COMMIT_INACTIVATE;
3066
3067                                                 if (upl_dirty_page(pl, cur_pg))
3068                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3069
3070                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3071                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3072                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3073                                                 else
3074                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3075                                                                 PAGE_SIZE, commit_flags);
3076                                         }
3077                                 }
3078                                 if (last_pg < uio_last) {
3079                                         /*
3080                                          * we found some already valid pages immediately after the
3081                                          * pages we issued I/O for, commit these back to the
3082                                          * inactive list with reference cleared
3083                                          */
3084                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3085                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
3086                                                                                 | UPL_COMMIT_INACTIVATE;
3087
3088                                                 if (upl_dirty_page(pl, cur_pg))
3089                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
3090
3091                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3092                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3093                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3094                                                 else
3095                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3096                                                                 PAGE_SIZE, commit_flags);
3097                                         }
3098                                 }
3099                                 if (uio_last < pages_in_upl) {
3100                                         /*
3101                                          * there were some invalid pages beyond the valid pages
3102                                          * that we didn't issue an I/O for, just release them
3103                                          * unchanged
3104                                          */
3105                                         ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3106                                                             (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3107                                 }
3108
3109                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3110                                         (int)upl, -1, -1, 0, 0);
3111                         }
3112                 }
3113                 if (retval == 0)
3114                         retval = error;
3115
3116                 if ( uio_resid(uio) ) {
3117                         if (cluster_hard_throttle_on(vp)) {
3118                                 rd_ahead_enabled = 0;
3119                                 prefetch_enabled = 0;
3120
3121                                 max_rd_size = HARD_THROTTLE_MAXSIZE;
3122                         } else {
3123                                 if (rap != NULL)
3124                                         rd_ahead_enabled = 1;
3125                                 prefetch_enabled = 1;
3126
3127                                 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3128                         }
3129                 }
3130         }
3131         if (rap != NULL) {
3132                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3133                              (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3134
3135                 lck_mtx_unlock(&rap->cl_lockr);
3136         } else {
3137                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3138                              (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3139         }
3140
3141         return (retval);
3142 }
3143
3144
3145 static int
3146 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3147 {
3148         upl_t            upl;
3149         upl_page_info_t  *pl;
3150         vm_offset_t      upl_offset;
3151         off_t            max_io_size;
3152         int              io_size;
3153         int              upl_size;
3154         int              upl_needed_size;
3155         int              pages_in_pl;
3156         int              upl_flags;
3157         kern_return_t    kret;
3158         int              i;
3159         int              force_data_sync;
3160         int              retval = 0;
3161         int              no_zero_fill = 0;
3162         int              abort_flag = 0;
3163         struct clios     iostate;
3164         u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3165         u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3166
3167
3168         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3169                      (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3170
3171         /*
3172          * When we enter this routine, we know
3173          *  -- the offset into the file is on a pagesize boundary
3174          *  -- the resid is a page multiple
3175          *  -- the resid will not exceed iov_len
3176          */
3177
3178         iostate.io_completed = 0;
3179         iostate.io_issued = 0;
3180         iostate.io_error = 0;
3181         iostate.io_wanted = 0;
3182
3183         while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3184                 user_addr_t     iov_base;
3185
3186                 if (cluster_hard_throttle_on(vp)) {
3187                         max_rd_size  = HARD_THROTTLE_MAXSIZE;
3188                         max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3189                 } else {
3190                         max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3191                         max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
3192                 }
3193                 max_io_size = filesize - uio->uio_offset;
3194
3195         // LP64todo - fix this
3196                 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3197                         io_size = max_io_size;
3198                 else
3199                         io_size = uio_resid(uio);
3200
3201                 /*
3202                  * First look for pages already in the cache
3203                  * and move them to user space.
3204                  */
3205                 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3206
3207                 if (retval) {
3208                         /*
3209                          * we may have already spun some portion of this request
3210                          * off as async requests... we need to wait for the I/O
3211                          * to complete before returning
3212                          */
3213                         goto wait_for_reads;
3214                 }
3215                 /*
3216                  * If we are already finished with this read, then return
3217                  */
3218                 if (io_size == 0) {
3219                         /*
3220                          * we may have already spun some portion of this request
3221                          * off as async requests... we need to wait for the I/O
3222                          * to complete before returning
3223                          */
3224                         goto wait_for_reads;
3225                 }
3226                 max_io_size = io_size;
3227
3228                 if (max_io_size > max_rd_size)
3229                         max_io_size = max_rd_size;
3230
3231                 io_size = 0;
3232
3233                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3234
3235                 if (io_size == 0)
3236                         /*
3237                          * we may have already spun some portion of this request
3238                          * off as async requests... we need to wait for the I/O
3239                          * to complete before returning
3240                          */
3241                         goto wait_for_reads;
3242
3243                 iov_base = uio_curriovbase(uio);
3244
3245                 // LP64todo - fix this!
3246                 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3247                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3248
3249                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3250                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3251
3252                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3253                         no_zero_fill = 1;
3254                         abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3255                 } else {
3256                         no_zero_fill = 0;
3257                         abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3258                 }
3259                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3260                         pages_in_pl = 0;
3261                         upl_size = upl_needed_size;
3262                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3263
3264                         if (no_zero_fill)
3265                                 upl_flags |= UPL_NOZEROFILL;
3266                         if (force_data_sync)
3267                                 upl_flags |= UPL_FORCE_DATA_SYNC;
3268
3269                         // LP64todo - fix this!
3270                         kret = vm_map_create_upl(current_map(),
3271                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3272                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3273
3274                         if (kret != KERN_SUCCESS) {
3275                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3276                                              (int)upl_offset, upl_size, io_size, kret, 0);
3277                                 /*
3278                                  * cluster_nocopy_read: failed to get pagelist
3279                                  *
3280                                  * we may have already spun some portion of this request
3281                                  * off as async requests... we need to wait for the I/O
3282                                  * to complete before returning
3283                                  */
3284                                 goto wait_for_reads;
3285                         }
3286                         pages_in_pl = upl_size / PAGE_SIZE;
3287                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3288
3289                         for (i = 0; i < pages_in_pl; i++) {
3290                                 if (!upl_valid_page(pl, i))
3291                                         break;
3292                         }
3293                         if (i == pages_in_pl)
3294                                 break;
3295
3296                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3297                 }
3298                 if (force_data_sync >= 3) {
3299                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3300                                      (int)upl_offset, upl_size, io_size, kret, 0);
3301
3302                         goto wait_for_reads;
3303                 }
3304                 /*
3305                  * Consider the possibility that upl_size wasn't satisfied.
3306                  */
3307                 if (upl_size != upl_needed_size)
3308                         io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3309
3310                 if (io_size == 0) {
3311                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3312                         goto wait_for_reads;
3313                 }
3314                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3315                              (int)upl_offset, upl_size, io_size, kret, 0);
3316
3317                 /*
3318                  * request asynchronously so that we can overlap
3319                  * the preparation of the next I/O
3320                  * if there are already too many outstanding reads
3321                  * wait until some have completed before issuing the next read
3322                  */
3323                 lck_mtx_lock(cl_mtxp);
3324
3325                 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3326                         iostate.io_wanted = 1;
3327                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3328                 }
3329                 lck_mtx_unlock(cl_mtxp);
3330
3331                 if (iostate.io_error) {
3332                         /*
3333                          * one of the earlier reads we issued ran into a hard error
3334                          * don't issue any more reads, cleanup the UPL
3335                          * that was just created but not used, then
3336                          * go wait for any other reads to complete before
3337                          * returning the error to the caller
3338                          */
3339                         ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3340
3341                         goto wait_for_reads;
3342                 }
3343                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3344                              (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3345
3346                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3347                                    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3348                                    (buf_t)NULL, &iostate);
3349
3350                 /*
3351                  * update the uio structure
3352                  */
3353                 uio_update(uio, (user_size_t)io_size);
3354
3355                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3356                              (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3357
3358         } /* end while */
3359
3360 wait_for_reads:
3361         /*
3362          * make sure all async reads that are part of this stream
3363          * have completed before we return
3364          */
3365         lck_mtx_lock(cl_mtxp);
3366
3367         while (iostate.io_issued != iostate.io_completed) {
3368                 iostate.io_wanted = 1;
3369                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3370         }
3371         lck_mtx_unlock(cl_mtxp);
3372
3373         if (iostate.io_error)
3374                 retval = iostate.io_error;
3375
3376         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3377                      (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3378
3379         return (retval);
3380 }
3381
3382
3383 static int
3384 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3385 {
3386         upl_page_info_t *pl;
3387         upl_t            upl;
3388         vm_offset_t      upl_offset;
3389         addr64_t         dst_paddr;
3390         off_t            max_size;
3391         int              io_size;
3392         user_size_t      iov_len;
3393         user_addr_t      iov_base;
3394         int              tail_size;
3395         int              upl_size;
3396         int              upl_needed_size;
3397         int              pages_in_pl;
3398         int              upl_flags;
3399         kern_return_t    kret;
3400         struct clios     iostate;
3401         int              error;
3402         int              devblocksize;
3403
3404         devblocksize = vp->v_mount->mnt_devblocksize;
3405         /*
3406          * When we enter this routine, we know
3407          *  -- the resid will not exceed iov_len
3408          *  -- the target address is physically contiguous
3409          */
3410
3411 #if LP64_DEBUG
3412         if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3413                 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3414         }
3415 #endif /* LP64_DEBUG */
3416
3417         iov_len = uio_curriovlen(uio);
3418         iov_base = uio_curriovbase(uio);
3419
3420         max_size = filesize - uio->uio_offset;
3421
3422         // LP64todo - fix this!
3423         if (max_size < 0 || (u_int64_t)max_size > iov_len)
3424                 io_size = iov_len;
3425         else
3426                 io_size = max_size;
3427
3428         // LP64todo - fix this!
3429         upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3430         upl_needed_size = upl_offset + io_size;
3431
3432         error       = 0;
3433         pages_in_pl = 0;
3434         upl_size = upl_needed_size;
3435         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3436
3437         kret = vm_map_get_upl(current_map(),
3438                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3439                               &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3440
3441         if (kret != KERN_SUCCESS) {
3442                 /*
3443                  * cluster_phys_read: failed to get pagelist
3444                  */
3445                 return(EINVAL);
3446         }
3447         if (upl_size < upl_needed_size) {
3448                 /*
3449                  * The upl_size wasn't satisfied.
3450                  */
3451                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3452
3453                 return(EINVAL);
3454         }
3455         pl = ubc_upl_pageinfo(upl);
3456
3457         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3458
3459         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3460                 int   head_size;
3461
3462                 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3463
3464                 if (head_size > io_size)
3465                         head_size = io_size;
3466
3467                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3468
3469                 if (error) {
3470                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3471
3472                         return(EINVAL);
3473                 }
3474                 upl_offset += head_size;
3475                 dst_paddr  += head_size;
3476                 io_size    -= head_size;
3477         }
3478         tail_size = io_size & (devblocksize - 1);
3479         io_size  -= tail_size;
3480
3481         iostate.io_completed = 0;
3482         iostate.io_issued = 0;
3483         iostate.io_error = 0;
3484         iostate.io_wanted = 0;
3485
3486         while (io_size && error == 0) {
3487                 int  xsize;
3488
3489                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3490                         xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3491                 else
3492                         xsize = io_size;
3493                 /*
3494                  * request asynchronously so that we can overlap
3495                  * the preparation of the next I/O... we'll do
3496                  * the commit after all the I/O has completed
3497                  * since its all issued against the same UPL
3498                  * if there are already too many outstanding reads
3499                  * wait until some have completed before issuing the next
3500                  */
3501                 lck_mtx_lock(cl_mtxp);
3502
3503                 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3504                         iostate.io_wanted = 1;
3505                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3506                 }
3507                 lck_mtx_unlock(cl_mtxp);
3508
3509                 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3510                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3511                                    (buf_t)NULL, &iostate);
3512                 /*
3513                  * The cluster_io read was issued successfully,
3514                  * update the uio structure
3515                  */
3516                 if (error == 0) {
3517                         uio_update(uio, (user_size_t)xsize);
3518
3519                         dst_paddr  += xsize;
3520                         upl_offset += xsize;
3521                         io_size    -= xsize;
3522                 }
3523         }
3524         /*
3525          * make sure all async reads that are part of this stream
3526          * have completed before we proceed
3527          */
3528         lck_mtx_lock(cl_mtxp);
3529
3530         while (iostate.io_issued != iostate.io_completed) {
3531                 iostate.io_wanted = 1;
3532                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3533         }
3534         lck_mtx_unlock(cl_mtxp);
3535
3536         if (iostate.io_error)
3537                 error = iostate.io_error;
3538
3539         if (error == 0 && tail_size)
3540                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3541
3542         /*
3543          * just release our hold on the physically contiguous
3544          * region without changing any state
3545          */
3546         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3547
3548         return (error);
3549 }
3550
3551
3552 /*
3553  * generate advisory I/O's in the largest chunks possible
3554  * the completed pages will be released into the VM cache
3555  */
3556 int
3557 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3558 {
3559         upl_page_info_t *pl;
3560         upl_t            upl;
3561         vm_offset_t      upl_offset;
3562         int              upl_size;
3563         off_t            upl_f_offset;
3564         int              start_offset;
3565         int              start_pg;
3566         int              last_pg;
3567         int              pages_in_upl;
3568         off_t            max_size;
3569         int              io_size;
3570         kern_return_t    kret;
3571         int              retval = 0;
3572         int              issued_io;
3573         int              skip_range;
3574
3575         if ( !UBCINFOEXISTS(vp))
3576                 return(EINVAL);
3577
3578         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3579                      (int)f_offset, resid, (int)filesize, 0, 0);
3580
3581         while (resid && f_offset < filesize && retval == 0) {
3582                 /*
3583                  * compute the size of the upl needed to encompass
3584                  * the requested read... limit each call to cluster_io
3585                  * to the maximum UPL size... cluster_io will clip if
3586                  * this exceeds the maximum io_size for the device,
3587                  * make sure to account for
3588                  * a starting offset that's not page aligned
3589                  */
3590                 start_offset = (int)(f_offset & PAGE_MASK_64);
3591                 upl_f_offset = f_offset - (off_t)start_offset;
3592                 max_size     = filesize - f_offset;
3593
3594                 if (resid < max_size)
3595                         io_size = resid;
3596                 else
3597                         io_size = max_size;
3598
3599                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3600                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3601                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3602
3603                 skip_range = 0;
3604                 /*
3605                  * return the number of contiguously present pages in the cache
3606                  * starting at upl_f_offset within the file
3607                  */
3608                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3609
3610                 if (skip_range) {
3611                         /*
3612                          * skip over pages already present in the cache
3613                          */
3614                         io_size = skip_range - start_offset;
3615
3616                         f_offset += io_size;
3617                         resid    -= io_size;
3618
3619                         if (skip_range == upl_size)
3620                                 continue;
3621                         /*
3622                          * have to issue some real I/O
3623                          * at this point, we know it's starting on a page boundary
3624                          * because we've skipped over at least the first page in the request
3625                          */
3626                         start_offset = 0;
3627                         upl_f_offset += skip_range;
3628                         upl_size     -= skip_range;
3629                 }
3630                 pages_in_upl = upl_size / PAGE_SIZE;
3631
3632                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3633                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3634
3635                 kret = ubc_create_upl(vp,
3636                                       upl_f_offset,
3637                                       upl_size,
3638                                       &upl,
3639                                       &pl,
3640                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3641                 if (kret != KERN_SUCCESS)
3642                         return(retval);
3643                 issued_io = 0;
3644
3645                 /*
3646                  * before we start marching forward, we must make sure we end on
3647                  * a present page, otherwise we will be working with a freed
3648                  * upl
3649                  */
3650                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3651                         if (upl_page_present(pl, last_pg))
3652                                 break;
3653                 }
3654                 pages_in_upl = last_pg + 1;
3655
3656
3657                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3658                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3659
3660
3661                 for (last_pg = 0; last_pg < pages_in_upl; ) {
3662                         /*
3663                          * scan from the beginning of the upl looking for the first
3664                          * page that is present.... this will become the first page in
3665                          * the request we're going to make to 'cluster_io'... if all
3666                          * of the pages are absent, we won't call through to 'cluster_io'
3667                          */
3668                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3669                                 if (upl_page_present(pl, start_pg))
3670                                         break;
3671                         }
3672
3673                         /*
3674                          * scan from the starting present page looking for an absent
3675                          * page before the end of the upl is reached, if we
3676                          * find one, then it will terminate the range of pages being
3677                          * presented to 'cluster_io'
3678                          */
3679                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3680                                 if (!upl_page_present(pl, last_pg))
3681                                         break;
3682                         }
3683
3684                         if (last_pg > start_pg) {
3685                                 /*
3686                                  * we found a range of pages that must be filled
3687                                  * if the last page in this range is the last page of the file
3688                                  * we may have to clip the size of it to keep from reading past
3689                                  * the end of the last physical block associated with the file
3690                                  */
3691                                 upl_offset = start_pg * PAGE_SIZE;
3692                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
3693
3694                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
3695                                         io_size = filesize - (upl_f_offset + upl_offset);
3696
3697                                 /*
3698                                  * issue an asynchronous read to cluster_io
3699                                  */
3700                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3701                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3702
3703                                 issued_io = 1;
3704                         }
3705                 }
3706                 if (issued_io == 0)
3707                         ubc_upl_abort(upl, 0);
3708
3709                 io_size = upl_size - start_offset;
3710
3711                 if (io_size > resid)
3712                         io_size = resid;
3713                 f_offset += io_size;
3714                 resid    -= io_size;
3715         }
3716
3717         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3718                      (int)f_offset, resid, retval, 0, 0);
3719
3720         return(retval);
3721 }
3722
3723
3724 int
3725 cluster_push(vnode_t vp, int flags)
3726 {
3727         int     retval;
3728         struct  cl_writebehind *wbp;
3729
3730         if ( !UBCINFOEXISTS(vp)) {
3731                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3732                 return (0);
3733         }
3734         /* return if deferred write is set */
3735         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3736                 return (0);
3737         }
3738         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3739                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3740                 return (0);
3741         }
3742         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3743                 lck_mtx_unlock(&wbp->cl_lockw);
3744
3745                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3746                 return(0);
3747         }
3748         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3749                      (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3750
3751         if (wbp->cl_scmap) {
3752                 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3753
3754                 retval = 1;
3755         } else
3756                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3757
3758         lck_mtx_unlock(&wbp->cl_lockw);
3759
3760         if (flags & IO_SYNC)
3761                 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3762
3763         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3764                      (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3765
3766         return (retval);
3767 }
3768
3769
3770 __private_extern__ void
3771 cluster_release(struct ubc_info *ubc)
3772 {
3773         struct cl_writebehind *wbp;
3774         struct cl_readahead   *rap;
3775
3776         if ((wbp = ubc->cl_wbehind)) {
3777
3778                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3779
3780                 if (wbp->cl_scmap)
3781                         vfs_drt_control(&(wbp->cl_scmap), 0);
3782         } else {
3783                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3784         }
3785
3786         rap = ubc->cl_rahead;
3787
3788         if (wbp != NULL) {
3789                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3790                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3791         }
3792         if ((rap = ubc->cl_rahead)) {
3793                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3794                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3795         }
3796         ubc->cl_rahead  = NULL;
3797         ubc->cl_wbehind = NULL;
3798
3799         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3800 }
3801
3802
3803 static void
3804 cluster_push_EOF(vnode_t vp, off_t EOF)
3805 {
3806         struct cl_writebehind *wbp;
3807
3808         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3809
3810         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3811                      (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3812
3813         if (wbp->cl_scmap)
3814                 sparse_cluster_push(wbp, vp, EOF, 1);
3815         else
3816                 cluster_try_push(wbp, vp, EOF, 0, 1);
3817
3818         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3819                      (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3820
3821         lck_mtx_unlock(&wbp->cl_lockw);
3822 }
3823
3824
3825 static int
3826 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3827 {
3828         int cl_index;
3829         int cl_index1;
3830         int min_index;
3831         int cl_len;
3832         int cl_pushed = 0;
3833         struct cl_wextent l_clusters[MAX_CLUSTERS];
3834
3835         /*
3836          * the write behind context exists and has
3837          * already been locked...
3838          *
3839          * make a local 'sorted' copy of the clusters
3840          * and clear wbp->cl_number so that new clusters can
3841          * be developed
3842          */
3843         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3844                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3845                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3846                                 continue;
3847                         if (min_index == -1)
3848                                 min_index = cl_index1;
3849                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3850                                 min_index = cl_index1;
3851                 }
3852                 if (min_index == -1)
3853                         break;
3854                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3855                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3856                 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3857
3858                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3859         }
3860         wbp->cl_number = 0;
3861
3862         cl_len = cl_index;
3863
3864         if (can_delay && cl_len == MAX_CLUSTERS) {
3865                 int   i;
3866
3867                 /*
3868                  * determine if we appear to be writing the file sequentially
3869                  * if not, by returning without having pushed any clusters
3870                  * we will cause this vnode to be pushed into the sparse cluster mechanism
3871                  * used for managing more random I/O patterns
3872                  *
3873                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3874                  * that's why we're in try_push with can_delay true...
3875                  *
3876                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3877                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3878                  * so we can just make a simple pass through, up to, but not including the last one...
3879                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3880                  * are sequential
3881                  *
3882                  * we let the last one be partial as long as it was adjacent to the previous one...
3883                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3884                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3885                  */
3886                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3887                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3888                                 goto dont_try;
3889                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3890                                 goto dont_try;
3891                 }
3892         }
3893         /*
3894          * drop the lock while we're firing off the I/Os...
3895          * this is safe since I'm working off of a private sorted copy
3896          * of the clusters, and I'm going to re-evaluate the public
3897          * state after I retake the lock
3898          */
3899         lck_mtx_unlock(&wbp->cl_lockw);
3900
3901         for (cl_index = 0; cl_index < cl_len; cl_index++) {
3902                 int flags;
3903                 struct cl_extent cl;
3904
3905                 /*
3906                  * try to push each cluster in turn...
3907                  */
3908                 if (l_clusters[cl_index].io_nocache)
3909                         flags = IO_NOCACHE;
3910                 else
3911                         flags = 0;
3912                 cl.b_addr = l_clusters[cl_index].b_addr;
3913                 cl.e_addr = l_clusters[cl_index].e_addr;
3914
3915                 cluster_push_x(vp, &cl, EOF, flags);
3916
3917                 l_clusters[cl_index].b_addr = 0;
3918                 l_clusters[cl_index].e_addr = 0;
3919
3920                 cl_pushed++;
3921
3922                 if (push_all == 0)
3923                         break;
3924         }
3925         lck_mtx_lock(&wbp->cl_lockw);
3926
3927 dont_try:
3928         if (cl_len > cl_pushed) {
3929                /*
3930                 * we didn't push all of the clusters, so
3931                 * lets try to merge them back in to the vnode
3932                 */
3933                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3934                         /*
3935                          * we picked up some new clusters while we were trying to
3936                          * push the old ones... this can happen because I've dropped
3937                          * the vnode lock... the sum of the
3938                          * leftovers plus the new cluster count exceeds our ability
3939                          * to represent them, so switch to the sparse cluster mechanism
3940                          *
3941                          * collect the active public clusters...
3942                          */
3943                         sparse_cluster_switch(wbp, vp, EOF);
3944
3945                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3946                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3947                                         continue;
3948                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3949                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3950                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3951
3952                                 cl_index1++;
3953                         }
3954                         /*
3955                          * update the cluster count
3956                          */
3957                         wbp->cl_number = cl_index1;
3958
3959                         /*
3960                          * and collect the original clusters that were moved into the
3961                          * local storage for sorting purposes
3962                          */
3963                         sparse_cluster_switch(wbp, vp, EOF);
3964
3965                 } else {
3966                         /*
3967                          * we've got room to merge the leftovers back in
3968                          * just append them starting at the next 'hole'
3969                          * represented by wbp->cl_number
3970                          */
3971                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3972                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3973                                         continue;
3974
3975                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3976                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3977                                 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3978
3979                                 cl_index1++;
3980                         }
3981                         /*
3982                          * update the cluster count
3983                          */
3984                         wbp->cl_number = cl_index1;
3985                 }
3986         }
3987         return(MAX_CLUSTERS - wbp->cl_number);
3988 }
3989
3990
3991
3992 static int
3993 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
3994 {
3995         upl_page_info_t *pl;
3996         upl_t            upl;
3997         vm_offset_t      upl_offset;
3998         int              upl_size;
3999         off_t            upl_f_offset;
4000         int              pages_in_upl;
4001         int              start_pg;
4002         int              last_pg;
4003         int              io_size;
4004         int              io_flags;
4005         int              upl_flags;
4006         int              size;
4007         int              error = 0;
4008         int              retval;
4009         kern_return_t    kret;
4010
4011
4012         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4013                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4014
4015         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4016                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4017
4018                 return (0);
4019         }
4020         upl_size = pages_in_upl * PAGE_SIZE;
4021         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4022
4023         if (upl_f_offset + upl_size >= EOF) {
4024
4025                 if (upl_f_offset >= EOF) {
4026                         /*
4027                          * must have truncated the file and missed
4028                          * clearing a dangling cluster (i.e. it's completely
4029                          * beyond the new EOF
4030                          */
4031                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4032
4033                         return(0);
4034                 }
4035                 size = EOF - upl_f_offset;
4036
4037                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4038                 pages_in_upl = upl_size / PAGE_SIZE;
4039         } else
4040                 size = upl_size;
4041
4042         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4043
4044         /*
4045          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4046          *
4047          * - only pages that are currently dirty are returned... these are the ones we need to clean
4048          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4049          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4050          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4051          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
4052          *
4053          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4054          */
4055
4056         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4057                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4058         else
4059                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4060
4061         kret = ubc_create_upl(vp,
4062                                 upl_f_offset,
4063                                 upl_size,
4064                                 &upl,
4065                                 &pl,
4066                                 upl_flags);
4067         if (kret != KERN_SUCCESS)
4068                 panic("cluster_push: failed to get pagelist");
4069
4070         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4071
4072         /*
4073          * since we only asked for the dirty pages back
4074          * it's possible that we may only get a few or even none, so...
4075          * before we start marching forward, we must make sure we know
4076          * where the last present page is in the UPL, otherwise we could
4077          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4078          * employed by commit_range and abort_range.
4079          */
4080         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4081                 if (upl_page_present(pl, last_pg))
4082                         break;
4083         }
4084         pages_in_upl = last_pg + 1;
4085
4086         if (pages_in_upl == 0) {
4087                 ubc_upl_abort(upl, 0);
4088
4089                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4090                 return(0);
4091         }
4092
4093         for (last_pg = 0; last_pg < pages_in_upl; ) {
4094                 /*
4095                  * find the next dirty page in the UPL
4096                  * this will become the first page in the
4097                  * next I/O to generate
4098                  */
4099                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4100                         if (upl_dirty_page(pl, start_pg))
4101                                 break;
4102                         if (upl_page_present(pl, start_pg))
4103                                 /*
4104                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4105                                  * just release these unchanged since we're not going
4106                                  * to steal them or change their state
4107                                  */
4108                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4109                 }
4110                 if (start_pg >= pages_in_upl)
4111                         /*
4112                          * done... no more dirty pages to push
4113                          */
4114                         break;
4115                 if (start_pg > last_pg)
4116                         /*
4117                          * skipped over some non-dirty pages
4118                          */
4119                         size -= ((start_pg - last_pg) * PAGE_SIZE);
4120
4121                 /*
4122                  * find a range of dirty pages to write
4123                  */
4124                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4125                         if (!upl_dirty_page(pl, last_pg))
4126                                 break;
4127                 }
4128                 upl_offset = start_pg * PAGE_SIZE;
4129
4130                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4131
4132                 io_flags = CL_THROTTLE | CL_COMMIT;
4133
4134                 if ( !(flags & IO_SYNC))
4135                         io_flags |= CL_ASYNC;
4136
4137                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4138                                     io_flags, (buf_t)NULL, (struct clios *)NULL);
4139
4140                 if (error == 0 && retval)
4141                         error = retval;
4142
4143                 size -= io_size;
4144         }
4145         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4146
4147         return(error);
4148 }
4149
4150
4151 /*
4152  * sparse_cluster_switch is called with the write behind lock held
4153  */
4154 static void
4155 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4156 {
4157         int     cl_index;
4158
4159         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4160
4161         if (wbp->cl_scmap == NULL)
4162                 wbp->cl_scdirty = 0;
4163
4164         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4165                 int       flags;
4166                 struct cl_extent cl;
4167
4168                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4169
4170                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4171                                 if (flags & UPL_POP_DIRTY) {
4172                                         cl.e_addr = cl.b_addr + 1;
4173
4174                                         sparse_cluster_add(wbp, vp, &cl, EOF);
4175                                 }
4176                         }
4177                 }
4178         }
4179         wbp->cl_number = 0;
4180
4181         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4182 }
4183
4184
4185 /*
4186  * sparse_cluster_push is called with the write behind lock held
4187  */
4188 static void
4189 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4190 {
4191         struct cl_extent cl;
4192         off_t           offset;
4193         u_int           length;
4194
4195         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4196
4197         if (push_all)
4198                 vfs_drt_control(&(wbp->cl_scmap), 1);
4199
4200         for (;;) {
4201                 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4202                         break;
4203
4204                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4205                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4206
4207                 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4208
4209                 cluster_push_x(vp, &cl, EOF, 0);
4210
4211                 if (push_all == 0)
4212                         break;
4213         }
4214         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4215 }
4216
4217
4218 /*
4219  * sparse_cluster_add is called with the write behind lock held
4220  */
4221 static void
4222 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4223 {
4224         u_int   new_dirty;
4225         u_int   length;
4226         off_t   offset;
4227
4228         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4229
4230         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4231         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4232
4233         while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4234                 /*
4235                  * no room left in the map
4236                  * only a partial update was done
4237                  * push out some pages and try again
4238                  */
4239                 wbp->cl_scdirty += new_dirty;
4240
4241                 sparse_cluster_push(wbp, vp, EOF, 0);
4242
4243                 offset += (new_dirty * PAGE_SIZE_64);
4244                 length -= (new_dirty * PAGE_SIZE);
4245         }
4246         wbp->cl_scdirty += new_dirty;
4247
4248         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4249 }
4250
4251
4252 static int
4253 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4254 {
4255         upl_page_info_t  *pl;
4256         upl_t            upl;
4257         addr64_t         ubc_paddr;
4258         kern_return_t    kret;
4259         int              error = 0;
4260         int              did_read = 0;
4261         int              abort_flags;
4262         int              upl_flags;
4263
4264         upl_flags = UPL_SET_LITE;
4265         if (! (flags & CL_READ)) {
4266                 /*
4267                  * "write" operation:  let the UPL subsystem know
4268                  * that we intend to modify the buffer cache pages
4269                  * we're gathering.
4270                  */
4271                 upl_flags |= UPL_WILL_MODIFY;
4272         }
4273
4274         kret = ubc_create_upl(vp,
4275                               uio->uio_offset & ~PAGE_MASK_64,
4276                               PAGE_SIZE,
4277                               &upl,
4278                               &pl,
4279                               upl_flags);
4280
4281         if (kret != KERN_SUCCESS)
4282                 return(EINVAL);
4283
4284         if (!upl_valid_page(pl, 0)) {
4285                 /*
4286                  * issue a synchronous read to cluster_io
4287                  */
4288                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4289                                    CL_READ, (buf_t)NULL, (struct clios *)NULL);
4290                 if (error) {
4291                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4292
4293                           return(error);
4294                 }
4295                 did_read = 1;
4296         }
4297         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4298
4299 /*
4300  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
4301  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4302  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
4303  *      way to do so without exporting them to kexts as well.
4304  */
4305         if (flags & CL_READ)
4306 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
4307                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
4308         else
4309 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
4310                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
4311
4312         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4313                 /*
4314                  * issue a synchronous write to cluster_io
4315                  */
4316                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4317                                         0, (buf_t)NULL, (struct clios *)NULL);
4318         }
4319         if (error == 0)
4320                 uio_update(uio, (user_size_t)xsize);
4321
4322         if (did_read)
4323                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4324         else
4325                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4326
4327         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4328
4329         return (error);
4330 }
4331
4332
4333
4334 int
4335 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4336 {
4337         int       pg_offset;
4338         int       pg_index;
4339         int       csize;
4340         int       segflg;
4341         int       retval = 0;
4342         upl_page_info_t *pl;
4343
4344         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4345                      (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4346
4347         segflg = uio->uio_segflg;
4348
4349         switch(segflg) {
4350
4351           case UIO_USERSPACE32:
4352           case UIO_USERISPACE32:
4353                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4354                 break;
4355
4356           case UIO_USERSPACE:
4357           case UIO_USERISPACE:
4358                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4359                 break;
4360
4361           case UIO_USERSPACE64:
4362           case UIO_USERISPACE64:
4363                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4364                 break;
4365
4366           case UIO_SYSSPACE32:
4367                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4368                 break;
4369
4370           case UIO_SYSSPACE:
4371                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4372                 break;
4373
4374           case UIO_SYSSPACE64:
4375                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4376                 break;
4377         }
4378         pl = ubc_upl_pageinfo(upl);
4379
4380         pg_index  = upl_offset / PAGE_SIZE;
4381         pg_offset = upl_offset & PAGE_MASK;
4382         csize     = min(PAGE_SIZE - pg_offset, xsize);
4383
4384         while (xsize && retval == 0) {
4385                 addr64_t  paddr;
4386
4387                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4388
4389                 retval = uiomove64(paddr, csize, uio);
4390
4391                 pg_index += 1;
4392                 pg_offset = 0;
4393                 xsize    -= csize;
4394                 csize     = min(PAGE_SIZE, xsize);
4395         }
4396         uio->uio_segflg = segflg;
4397
4398         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4399                      (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4400
4401         return (retval);
4402 }
4403
4404
4405 int
4406 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4407 {
4408         int       segflg;
4409         int       io_size;
4410         int       xsize;
4411         int       start_offset;
4412         int       retval = 0;
4413         memory_object_control_t  control;
4414
4415
4416         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4417                      (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4418
4419         control = ubc_getobject(vp, UBC_FLAGS_NONE);
4420         if (control == MEMORY_OBJECT_CONTROL_NULL) {
4421                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4422                              (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4423
4424                 return(0);
4425         }
4426         segflg = uio->uio_segflg;
4427
4428         switch(segflg) {
4429
4430           case UIO_USERSPACE32:
4431           case UIO_USERISPACE32:
4432                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4433                 break;
4434
4435           case UIO_USERSPACE64:
4436           case UIO_USERISPACE64:
4437                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4438                 break;
4439
4440           case UIO_SYSSPACE32:
4441                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4442                 break;
4443
4444           case UIO_SYSSPACE64:
4445                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4446                 break;
4447
4448           case UIO_USERSPACE:
4449           case UIO_USERISPACE:
4450                 uio->uio_segflg = UIO_PHYS_USERSPACE;
4451                 break;
4452
4453           case UIO_SYSSPACE:
4454                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4455                 break;
4456         }
4457
4458         if ( (io_size = *io_resid) ) {
4459                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4460                 xsize = uio_resid(uio);
4461
4462                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4463                                                        uio, start_offset, io_size, mark_dirty);
4464                 xsize -= uio_resid(uio);
4465                 io_size -= xsize;
4466         }
4467         uio->uio_segflg = segflg;
4468         *io_resid       = io_size;
4469
4470         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4471                      (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4472
4473         return(retval);
4474 }
4475
4476
4477 int
4478 is_file_clean(vnode_t vp, off_t filesize)
4479 {
4480         off_t f_offset;
4481         int   flags;
4482         int   total_dirty = 0;
4483
4484         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4485                 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4486                         if (flags & UPL_POP_DIRTY) {
4487                                 total_dirty++;
4488                         }
4489                 }
4490         }
4491         if (total_dirty)
4492                 return(EINVAL);
4493
4494         return (0);
4495 }
4496
4497
4498
4499 /*
4500  * Dirty region tracking/clustering mechanism.
4501  *
4502  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4503  * dirty regions within a larger space (file).  It is primarily intended to
4504  * support clustering in large files with many dirty areas.
4505  *
4506  * The implementation assumes that the dirty regions are pages.
4507  *
4508  * To represent dirty pages within the file, we store bit vectors in a
4509  * variable-size circular hash.
4510  */
4511
4512 /*
4513  * Bitvector size.  This determines the number of pages we group in a
4514  * single hashtable entry.  Each hashtable entry is aligned to this
4515  * size within the file.
4516  */
4517 #define DRT_BITVECTOR_PAGES             256
4518
4519 /*
4520  * File offset handling.
4521  *
4522  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4523  * the correct formula is  (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4524  */
4525 #define DRT_ADDRESS_MASK                (~((1 << 20) - 1))
4526 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
4527
4528 /*
4529  * Hashtable address field handling.
4530  *
4531  * The low-order bits of the hashtable address are used to conserve
4532  * space.
4533  *
4534  * DRT_HASH_COUNT_MASK must be large enough to store the range
4535  * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4536  * to indicate that the bucket is actually unoccupied.
4537  */
4538 #define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4539 #define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
4540         do {                                                                                            \
4541                 (scm)->scm_hashtable[(i)].dhe_control =                                                 \
4542                     ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4543         } while (0)
4544 #define DRT_HASH_COUNT_MASK             0x1ff
4545 #define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4546 #define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
4547         do {                                                                                                            \
4548                 (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
4549                     ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
4550         } while (0)
4551 #define DRT_HASH_CLEAR(scm, i)                                                                                          \
4552         do {                                                                                                            \
4553                 (scm)->scm_hashtable[(i)].dhe_control = 0;                                                              \
4554         } while (0)
4555 #define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4556 #define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4557 #define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
4558         do {                                                                                            \
4559                 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
4560                 DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
4561         } while(0);
4562
4563
4564 /*
4565  * Hash table moduli.
4566  *
4567  * Since the hashtable entry's size is dependent on the size of
4568  * the bitvector, and since the hashtable size is constrained to
4569  * both being prime and fitting within the desired allocation
4570  * size, these values need to be manually determined.
4571  *
4572  * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
4573  *
4574  * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4575  * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4576  */
4577 #define DRT_HASH_SMALL_MODULUS  23
4578 #define DRT_HASH_LARGE_MODULUS  401
4579
4580 #define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare */
4581 #define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare */
4582
4583 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4584
4585 /*
4586  * Hashtable bitvector handling.
4587  *
4588  * Bitvector fields are 32 bits long.
4589  */
4590
4591 #define DRT_HASH_SET_BIT(scm, i, bit)                           \
4592         (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4593
4594 #define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
4595         (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4596
4597 #define DRT_HASH_TEST_BIT(scm, i, bit)                          \
4598         ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4599
4600 #define DRT_BITVECTOR_CLEAR(scm, i)                             \
4601         bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4602
4603 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
4604         bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
4605             &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
4606             (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4607
4608
4609
4610 /*
4611  * Hashtable entry.
4612  */
4613 struct vfs_drt_hashentry {
4614         u_int64_t       dhe_control;
4615         u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4616 };
4617
4618 /*
4619  * Dirty Region Tracking structure.
4620  *
4621  * The hashtable is allocated entirely inside the DRT structure.
4622  *
4623  * The hash is a simple circular prime modulus arrangement, the structure
4624  * is resized from small to large if it overflows.
4625  */
4626
4627 struct vfs_drt_clustermap {
4628         u_int32_t               scm_magic;      /* sanity/detection */
4629 #define DRT_SCM_MAGIC           0x12020003
4630         u_int32_t               scm_modulus;    /* current ring size */
4631         u_int32_t               scm_buckets;    /* number of occupied buckets */
4632         u_int32_t               scm_lastclean;  /* last entry we cleaned */
4633         u_int32_t               scm_iskips;     /* number of slot skips */
4634
4635         struct vfs_drt_hashentry scm_hashtable[0];
4636 };
4637
4638
4639 #define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
4640 #define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
4641
4642 /*
4643  * Debugging codes and arguments.
4644  */
4645 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4646 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4647 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4648 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4649 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4650                                                             * dirty */
4651                                                            /* 0, setcount */
4652                                                            /* 1 (clean, no map) */
4653                                                            /* 2 (map alloc fail) */
4654                                                            /* 3, resid (partial) */
4655 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
4656 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4657                                                             * lastclean, iskips */
4658
4659
4660 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4661 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4662 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4663         u_int64_t offset, int *indexp);
4664 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4665         u_int64_t offset,
4666         int *indexp,
4667         int recursed);
4668 static kern_return_t    vfs_drt_do_mark_pages(
4669         void            **cmapp,
4670         u_int64_t       offset,
4671         u_int           length,
4672         int             *setcountp,
4673         int             dirty);
4674 static void             vfs_drt_trace(
4675         struct vfs_drt_clustermap *cmap,
4676         int code,
4677         int arg1,
4678         int arg2,
4679         int arg3,
4680         int arg4);
4681
4682
4683 /*
4684  * Allocate and initialise a sparse cluster map.
4685  *
4686  * Will allocate a new map, resize or compact an existing map.
4687  *
4688  * XXX we should probably have at least one intermediate map size,
4689  * as the 1:16 ratio seems a bit drastic.
4690  */
4691 static kern_return_t
4692 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4693 {
4694         struct vfs_drt_clustermap *cmap, *ocmap;
4695         kern_return_t   kret;
4696         u_int64_t       offset;
4697         int             nsize, i, active_buckets, index, copycount;
4698
4699         ocmap = NULL;
4700         if (cmapp != NULL)
4701                 ocmap = *cmapp;
4702
4703         /*
4704          * Decide on the size of the new map.
4705          */
4706         if (ocmap == NULL) {
4707                 nsize = DRT_HASH_SMALL_MODULUS;
4708         } else {
4709                 /* count the number of active buckets in the old map */
4710                 active_buckets = 0;
4711                 for (i = 0; i < ocmap->scm_modulus; i++) {
4712                         if (!DRT_HASH_VACANT(ocmap, i) &&
4713                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4714                                 active_buckets++;
4715                 }
4716                 /*
4717                  * If we're currently using the small allocation, check to
4718                  * see whether we should grow to the large one.
4719                  */
4720                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4721                         /* if the ring is nearly full */
4722                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4723                                 nsize = DRT_HASH_LARGE_MODULUS;
4724                         } else {
4725                                 nsize = DRT_HASH_SMALL_MODULUS;
4726                         }
4727                 } else {
4728                         /* already using the large modulus */
4729                         nsize = DRT_HASH_LARGE_MODULUS;
4730                         /*
4731                          * If the ring is completely full, there's
4732                          * nothing useful for us to do.  Behave as
4733                          * though we had compacted into the new
4734                          * array and return.
4735                          */
4736                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4737                                 return(KERN_SUCCESS);
4738                 }
4739         }
4740
4741         /*
4742          * Allocate and initialise the new map.
4743          */
4744
4745         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4746             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4747         if (kret != KERN_SUCCESS)
4748                 return(kret);
4749         cmap->scm_magic = DRT_SCM_MAGIC;
4750         cmap->scm_modulus = nsize;
4751         cmap->scm_buckets = 0;
4752         cmap->scm_lastclean = 0;
4753         cmap->scm_iskips = 0;
4754         for (i = 0; i < cmap->scm_modulus; i++) {
4755                 DRT_HASH_CLEAR(cmap, i);
4756                 DRT_HASH_VACATE(cmap, i);
4757                 DRT_BITVECTOR_CLEAR(cmap, i);
4758         }
4759
4760         /*
4761          * If there's an old map, re-hash entries from it into the new map.
4762          */
4763         copycount = 0;
4764         if (ocmap != NULL) {
4765                 for (i = 0; i < ocmap->scm_modulus; i++) {
4766                         /* skip empty buckets */
4767                         if (DRT_HASH_VACANT(ocmap, i) ||
4768                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4769                                 continue;
4770                         /* get new index */
4771                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4772                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4773                         if (kret != KERN_SUCCESS) {
4774                                 /* XXX need to bail out gracefully here */
4775                                 panic("vfs_drt: new cluster map mysteriously too small");
4776                         }
4777                         /* copy */
4778                         DRT_HASH_COPY(ocmap, i, cmap, index);
4779                         copycount++;
4780                 }
4781         }
4782
4783         /* log what we've done */
4784         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4785
4786         /*
4787          * It's important to ensure that *cmapp always points to
4788          * a valid map, so we must overwrite it before freeing
4789          * the old map.
4790          */
4791         *cmapp = cmap;
4792         if (ocmap != NULL) {
4793                 /* emit stats into trace buffer */
4794                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4795                               ocmap->scm_modulus,
4796                               ocmap->scm_buckets,
4797                               ocmap->scm_lastclean,
4798                               ocmap->scm_iskips);
4799
4800                 vfs_drt_free_map(ocmap);
4801         }
4802         return(KERN_SUCCESS);
4803 }
4804
4805
4806 /*
4807  * Free a sparse cluster map.
4808  */
4809 static kern_return_t
4810 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4811 {
4812         kmem_free(kernel_map, (vm_offset_t)cmap,
4813                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4814         return(KERN_SUCCESS);
4815 }
4816
4817
4818 /*
4819  * Find the hashtable slot currently occupied by an entry for the supplied offset.
4820  */
4821 static kern_return_t
4822 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4823 {
4824         int             index, i;
4825
4826         offset = DRT_ALIGN_ADDRESS(offset);
4827         index = DRT_HASH(cmap, offset);
4828
4829         /* traverse the hashtable */
4830         for (i = 0; i < cmap->scm_modulus; i++) {
4831
4832                 /*
4833                  * If the slot is vacant, we can stop.
4834                  */
4835                 if (DRT_HASH_VACANT(cmap, index))
4836                         break;
4837
4838                 /*
4839                  * If the address matches our offset, we have success.
4840                  */
4841                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4842                         *indexp = index;
4843                         return(KERN_SUCCESS);
4844                 }
4845
4846                 /*
4847                  * Move to the next slot, try again.
4848                  */
4849                 index = DRT_HASH_NEXT(cmap, index);
4850         }
4851         /*
4852          * It's not there.
4853          */
4854         return(KERN_FAILURE);
4855 }
4856
4857 /*
4858  * Find the hashtable slot for the supplied offset.  If we haven't allocated
4859  * one yet, allocate one and populate the address field.  Note that it will
4860  * not have a nonzero page count and thus will still technically be free, so
4861  * in the case where we are called to clean pages, the slot will remain free.
4862  */
4863 static kern_return_t
4864 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4865 {
4866         struct vfs_drt_clustermap *cmap;
4867         kern_return_t   kret;
4868         int             index, i;
4869
4870         cmap = *cmapp;
4871
4872         /* look for an existing entry */
4873         kret = vfs_drt_search_index(cmap, offset, indexp);
4874         if (kret == KERN_SUCCESS)
4875                 return(kret);
4876
4877         /* need to allocate an entry */
4878         offset = DRT_ALIGN_ADDRESS(offset);
4879         index = DRT_HASH(cmap, offset);
4880
4881         /* scan from the index forwards looking for a vacant slot */
4882         for (i = 0; i < cmap->scm_modulus; i++) {
4883                 /* slot vacant? */
4884                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4885                         cmap->scm_buckets++;
4886                         if (index < cmap->scm_lastclean)
4887                                 cmap->scm_lastclean = index;
4888                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
4889                         DRT_HASH_SET_COUNT(cmap, index, 0);
4890                         DRT_BITVECTOR_CLEAR(cmap, index);
4891                         *indexp = index;
4892                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4893                         return(KERN_SUCCESS);
4894                 }
4895                 cmap->scm_iskips += i;
4896                 index = DRT_HASH_NEXT(cmap, index);
4897         }
4898
4899         /*
4900          * We haven't found a vacant slot, so the map is full.  If we're not
4901          * already recursed, try reallocating/compacting it.
4902          */
4903         if (recursed)
4904                 return(KERN_FAILURE);
4905         kret = vfs_drt_alloc_map(cmapp);
4906         if (kret == KERN_SUCCESS) {
4907                 /* now try to insert again */
4908                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4909         }
4910         return(kret);
4911 }
4912
4913 /*
4914  * Implementation of set dirty/clean.
4915  *
4916  * In the 'clean' case, not finding a map is OK.
4917  */
4918 static kern_return_t
4919 vfs_drt_do_mark_pages(
4920         void            **private,
4921         u_int64_t       offset,
4922         u_int           length,
4923         int             *setcountp,
4924         int             dirty)
4925 {
4926         struct vfs_drt_clustermap *cmap, **cmapp;
4927         kern_return_t   kret;
4928         int             i, index, pgoff, pgcount, setcount, ecount;
4929
4930         cmapp = (struct vfs_drt_clustermap **)private;
4931         cmap = *cmapp;
4932
4933         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4934
4935         if (setcountp != NULL)
4936                 *setcountp = 0;
4937
4938         /* allocate a cluster map if we don't already have one */
4939         if (cmap == NULL) {
4940                 /* no cluster map, nothing to clean */
4941                 if (!dirty) {
4942                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4943                         return(KERN_SUCCESS);
4944                 }
4945                 kret = vfs_drt_alloc_map(cmapp);
4946                 if (kret != KERN_SUCCESS) {
4947                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4948                         return(kret);
4949                 }
4950         }
4951         setcount = 0;
4952
4953         /*
4954          * Iterate over the length of the region.
4955          */
4956         while (length > 0) {
4957                 /*
4958                  * Get the hashtable index for this offset.
4959                  *
4960                  * XXX this will add blank entries if we are clearing a range
4961                  * that hasn't been dirtied.
4962                  */
4963                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4964                 cmap = *cmapp;  /* may have changed! */
4965                 /* this may be a partial-success return */
4966                 if (kret != KERN_SUCCESS) {
4967                         if (setcountp != NULL)
4968                                 *setcountp = setcount;
4969                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4970
4971                         return(kret);
4972                 }
4973
4974                 /*
4975                  * Work out how many pages we're modifying in this
4976                  * hashtable entry.
4977                  */
4978                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4979                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4980
4981                 /*
4982                  * Iterate over pages, dirty/clearing as we go.
4983                  */
4984                 ecount = DRT_HASH_GET_COUNT(cmap, index);
4985                 for (i = 0; i < pgcount; i++) {
4986                         if (dirty) {
4987                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4988                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4989                                         ecount++;
4990                                         setcount++;
4991                                 }
4992                         } else {
4993                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4994                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4995                                         ecount--;
4996                                         setcount++;
4997                                 }
4998                         }
4999                 }
5000                 DRT_HASH_SET_COUNT(cmap, index, ecount);
5001
5002                 offset += pgcount * PAGE_SIZE;
5003                 length -= pgcount * PAGE_SIZE;
5004         }
5005         if (setcountp != NULL)
5006                 *setcountp = setcount;
5007
5008         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5009
5010         return(KERN_SUCCESS);
5011 }
5012
5013 /*
5014  * Mark a set of pages as dirty/clean.
5015  *
5016  * This is a public interface.
5017  *
5018  * cmapp
5019  *      Pointer to storage suitable for holding a pointer.  Note that
5020  *      this must either be NULL or a value set by this function.
5021  *
5022  * size
5023  *      Current file size in bytes.
5024  *
5025  * offset
5026  *      Offset of the first page to be marked as dirty, in bytes.  Must be
5027  *      page-aligned.
5028  *
5029  * length
5030  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
5031  *
5032  * setcountp
5033  *      Number of pages newly marked dirty by this call (optional).
5034  *
5035  * Returns KERN_SUCCESS if all the pages were successfully marked.
5036  */
5037 static kern_return_t
5038 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5039 {
5040         /* XXX size unused, drop from interface */
5041         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5042 }
5043
5044 #if 0
5045 static kern_return_t
5046 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5047 {
5048         return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5049 }
5050 #endif
5051
5052 /*
5053  * Get a cluster of dirty pages.
5054  *
5055  * This is a public interface.
5056  *
5057  * cmapp
5058  *      Pointer to storage managed by drt_mark_pages.  Note that this must
5059  *      be NULL or a value set by drt_mark_pages.
5060  *
5061  * offsetp
5062  *      Returns the byte offset into the file of the first page in the cluster.
5063  *
5064  * lengthp
5065  *      Returns the length in bytes of the cluster of dirty pages.
5066  *
5067  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
5068  * are no dirty pages meeting the minmum size criteria.  Private storage will
5069  * be released if there are no more dirty pages left in the map
5070  *
5071  */
5072 static kern_return_t
5073 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5074 {
5075         struct vfs_drt_clustermap *cmap;
5076         u_int64_t       offset;
5077         u_int           length;
5078         int             index, i, j, fs, ls;
5079
5080         /* sanity */
5081         if ((cmapp == NULL) || (*cmapp == NULL))
5082                 return(KERN_FAILURE);
5083         cmap = *cmapp;
5084
5085         /* walk the hashtable */
5086         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5087                 index = DRT_HASH(cmap, offset);
5088
5089                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5090                         continue;
5091
5092                 /* scan the bitfield for a string of bits */
5093                 fs = -1;
5094
5095                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5096                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5097                                 fs = i;
5098                                 break;
5099                         }
5100                 }
5101                 if (fs == -1) {
5102                         /*  didn't find any bits set */
5103                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
5104                 }
5105                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5106                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
5107                                 break;
5108                 }
5109
5110                 /* compute offset and length, mark pages clean */
5111                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5112                 length = ls * PAGE_SIZE;
5113                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5114                 cmap->scm_lastclean = index;
5115
5116                 /* return successful */
5117                 *offsetp = (off_t)offset;
5118                 *lengthp = length;
5119
5120                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5121                 return(KERN_SUCCESS);
5122         }
5123         /*
5124          * We didn't find anything... hashtable is empty
5125          * emit stats into trace buffer and
5126          * then free it
5127          */
5128         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5129                       cmap->scm_modulus,
5130                       cmap->scm_buckets,
5131                       cmap->scm_lastclean,
5132                       cmap->scm_iskips);
5133
5134         vfs_drt_free_map(cmap);
5135         *cmapp = NULL;
5136
5137         return(KERN_FAILURE);
5138 }
5139
5140
5141 static kern_return_t
5142 vfs_drt_control(void **cmapp, int op_type)
5143 {
5144         struct vfs_drt_clustermap *cmap;
5145
5146         /* sanity */
5147         if ((cmapp == NULL) || (*cmapp == NULL))
5148                 return(KERN_FAILURE);
5149         cmap = *cmapp;
5150
5151         switch (op_type) {
5152         case 0:
5153                 /* emit stats into trace buffer */
5154                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5155                               cmap->scm_modulus,
5156                               cmap->scm_buckets,
5157                               cmap->scm_lastclean,
5158                               cmap->scm_iskips);
5159
5160                 vfs_drt_free_map(cmap);
5161                 *cmapp = NULL;
5162                 break;
5163
5164         case 1:
5165                 cmap->scm_lastclean = 0;
5166                 break;
5167         }
5168         return(KERN_SUCCESS);
5169 }
5170
5171
5172
5173 /*
5174  * Emit a summary of the state of the clustermap into the trace buffer
5175  * along with some caller-provided data.
5176  */
5177 #if KDEBUG
5178 static void
5179 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5180 {
5181         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5182 }
5183 #else
5184 static void
5185 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5186                           __unused int arg1, __unused int arg2, __unused int arg3,
5187                           __unused int arg4)
5188 {
5189 }
5190 #endif
5191
5192 #if 0
5193 /*
5194  * Perform basic sanity check on the hash entry summary count
5195  * vs. the actual bits set in the entry.
5196  */
5197 static void
5198 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5199 {
5200         int index, i;
5201         int bits_on;
5202
5203         for (index = 0; index < cmap->scm_modulus; index++) {
5204                 if (DRT_HASH_VACANT(cmap, index))
5205                         continue;
5206
5207                 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5208                         if (DRT_HASH_TEST_BIT(cmap, index, i))
5209                                 bits_on++;
5210                 }
5211                 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5212                         panic("bits_on = %d,  index = %d\n", bits_on, index);
5213         }
5214 }
5215 #endif