1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc_internal.h>
60 #include <sys/buf_internal.h>
61 #include <sys/mount_internal.h>
62 #include <sys/vnode_internal.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/time.h>
66 #include <sys/kernel.h>
67 #include <sys/resourcevar.h>
68 #include <sys/uio_internal.h>
69 #include <libkern/libkern.h>
70 #include <machine/machine_routines.h>
71
72 #include <sys/ubc_internal.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object_types.h>
76 #include <mach/vm_map.h>
77 #include <mach/upl.h>
78
79 #include <vm/vm_kern.h>
80 #include <vm/vm_map.h>
81 #include <vm/vm_pageout.h>
82
83 #include <sys/kdebug.h>
84
85
86
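/*
 * flag bits passed to cluster_io() in 'flags'... most map directly onto
 * the b_flags set on the component buffers (e.g. CL_READ -> B_READ,
 * CL_AGE -> B_AGE, CL_COMMIT -> B_COMMIT_UPL, CL_PRESERVE -> B_PHYS,
 * CL_KEEPCACHED -> B_CACHE); the rest steer hole handling, throttling
 * and the UPL commit/abort policy inside cluster_io() itself
 */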
87 #define CL_READ 0x01
88 #define CL_ASYNC 0x02
89 #define CL_COMMIT 0x04
90 #define CL_PAGEOUT 0x10
91 #define CL_AGE 0x20
92 #define CL_DUMP 0x40
93 #define CL_NOZERO 0x80
94 #define CL_PAGEIN 0x100
95 #define CL_DEV_MEMORY 0x200
96 #define CL_PRESERVE 0x400
97 #define CL_THROTTLE 0x800
98 #define CL_KEEPCACHED 0x1000
99
100
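/*
 * per-stream I/O state shared between the thread issuing a set of
 * asynchronous cluster I/Os and cluster_iodone()... updates and waits
 * on these fields are serialized by the global cl_mtxp mutex
 */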
101 struct clios {
102 u_int io_completed; /* amount of io that has currently completed */
103 u_int io_issued; /* amount of io that was successfully issued */
104 int io_error; /* error code of first error encountered */
105 int io_wanted; /* someone is sleeping waiting for a change in state */
106 };
107
108 static lck_grp_t *cl_mtx_grp;
109 static lck_attr_t *cl_mtx_attr;
110 static lck_grp_attr_t *cl_mtx_grp_attr;
111 static lck_mtx_t *cl_mtxp;
112
113
114 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
115 int flags, buf_t real_bp, struct clios *iostate);
116 static int cluster_iodone(buf_t bp, void *dummy);
117 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
118 static int cluster_hard_throttle_on(vnode_t vp);
119
120 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
121 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
122 off_t headOff, off_t tailOff, int flags);
123 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
124 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
125 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
126 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
127 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
128
129 static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
130
131 static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
132 static void cluster_push_EOF(vnode_t vp, off_t EOF);
133
134 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
135
136 static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
137 static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
138 static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
139
140 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
141 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
142 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
143
144 int is_file_clean(vnode_t, off_t);
145
146 /*
147 * throttle the number of async writes that
148 * can be outstanding on a single vnode
149 * before we issue a synchronous write
150 */
151 #define HARD_THROTTLE_MAXCNT 0
152 #define HARD_THROTTLE_MAXSIZE (64 * 1024)
153
154 int hard_throttle_on_root = 0;
155 struct timeval priority_IO_timestamp_for_root;
156
157
158 void
159 cluster_init(void) {
160 /*
161 * allocate lock group attribute and group
162 */
163 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
164 //lck_grp_attr_setstat(cl_mtx_grp_attr);
165 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
166
167 /*
168 * allocate the lock attribute
169 */
170 cl_mtx_attr = lck_attr_alloc_init();
171 //lck_attr_setdebug(cl_mtx_attr);
172
173 /*
174 * allocate and initialize mutex's used to protect updates and waits
175 * on the cluster_io context
176 */
177 cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
178
179 if (cl_mtxp == NULL)
180 panic("cluster_init: failed to allocate cl_mtxp");
181 }
182
183
184
185 #define CLW_ALLOCATE 0x01
186 #define CLW_RETURNLOCKED 0x02
187 /*
188 * if the read ahead context doesn't yet exist,
189 * allocate and initialize it...
190 * the vnode lock serializes multiple callers
191 * during the actual assignment... first one
192 * to grab the lock wins... the other callers
193 * will release the now unnecessary storage
194 *
195 * once the context is present, try to grab (but don't block on)
196 * the lock associated with it... if someone
197 * else currently owns it, then the read
198 * will run without read-ahead. this allows
199 * multiple readers to run in parallel and
200 * since there's only 1 read ahead context,
201 * there's no real loss in only allowing 1
202 * reader to have read-ahead enabled.
203 */
204 static struct cl_readahead *
205 cluster_get_rap(vnode_t vp)
206 {
207 struct ubc_info *ubc;
208 struct cl_readahead *rap;
209
210 ubc = vp->v_ubcinfo;
211
212 if ((rap = ubc->cl_rahead) == NULL) {
213 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
214
215 bzero(rap, sizeof *rap);
216 rap->cl_lastr = -1;
217 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
218
219 vnode_lock(vp);
220
221 if (ubc->cl_rahead == NULL)
222 ubc->cl_rahead = rap;
223 else {
224 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
225 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
226 rap = ubc->cl_rahead;
227 }
228 vnode_unlock(vp);
229 }
230 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
231 return(rap);
232
233 return ((struct cl_readahead *)NULL);
234 }
235
236
237 /*
238 * if the write behind context doesn't yet exist,
239 * and CLW_ALLOCATE is specified, allocate and initialize it...
240 * the vnode lock serializes multiple callers
241 * during the actual assignment... first one
242 * to grab the lock wins... the other callers
243 * will release the now unnecessary storage
244 *
245 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
246 * the lock associated with the write behind context before
247 * returning
248 */
249
250 static struct cl_writebehind *
251 cluster_get_wbp(vnode_t vp, int flags)
252 {
253 struct ubc_info *ubc;
254 struct cl_writebehind *wbp;
255
256 ubc = vp->v_ubcinfo;
257
258 if ((wbp = ubc->cl_wbehind) == NULL) {
259
260 if ( !(flags & CLW_ALLOCATE))
261 return ((struct cl_writebehind *)NULL);
262
263 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
264
265 bzero(wbp, sizeof *wbp);
266 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
267
268 vnode_lock(vp);
269
270 if (ubc->cl_wbehind == NULL)
271 ubc->cl_wbehind = wbp;
272 else {
273 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
274 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
275 wbp = ubc->cl_wbehind;
276 }
277 vnode_unlock(vp);
278 }
279 if (flags & CLW_RETURNLOCKED)
280 lck_mtx_lock(&wbp->cl_lockw);
281
282 return (wbp);
283 }
284
285
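/*
 * return non-zero if I/O against this vnode should be hard throttled...
 * this only applies to the root device, and only while hard_throttle_on_root
 * is set or a priority I/O to the root has completed within the last
 * 200 milliseconds (hard_throttle_maxelapsed)
 */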
286 static int
287 cluster_hard_throttle_on(vnode_t vp)
288 {
289 static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
290
291 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
292 struct timeval elapsed;
293
294 if (hard_throttle_on_root)
295 return(1);
296
297 microuptime(&elapsed);
298 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
299
300 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
301 return(1);
302 }
303 return(0);
304 }
305
306
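/*
 * I/O completion handler for the buffers issued by cluster_io()...
 * returns immediately until every buf in the transaction is B_DONE,
 * then rolls up the error and resid totals, zero-fills the tail of the
 * last page if 'b_validend' was set, wakes any thread sleeping on the
 * associated clios stream, biodone's the original buf (if any), and
 * commits or aborts the UPL when B_COMMIT_UPL was requested
 */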
307 static int
308 cluster_iodone(buf_t bp, __unused void *dummy)
309 {
310 int b_flags;
311 int error;
312 int total_size;
313 int total_resid;
314 int upl_offset;
315 int zero_offset;
316 upl_t upl;
317 buf_t cbp;
318 buf_t cbp_head;
319 buf_t cbp_next;
320 buf_t real_bp;
321 struct clios *iostate;
322 int commit_size;
323 int pg_offset;
324
325 cbp_head = (buf_t)(bp->b_trans_head);
326
327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
328 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
329
330 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
331 /*
332 * all I/O requests that are part of this transaction
333 * have to complete before we can process it
334 */
335 if ( !(cbp->b_flags & B_DONE)) {
336
337 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
338 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
339
340 return 0;
341 }
342 }
343 error = 0;
344 total_size = 0;
345 total_resid = 0;
346
347 cbp = cbp_head;
348 upl_offset = cbp->b_uploffset;
349 upl = cbp->b_upl;
350 b_flags = cbp->b_flags;
351 real_bp = cbp->b_real_bp;
352 zero_offset= cbp->b_validend;
353 iostate = (struct clios *)cbp->b_iostate;
354
355 if (real_bp)
356 real_bp->b_dev = cbp->b_dev;
357
358 while (cbp) {
359 if ((cbp->b_flags & B_ERROR) && error == 0)
360 error = cbp->b_error;
361
362 total_resid += cbp->b_resid;
363 total_size += cbp->b_bcount;
364
365 cbp_next = cbp->b_trans_next;
366
367 free_io_buf(cbp);
368
369 cbp = cbp_next;
370 }
371 if (zero_offset)
372 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
373
374 if (iostate) {
375 int need_wakeup = 0;
376
377 /*
378 * someone has issued multiple I/Os asynchronously
379 * and is waiting for them to complete (streaming)
380 */
381 lck_mtx_lock(cl_mtxp);
382
383 if (error && iostate->io_error == 0)
384 iostate->io_error = error;
385
386 iostate->io_completed += total_size;
387
388 if (iostate->io_wanted) {
389 /*
390 * someone is waiting for the state of
391 * this io stream to change
392 */
393 iostate->io_wanted = 0;
394 need_wakeup = 1;
395 }
396 lck_mtx_unlock(cl_mtxp);
397
398 if (need_wakeup)
399 wakeup((caddr_t)&iostate->io_wanted);
400 }
401 if ((b_flags & B_NEED_IODONE) && real_bp) {
402 if (error) {
403 real_bp->b_flags |= B_ERROR;
404 real_bp->b_error = error;
405 }
406 real_bp->b_resid = total_resid;
407
408 buf_biodone(real_bp);
409 }
410 if (error == 0 && total_resid)
411 error = EIO;
412
413 if (b_flags & B_COMMIT_UPL) {
414 pg_offset = upl_offset & PAGE_MASK;
415 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
416
417 if (error || (b_flags & B_NOCACHE)) {
418 int upl_abort_code;
419 int page_in = 0;
420 int page_out = 0;
421
422 if (b_flags & B_PAGEIO) {
423 if (b_flags & B_READ)
424 page_in = 1;
425 else
426 page_out = 1;
427 }
428 if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
429 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
430 else if (page_out && (error != ENXIO)) /* transient error */
431 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
432 else if (page_in)
433 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
434 else
435 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
436
437 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
438 upl_abort_code);
439
440 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
441 (int)upl, upl_offset - pg_offset, commit_size,
442 0x80000000|upl_abort_code, 0);
443
444 } else {
445 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
446
447 if ((b_flags & B_PHYS) && (b_flags & B_READ))
448 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
449
450 if (b_flags & B_AGE)
451 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
452
453 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
454 upl_commit_flags);
455
456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
457 (int)upl, upl_offset - pg_offset, commit_size,
458 upl_commit_flags, 0);
459 }
460 } else {
461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
462 (int)upl, upl_offset, 0, error, 0);
463 }
464
465 return (error);
466 }
467
468
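/*
 * zero 'size' bytes of the upl starting at 'upl_offset'... if the
 * original buf has a kernel mapping (b_datap), we can simply bzero
 * through it; otherwise we zero the UPL's physical pages directly
 * via bzero_phys
 */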
469 void
470 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
471 {
472 upl_page_info_t *pl;
473
474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
475 upl_offset, size, (int)bp, 0, 0);
476
477 if (bp == NULL || bp->b_datap == 0) {
478
479 pl = ubc_upl_pageinfo(upl);
480
481 while (size) {
482 int page_offset;
483 int page_index;
484 addr64_t zero_addr;
485 int zero_cnt;
486
487 page_index = upl_offset / PAGE_SIZE;
488 page_offset = upl_offset & PAGE_MASK;
489
490 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
491 zero_cnt = min(PAGE_SIZE - page_offset, size);
492
493 bzero_phys(zero_addr, zero_cnt);
494
495 size -= zero_cnt;
496 upl_offset += zero_cnt;
497 }
498 } else
499 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
500
501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
502 upl_offset, size, 0, 0, 0);
503 }
504
505
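/*
 * core I/O engine: maps the byte range onto the device with
 * VNOP_BLOCKMAP, carves it into chunks bounded by the mount's maximum
 * transfer size and segment count, and issues them as a chain of
 * buf_t's through VNOP_STRATEGY... handles holes (zero-fill on read,
 * push-or-fail on write), read/write throttling, and, on failure,
 * unwinds any buffers not yet issued and aborts or commits the UPL
 * as directed by the CL_* flags
 */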
506 static int
507 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
508 int flags, buf_t real_bp, struct clios *iostate)
509 {
510 buf_t cbp;
511 u_int size;
512 u_int io_size;
513 int io_flags;
514 int bmap_flags;
515 int error = 0;
516 int retval = 0;
517 buf_t cbp_head = NULL;
518 buf_t cbp_tail = NULL;
519 int trans_count = 0;
520 u_int pg_count;
521 int pg_offset;
522 u_int max_iosize;
523 u_int max_vectors;
524 int priv;
525 int zero_offset = 0;
526 int async_throttle = 0;
527 mount_t mp;
528
529 mp = vp->v_mount;
530
531 if (mp->mnt_devblocksize > 1) {
532 /*
533 * round the requested size up so that this I/O ends on a
534 * page boundary in case this is a 'write'... if the filesystem
535 * has blocks allocated to back the page beyond the EOF, we want to
536 * make sure to write out the zeros that are sitting beyond the EOF
537 * so that in case the filesystem doesn't explicitly zero this area
538 * if a hole is created via a lseek/write beyond the current EOF,
539 * it will return zeros when it's read back from the disk. If the
540 * physical allocation doesn't extend for the whole page, we'll
541 * only write/read from the disk up to the end of this allocation
542 * via the extent info returned from the VNOP_BLOCKMAP call.
543 */
544 pg_offset = upl_offset & PAGE_MASK;
545
546 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
547 } else {
548 /*
549 * anyone advertising a blocksize of 1 byte probably
550 * can't deal with us rounding up the request size
551 * AFP is one such filesystem/device
552 */
553 size = non_rounded_size;
554 }
555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
556 (int)f_offset, size, upl_offset, flags, 0);
557
558 if (flags & CL_READ) {
559 io_flags = (B_READ);
560 bmap_flags = VNODE_READ;
561
562 max_iosize = mp->mnt_maxreadcnt;
563 max_vectors = mp->mnt_segreadcnt;
564 } else {
565 io_flags = 0;
566 bmap_flags = VNODE_WRITE;
567
568 max_iosize = mp->mnt_maxwritecnt;
569 max_vectors = mp->mnt_segwritecnt;
570 }
571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
572
573 /*
574 * make sure the maximum iosize is a
575 * multiple of the page size
576 */
577 max_iosize &= ~PAGE_MASK;
578
579 if (flags & CL_THROTTLE) {
580 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
581 if (max_iosize > HARD_THROTTLE_MAXSIZE)
582 max_iosize = HARD_THROTTLE_MAXSIZE;
583 async_throttle = HARD_THROTTLE_MAXCNT;
584 } else
585 async_throttle = VNODE_ASYNC_THROTTLE;
586 }
587 if (flags & CL_AGE)
588 io_flags |= B_AGE;
589 if (flags & CL_DUMP)
590 io_flags |= B_NOCACHE;
591 if (flags & (CL_PAGEIN | CL_PAGEOUT))
592 io_flags |= B_PAGEIO;
593 if (flags & CL_COMMIT)
594 io_flags |= B_COMMIT_UPL;
595 if (flags & CL_PRESERVE)
596 io_flags |= B_PHYS;
597 if (flags & CL_KEEPCACHED)
598 io_flags |= B_CACHE;
599
600 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
601 /*
602 * then we are going to end up
603 * with a page that we can't complete (the file size wasn't a multiple
604 * of PAGE_SIZE and we're trying to read to the end of the file
605 * so we'll go ahead and zero out the portion of the page we can't
606 * read in from the file
607 */
608 zero_offset = upl_offset + non_rounded_size;
609 }
610 while (size) {
611 int pg_resid;
612 daddr64_t blkno;
613 daddr64_t lblkno;
614
615 if (size > max_iosize)
616 io_size = max_iosize;
617 else
618 io_size = size;
619
620 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
621 break;
622 }
623 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
624 real_bp->b_blkno = blkno;
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
627 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
628
629 if (io_size == 0) {
630 /*
631 * vnop_blockmap didn't return an error... however, it did
632 * return an extent size of 0 which means we can't
633 * make forward progress on this I/O... a hole in the
634 * file would be returned as a blkno of -1 with a non-zero io_size
635 * a real extent is returned with a blkno != -1 and a non-zero io_size
636 */
637 error = EINVAL;
638 break;
639 }
640 if ( !(flags & CL_READ) && blkno == -1) {
641 off_t e_offset;
642
643 /*
644 * we're writing into a 'hole'
645 */
646 if (flags & CL_PAGEOUT) {
647 /*
648 * if we got here via cluster_pageout
649 * then just error the request and return
650 * the 'hole' should already have been covered
651 */
652 error = EINVAL;
653 break;
654 }
655 if ( !(flags & CL_COMMIT)) {
656 /*
657 * currently writes always request the commit to happen
658 * as part of the io completion... however, if the CL_COMMIT
659 * flag isn't specified, then we can't issue the abort_range
660 * since the call site is going to abort or commit the same upl..
661 * in this case we can only return an error
662 */
663 error = EINVAL;
664 break;
665 }
666 /*
667 * we can get here if the cluster code happens to
668 * pick up a page that was dirtied via mmap vs
669 * a 'write' and the page targets a 'hole'...
670 * i.e. the writes to the cluster were sparse
671 * and the file was being written for the first time
672 *
673 * we can also get here if the filesystem supports
674 * 'holes' that are less than PAGE_SIZE.... because
675 * we can't know if the range in the page that covers
676 * the 'hole' has been dirtied via an mmap or not,
677 * we have to assume the worst and try to push the
678 * entire page to storage.
679 *
680 * Try paging out the page individually before
681 * giving up entirely and dumping it (the pageout
682 * path will ensure that the zero extent accounting
683 * has been taken care of before we get back into cluster_io)
684 */
685 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
686
687 e_offset = round_page_64(f_offset + 1);
688
689 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
690 error = EINVAL;
691 break;
692 }
693 io_size = e_offset - f_offset;
694
695 f_offset += io_size;
696 upl_offset += io_size;
697
698 if (size >= io_size)
699 size -= io_size;
700 else
701 size = 0;
702 /*
703 * keep track of how much of the original request
704 * that we've actually completed... non_rounded_size
705 * may go negative due to us rounding the request
706 * to a page size multiple (i.e. size > non_rounded_size)
707 */
708 non_rounded_size -= io_size;
709
710 if (non_rounded_size <= 0) {
711 /*
712 * we've transferred all of the data in the original
713 * request, but we were unable to complete the tail
714 * of the last page because the file didn't have
715 * an allocation to back that portion... this is ok.
716 */
717 size = 0;
718 }
719 continue;
720 }
721 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
722 /*
723 * we have now figured out how much I/O we can do - this is in 'io_size'
724 * pg_offset is the starting point in the first page for the I/O
725 * pg_count is the number of full and partial pages that 'io_size' encompasses
726 */
727 pg_offset = upl_offset & PAGE_MASK;
728
729 if (flags & CL_DEV_MEMORY) {
730 /*
731 * currently, can't deal with reading 'holes' in file
732 */
733 if (blkno == -1) {
734 error = EINVAL;
735 break;
736 }
737 /*
738 * treat physical requests as one 'giant' page
739 */
740 pg_count = 1;
741 } else
742 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
743
744 if ((flags & CL_READ) && blkno == -1) {
745 int bytes_to_zero;
746
747 /*
748 * if we're reading and blkno == -1, then we've got a
749 * 'hole' in the file that we need to deal with by zeroing
750 * out the affected area in the upl
751 */
752 if (zero_offset && io_size == size) {
753 /*
754 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
755 * then 'zero_offset' will be non-zero
756 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
757 * (indicated by the io_size finishing off the I/O request for this UPL)
758 * then we're not going to issue an I/O for the
759 * last page in this upl... we need to zero both the hole and the tail
760 * of the page beyond the EOF, since the delayed zero-fill won't kick in
761 */
762 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
763
764 zero_offset = 0;
765 } else
766 bytes_to_zero = io_size;
767
768 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
769
770 if (cbp_head)
771 /*
772 * if there is a current I/O chain pending
773 * then the first page of the group we just zero'd
774 * will be handled by the I/O completion if the zero
775 * fill started in the middle of the page
776 */
777 pg_count = (io_size - pg_offset) / PAGE_SIZE;
778 else {
779 /*
780 * no pending I/O to pick up that first page
781 * so, we have to make sure it gets committed
782 * here.
783 * set the pg_offset to 0 so that the upl_commit_range
784 * starts with this page
785 */
786 pg_count = (io_size + pg_offset) / PAGE_SIZE;
787 pg_offset = 0;
788 }
789 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
790 /*
791 * if we're done with the request for this UPL
792 * then we have to make sure to commit the last page
793 * even if we only partially zero-filled it
794 */
795 pg_count++;
796
797 if (pg_count) {
798 if (pg_offset)
799 pg_resid = PAGE_SIZE - pg_offset;
800 else
801 pg_resid = 0;
802
803 if (flags & CL_COMMIT)
804 ubc_upl_commit_range(upl,
805 (upl_offset + pg_resid) & ~PAGE_MASK,
806 pg_count * PAGE_SIZE,
807 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
808 }
809 upl_offset += io_size;
810 f_offset += io_size;
811 size -= io_size;
812 /*
813 * keep track of how much of the original request
814 * that we've actually completed... non_rounded_size
815 * may go negative due to us rounding the request
816 * to a page size multiple (i.e. size > non_rounded_size)
817 */
818 non_rounded_size -= io_size;
819
820 if (non_rounded_size <= 0) {
821 /*
822 * we've transferred all of the data in the original
823 * request, but we were unable to complete the tail
824 * of the last page because the file didn't have
825 * an allocation to back that portion... this is ok.
826 */
827 size = 0;
828 }
829 if (cbp_head && pg_count)
830 goto start_io;
831 continue;
832
833 }
834 if (pg_count > max_vectors) {
835 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
836 io_size = PAGE_SIZE - pg_offset;
837 pg_count = 1;
838 } else {
839 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
840 pg_count = max_vectors;
841 }
842 }
843
844 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
845 /*
846 * if we're not targeting a virtual device i.e. a disk image
847 * it's safe to dip into the reserve pool since real devices
848 * can complete this I/O request without requiring additional
849 * bufs from the alloc_io_buf pool
850 */
851 priv = 1;
852 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
853 /*
854 * Throttle the speculative IO
855 */
856 priv = 0;
857 else
858 priv = 1;
859
860 cbp = alloc_io_buf(vp, priv);
861
862 if (flags & CL_PAGEOUT) {
863 u_int i;
864
865 for (i = 0; i < pg_count; i++) {
866 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
867 panic("BUSY bp found in cluster_io");
868 }
869 }
870 if (flags & CL_ASYNC) {
871 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
872 panic("buf_setcallback failed\n");
873 }
874 cbp->b_flags |= io_flags;
875
876 cbp->b_lblkno = lblkno;
877 cbp->b_blkno = blkno;
878 cbp->b_bcount = io_size;
879
880 if (buf_setupl(cbp, upl, upl_offset))
881 panic("buf_setupl failed\n");
882
883 cbp->b_trans_next = (buf_t)NULL;
884
885 if ((cbp->b_iostate = (void *)iostate))
886 /*
887 * caller wants to track the state of this
888 * io... bump the amount issued against this stream
889 */
890 iostate->io_issued += io_size;
891
892 if (flags & CL_READ) {
893 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
894 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
895 }
896 else {
897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
898 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
899 }
900
901 if (cbp_head) {
902 cbp_tail->b_trans_next = cbp;
903 cbp_tail = cbp;
904 } else {
905 cbp_head = cbp;
906 cbp_tail = cbp;
907 }
908 (buf_t)(cbp->b_trans_head) = cbp_head;
909 trans_count++;
910
911 upl_offset += io_size;
912 f_offset += io_size;
913 size -= io_size;
914 /*
915 * keep track of how much of the original request
916 * that we've actually completed... non_rounded_size
917 * may go negative due to us rounding the request
918 * to a page size multiple (i.e. size > non_rounded_size)
919 */
920 non_rounded_size -= io_size;
921
922 if (non_rounded_size <= 0) {
923 /*
924 * we've transferred all of the data in the original
925 * request, but we were unable to complete the tail
926 * of the last page because the file didn't have
927 * an allocation to back that portion... this is ok.
928 */
929 size = 0;
930 }
931 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
932 /*
933 * if we have no more I/O to issue or
934 * the current I/O we've prepared fully
935 * completes the last page in this request
936 * and it's either an ASYNC request or
937 * we've already accumulated more than 8 I/O's into
938 * this transaction and it's not an I/O directed to
939 * special DEVICE memory
940 * then go ahead and issue the I/O
941 */
942 start_io:
943 if (real_bp) {
944 cbp_head->b_flags |= B_NEED_IODONE;
945 cbp_head->b_real_bp = real_bp;
946 } else
947 cbp_head->b_real_bp = (buf_t)NULL;
948
949 if (size == 0) {
950 /*
951 * we're about to issue the last I/O for this upl
952 * if this was a read to the eof and the eof doesn't
953 * finish on a page boundary, then we need to zero-fill
954 * the rest of the page....
955 */
956 cbp_head->b_validend = zero_offset;
957 } else
958 cbp_head->b_validend = 0;
959
960 if (flags & CL_THROTTLE)
961 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
962
963 for (cbp = cbp_head; cbp;) {
964 buf_t cbp_next;
965
966 if ( !(io_flags & B_READ))
967 vnode_startwrite(vp);
968
969 cbp_next = cbp->b_trans_next;
970
971 (void) VNOP_STRATEGY(cbp);
972 cbp = cbp_next;
973 }
974 if ( !(flags & CL_ASYNC)) {
975 int dummy;
976
977 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
978 buf_biowait(cbp);
979
980 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
981 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
982 error = 0; /* drop the error */
983 else {
984 if (retval == 0)
985 retval = error;
986 error = 0;
987 }
988 }
989 }
990 cbp_head = (buf_t)NULL;
991 cbp_tail = (buf_t)NULL;
992
993 trans_count = 0;
994 }
995 }
996 if (error) {
997 int abort_size;
998
999 io_size = 0;
1000
1001 for (cbp = cbp_head; cbp;) {
1002 buf_t cbp_next;
1003
1004 upl_offset -= cbp->b_bcount;
1005 size += cbp->b_bcount;
1006 io_size += cbp->b_bcount;
1007
1008 cbp_next = cbp->b_trans_next;
1009 free_io_buf(cbp);
1010 cbp = cbp_next;
1011 }
1012 if (iostate) {
1013 int need_wakeup = 0;
1014
1015 /*
1016 * update the error condition for this stream
1017 * since we never really issued the io
1018 * just go ahead and adjust it back
1019 */
1020 lck_mtx_lock(cl_mtxp);
1021
1022 if (iostate->io_error == 0)
1023 iostate->io_error = error;
1024 iostate->io_issued -= io_size;
1025
1026 if (iostate->io_wanted) {
1027 /*
1028 * someone is waiting for the state of
1029 * this io stream to change
1030 */
1031 iostate->io_wanted = 0;
1032 need_wakeup = 1;
1033 }
1034 lck_mtx_unlock(cl_mtxp);
1035
1036 if (need_wakeup)
1037 wakeup((caddr_t)&iostate->io_wanted);
1038 }
1039 pg_offset = upl_offset & PAGE_MASK;
1040 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1041
1042 if (flags & CL_COMMIT) {
1043 int upl_abort_code;
1044
1045 if (flags & CL_PRESERVE) {
1046 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1047 UPL_COMMIT_FREE_ON_EMPTY);
1048 } else {
1049 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1050 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1051 else if (flags & CL_PAGEIN)
1052 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1053 else
1054 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1055
1056 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1057 upl_abort_code);
1058 }
1059 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1060 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1061 }
1062 if (real_bp) {
1063 real_bp->b_flags |= B_ERROR;
1064 real_bp->b_error = error;
1065
1066 buf_biodone(real_bp);
1067 }
1068 if (retval == 0)
1069 retval = error;
1070 }
1071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1072 (int)f_offset, size, upl_offset, retval, 0);
1073
1074 return (retval);
1075 }
1076
1077
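/*
 * issue an advisory read of up to MAX_UPL_TRANSFER pages starting at
 * f_offset (clipped to the end of the file) and return the number of
 * pages asked for... returns 0 if f_offset is at or beyond the EOF
 */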
1078 static int
1079 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1080 {
1081 int pages_in_prefetch;
1082
1083 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1084 (int)f_offset, size, (int)filesize, 0, 0);
1085
1086 if (f_offset >= filesize) {
1087 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1088 (int)f_offset, 0, 0, 0, 0);
1089 return(0);
1090 }
1091 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1092 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1093 else
1094 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1095
1096 if ((off_t)size > (filesize - f_offset))
1097 size = filesize - f_offset;
1098 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1099
1100 advisory_read(vp, filesize, f_offset, size);
1101
1102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1103 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1104
1105 return (pages_in_prefetch);
1106 }
1107
1108
1109
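/*
 * sequential read-ahead heuristic: reset the read-ahead state when the
 * access pattern stops looking sequential, otherwise double cl_ralen
 * (up to MAX_UPL_TRANSFER pages) and prefetch past the highest page
 * already read ahead (cl_maxra)... skipped entirely if the next page
 * is already resident in the cache
 */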
1110 static void
1111 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1112 {
1113 daddr64_t r_addr;
1114 off_t f_offset;
1115 int size_of_prefetch;
1116
1117
1118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1119 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1120
1121 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1123 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1124 return;
1125 }
1126 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1127 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1128 rap->cl_ralen = 0;
1129 rap->cl_maxra = 0;
1130
1131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1132 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1133
1134 return;
1135 }
1136 if (extent->e_addr < rap->cl_maxra) {
1137 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1138
1139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1140 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1141 return;
1142 }
1143 }
1144 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1145 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1146
1147 size_of_prefetch = 0;
1148
1149 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1150
1151 if (size_of_prefetch) {
1152 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1153 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1154 return;
1155 }
1156 if (f_offset < filesize) {
1157 daddr64_t read_size;
1158
1159 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1160
1161 read_size = (extent->e_addr + 1) - extent->b_addr;
1162
1163 if (read_size > rap->cl_ralen) {
1164 if (read_size > MAX_UPL_TRANSFER)
1165 rap->cl_ralen = MAX_UPL_TRANSFER;
1166 else
1167 rap->cl_ralen = read_size;
1168 }
1169 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1170
1171 if (size_of_prefetch)
1172 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1173 }
1174 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1175 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1176 }
1177
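/*
 * cluster_pageout: push the dirty pages described by 'upl' out to the
 * backing store... typically called from a filesystem's VNOP_PAGEOUT
 * handler, roughly as follows (a sketch only -- the argument names
 * follow the vnop_pageout_args convention and the filesystem supplies
 * its own notion of the EOF):
 *
 *	return (cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	    ap->a_size, filesize, ap->a_flags));
 */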
1178 int
1179 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1180 int size, off_t filesize, int flags)
1181 {
1182 int io_size;
1183 int rounded_size;
1184 off_t max_size;
1185 int local_flags;
1186 struct cl_writebehind *wbp;
1187
1188 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1189 /*
1190 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1191 * then we don't want to enforce this throttle... if we do, we can
1192 * potentially deadlock since we're stalling the pageout thread at a time
1193 * when the disk image might need additional memory (which won't be available
1194 * if the pageout thread can't run)... instead we'll just depend on the throttle
1195 * that the pageout thread now has in place to deal with external files
1196 */
1197 local_flags = CL_PAGEOUT;
1198 else
1199 local_flags = CL_PAGEOUT | CL_THROTTLE;
1200
1201 if ((flags & UPL_IOSYNC) == 0)
1202 local_flags |= CL_ASYNC;
1203 if ((flags & UPL_NOCOMMIT) == 0)
1204 local_flags |= CL_COMMIT;
1205 if ((flags & UPL_KEEPCACHED))
1206 local_flags |= CL_KEEPCACHED;
1207
1208
1209 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1210 (int)f_offset, size, (int)filesize, local_flags, 0);
1211
1212 /*
1213 * If they didn't specify any I/O, then we are done...
1214 * we can't issue an abort because we don't know how
1215 * big the upl really is
1216 */
1217 if (size <= 0)
1218 return (EINVAL);
1219
1220 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1221 if (local_flags & CL_COMMIT)
1222 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1223 return (EROFS);
1224 }
1225 /*
1226 * can't page-out from a negative offset
1227 * or if we're starting beyond the EOF
1228 * or if the file offset isn't page aligned
1229 * or the size requested isn't a multiple of PAGE_SIZE
1230 */
1231 if (f_offset < 0 || f_offset >= filesize ||
1232 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1233 if (local_flags & CL_COMMIT)
1234 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1235 return (EINVAL);
1236 }
1237 max_size = filesize - f_offset;
1238
1239 if (size < max_size)
1240 io_size = size;
1241 else
1242 io_size = max_size;
1243
1244 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1245
1246 if (size > rounded_size) {
1247 if (local_flags & CL_COMMIT)
1248 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1249 UPL_ABORT_FREE_ON_EMPTY);
1250 }
1251 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1252 wbp->cl_hasbeenpaged = 1;
1253
1254 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1255 local_flags, (buf_t)NULL, (struct clios *)NULL));
1256 }
1257
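/*
 * cluster_pagein: bring the pages described by 'upl' in from the
 * backing store and, if the access pattern looks sequential, kick off
 * read-ahead... typically called from a filesystem's VNOP_PAGEIN
 * handler, roughly as follows (a sketch only -- the argument names
 * follow the vnop_pagein_args convention):
 *
 *	return (cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	    ap->a_size, filesize, ap->a_flags));
 */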
1258 int
1259 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1260 int size, off_t filesize, int flags)
1261 {
1262 u_int io_size;
1263 int rounded_size;
1264 off_t max_size;
1265 int retval;
1266 int local_flags = 0;
1267
1268 if (upl == NULL || size < 0)
1269 panic("cluster_pagein: NULL upl passed in");
1270
1271 if ((flags & UPL_IOSYNC) == 0)
1272 local_flags |= CL_ASYNC;
1273 if ((flags & UPL_NOCOMMIT) == 0)
1274 local_flags |= CL_COMMIT;
1275
1276
1277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1278 (int)f_offset, size, (int)filesize, local_flags, 0);
1279
1280 /*
1281 * can't page-in from a negative offset
1282 * or if we're starting beyond the EOF
1283 * or if the file offset isn't page aligned
1284 * or the size requested isn't a multiple of PAGE_SIZE
1285 */
1286 if (f_offset < 0 || f_offset >= filesize ||
1287 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1288 if (local_flags & CL_COMMIT)
1289 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1290 return (EINVAL);
1291 }
1292 max_size = filesize - f_offset;
1293
1294 if (size < max_size)
1295 io_size = size;
1296 else
1297 io_size = max_size;
1298
1299 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1300
1301 if (size > rounded_size && (local_flags & CL_COMMIT))
1302 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1303 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1304
1305 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1306 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1307
1308 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1309 struct cl_readahead *rap;
1310
1311 rap = cluster_get_rap(vp);
1312
1313 if (rap != NULL) {
1314 struct cl_extent extent;
1315
1316 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1317 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1318
1319 if (rounded_size == PAGE_SIZE) {
1320 /*
1321 * we haven't read the last page of the file in yet
1322 * so let's try to read ahead if we're in
1323 * a sequential access pattern
1324 */
1325 cluster_rd_ahead(vp, &extent, filesize, rap);
1326 }
1327 rap->cl_lastr = extent.e_addr;
1328
1329 lck_mtx_unlock(&rap->cl_lockr);
1330 }
1331 }
1332 return (retval);
1333 }
1334
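/*
 * issue an asynchronous cluster I/O for a traditional buf... the buf's
 * logical block number is translated to a file offset with
 * ubc_blktooff and its upl is passed straight through to cluster_io
 */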
1335 int
1336 cluster_bp(buf_t bp)
1337 {
1338 off_t f_offset;
1339 int flags;
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1342 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1343
1344 if (bp->b_flags & B_READ)
1345 flags = CL_ASYNC | CL_READ;
1346 else
1347 flags = CL_ASYNC;
1348
1349 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1350
1351 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1352 }
1353
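/*
 * top level write entry point: decides, per uio vector, whether to go
 * through the cache (cluster_write_x), use the physically contiguous
 * path (cluster_phys_write), or take the direct uncached path
 * (cluster_nocopy_write), based on IO_NOCACHE, the buffer type, and
 * the page alignment of both the file offset and the user address...
 * a filesystem's VNOP_WRITE handler would call it roughly as follows
 * (a sketch only -- the EOF and zero-fill arguments are filesystem policy):
 *
 *	retval = cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, ioflag);
 */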
1354 int
1355 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1356 {
1357 int prev_resid;
1358 u_int clip_size;
1359 off_t max_io_size;
1360 int upl_size;
1361 int upl_flags;
1362 upl_t upl;
1363 int retval = 0;
1364 int flags;
1365
1366 flags = xflags;
1367
1368 if (vp->v_flag & VNOCACHE_DATA)
1369 flags |= IO_NOCACHE;
1370
1371 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1372 /*
1373 * go do a write through the cache if one of the following is true....
1374 * NOCACHE is not true
1375 * there is no uio structure or it doesn't target USERSPACE
1376 */
1377 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1378 }
1379
1380 #if LP64_DEBUG
1381 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1382 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1383 }
1384 #endif /* LP64_DEBUG */
1385
1386 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1387 u_int64_t iov_len;
1388 u_int64_t iov_base;
1389
1390 /*
1391 * we know we have a resid, so this is safe
1392 * skip over any empty vectors
1393 */
1394 iov_len = uio_iov_len(uio);
1395
1396 while (iov_len == 0) {
1397 uio_next_iov(uio);
1398 uio->uio_iovcnt--;
1399 iov_len = uio_iov_len(uio);
1400 }
1401 iov_base = uio_iov_base(uio);
1402
1403 upl_size = PAGE_SIZE;
1404 upl_flags = UPL_QUERY_OBJECT_TYPE;
1405
1406 // LP64todo - fix this!
1407 if ((vm_map_get_upl(current_map(),
1408 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
1409 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1410 /*
1411 * the user app must have passed in an invalid address
1412 */
1413 return (EFAULT);
1414 }
1415
1416 /*
1417 * We check every vector target but if it is physically
1418 * contiguous space, we skip the sanity checks.
1419 */
1420 if (upl_flags & UPL_PHYS_CONTIG) {
1421 int zflags;
1422
1423 zflags = flags & ~IO_TAILZEROFILL;
1424 zflags |= IO_HEADZEROFILL;
1425
1426 if (flags & IO_HEADZEROFILL) {
1427 /*
1428 * in case we have additional vectors, we don't want to do this again
1429 */
1430 flags &= ~IO_HEADZEROFILL;
1431
1432 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1433 return(retval);
1434 }
1435 retval = cluster_phys_write(vp, uio, newEOF);
1436
1437 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1438 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1439 }
1440 }
1441 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1442 /*
1443 * we're here because we don't have a physically contiguous target buffer
1444 * go do a write through the cache if one of the following is true....
1445 * the total xfer size is less than a page...
1446 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1447 */
1448 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1449 }
1450 // LP64todo - fix this!
1451 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1452 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1453 /*
1454 * Bring the file offset up to a pagesize boundary for this write
1455 * this will also bring the base address to a page boundary
1456 * since they both are currently on the same offset within a page
1457 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1458 * so the computed clip_size must always be less than the current uio_resid
1459 */
1460 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1461
1462 /*
1463 * Fake the resid going into the cluster_write_x call
1464 * and restore it on the way out.
1465 */
1466 // LP64todo - fix this
1467 prev_resid = uio_resid(uio);
1468 uio_setresid(uio, clip_size);
1469
1470 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1471
1472 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1473 } else {
1474 /*
1475 * can't get both the file offset and the buffer offset aligned to a page boundary
1476 * so fire an I/O through the cache for this entire vector
1477 */
1478 // LP64todo - fix this
1479 clip_size = iov_len;
1480 // LP64todo - fix this
1481 prev_resid = uio_resid(uio);
1482 uio_setresid(uio, clip_size);
1483
1484 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1485
1486 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1487 }
1488 } else {
1489 /*
1490 * If we come in here, we know the offset into
1491 * the file is on a pagesize boundary and the
1492 * target buffer address is also on a page boundary
1493 */
1494 max_io_size = newEOF - uio->uio_offset;
1495 // LP64todo - fix this
1496 clip_size = uio_resid(uio);
1497 if (iov_len < clip_size)
1498 // LP64todo - fix this!
1499 clip_size = iov_len;
1500 if (max_io_size < clip_size)
1501 clip_size = max_io_size;
1502
1503 if (clip_size < PAGE_SIZE) {
1504 /*
1505 * Take care of tail end of write in this vector
1506 */
1507 // LP64todo - fix this
1508 prev_resid = uio_resid(uio);
1509 uio_setresid(uio, clip_size);
1510
1511 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1512
1513 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1514 } else {
1515 /* round clip_size down to a multiple of pagesize */
1516 clip_size = clip_size & ~(PAGE_MASK);
1517 // LP64todo - fix this
1518 prev_resid = uio_resid(uio);
1519 uio_setresid(uio, clip_size);
1520
1521 retval = cluster_nocopy_write(vp, uio, newEOF);
1522
1523 if ((retval == 0) && uio_resid(uio))
1524 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1525
1526 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1527 }
1528 } /* end else */
1529 } /* end while */
1530
1531 return(retval);
1532 }
1533
1534
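/*
 * direct (uncached) write path: wires the user buffer down with
 * vm_map_get_upl, dumps any pages already cached over the target range
 * (ubc_range_op/UPL_ROP_DUMP), and streams asynchronous cluster_io()
 * requests, never allowing more than 2 * MAX_UPL_TRANSFER * PAGE_SIZE
 * bytes to be outstanding at once... all I/O issued as part of this
 * stream is waited for before returning
 */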
1535 static int
1536 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1537 {
1538 upl_t upl;
1539 upl_page_info_t *pl;
1540 vm_offset_t upl_offset;
1541 int io_size;
1542 int io_flag;
1543 int upl_size;
1544 int upl_needed_size;
1545 int pages_in_pl;
1546 int upl_flags;
1547 kern_return_t kret;
1548 int i;
1549 int force_data_sync;
1550 int error = 0;
1551 struct clios iostate;
1552 struct cl_writebehind *wbp;
1553 struct iovec *iov;
1554
1555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1556 (int)uio->uio_offset, (int)uio_resid(uio),
1557 (int)newEOF, 0, 0);
1558
1559 /*
1560 * When we enter this routine, we know
1561 * -- the offset into the file is on a pagesize boundary
1562 * -- the resid is a page multiple
1563 * -- the resid will not exceed iov_len
1564 */
1565
1566 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1567
1568 cluster_try_push(wbp, vp, newEOF, 0, 1);
1569
1570 lck_mtx_unlock(&wbp->cl_lockw);
1571 }
1572 iostate.io_completed = 0;
1573 iostate.io_issued = 0;
1574 iostate.io_error = 0;
1575 iostate.io_wanted = 0;
1576
1577 iov = uio->uio_iov;
1578
1579 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1580 io_size = uio_resid(uio);
1581
1582 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1583 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1584
1585 // LP64todo - fix this!
1586 upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK;
1587
1588 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1589
1590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1591 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1592
1593 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1594 pages_in_pl = 0;
1595 upl_size = upl_needed_size;
1596 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1597 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1598
1599 // LP64todo - fix this!
1600 kret = vm_map_get_upl(current_map(),
1601 CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK,
1602 &upl_size,
1603 &upl,
1604 NULL,
1605 &pages_in_pl,
1606 &upl_flags,
1607 force_data_sync);
1608
1609 if (kret != KERN_SUCCESS) {
1610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1611 0, 0, 0, kret, 0);
1612 /*
1613 * cluster_nocopy_write: failed to get pagelist
1614 *
1615 * we may have already spun some portion of this request
1616 * off as async requests... we need to wait for the I/O
1617 * to complete before returning
1618 */
1619 goto wait_for_writes;
1620 }
1621 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1622 pages_in_pl = upl_size / PAGE_SIZE;
1623
1624 for (i = 0; i < pages_in_pl; i++) {
1625 if (!upl_valid_page(pl, i))
1626 break;
1627 }
1628 if (i == pages_in_pl)
1629 break;
1630
1631 /*
1632 * didn't get all the pages back that we
1633 * needed... release this upl and try again
1634 */
1635 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1636 UPL_ABORT_FREE_ON_EMPTY);
1637 }
1638 if (force_data_sync >= 3) {
1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1640 i, pages_in_pl, upl_size, kret, 0);
1641 /*
1642 * for some reason, we couldn't acquire a hold on all
1643 * the pages needed in the user's address space
1644 *
1645 * we may have already spun some portion of this request
1646 * off as async requests... we need to wait for the I/O
1647 * to complete before returning
1648 */
1649 goto wait_for_writes;
1650 }
1651
1652 /*
1653 * Consider the possibility that upl_size wasn't satisfied.
1654 */
1655 if (upl_size != upl_needed_size)
1656 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1657
1658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1659 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1660
1661 if (io_size == 0) {
1662 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1663 UPL_ABORT_FREE_ON_EMPTY);
1664 /*
1665 * we may have already spun some portion of this request
1666 * off as async requests... we need to wait for the I/O
1667 * to complete before returning
1668 */
1669 goto wait_for_writes;
1670 }
1671 /*
1672 * Now look for pages already in the cache
1673 * and throw them away.
1674 * uio->uio_offset is page aligned within the file
1675 * io_size is a multiple of PAGE_SIZE
1676 */
1677 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1678
1679 /*
1680 * we want to push out these writes asynchronously so that we can overlap
1681 * the preparation of the next I/O
1682 * if there are already too many outstanding writes
1683 * wait until some complete before issuing the next
1684 */
1685 lck_mtx_lock(cl_mtxp);
1686
1687 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1688 iostate.io_wanted = 1;
1689 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1690 }
1691 lck_mtx_unlock(cl_mtxp);
1692
1693 if (iostate.io_error) {
1694 /*
1695 * one of the earlier writes we issued ran into a hard error
1696 * don't issue any more writes, cleanup the UPL
1697 * that was just created but not used, then
1698 * go wait for all writes that are part of this stream
1699 * to complete before returning the error to the caller
1700 */
1701 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1702 UPL_ABORT_FREE_ON_EMPTY);
1703
1704 goto wait_for_writes;
1705 }
1706 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1707
1708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1709 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1710
1711 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1712 io_size, io_flag, (buf_t)NULL, &iostate);
1713
1714 iov->iov_len -= io_size;
1715 ((u_int32_t)iov->iov_base) += io_size;
1716 uio_setresid(uio, (uio_resid(uio) - io_size));
1717 uio->uio_offset += io_size;
1718
1719 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1720 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1721
1722 } /* end while */
1723
1724 wait_for_writes:
1725 /*
1726 * make sure all async writes issued as part of this stream
1727 * have completed before we return
1728 */
1729 lck_mtx_lock(cl_mtxp);
1730
1731 while (iostate.io_issued != iostate.io_completed) {
1732 iostate.io_wanted = 1;
1733 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1734 }
1735 lck_mtx_unlock(cl_mtxp);
1736
1737 if (iostate.io_error)
1738 error = iostate.io_error;
1739
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1741 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1742
1743 return (error);
1744 }
1745
1746
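/*
 * write from a physically contiguous user buffer: any head or tail
 * that isn't aligned to the device block size is handled by
 * cluster_align_phys_io, and the aligned middle is issued as a single
 * synchronous CL_DEV_MEMORY cluster_io()
 */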
1747 static int
1748 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1749 {
1750 upl_page_info_t *pl;
1751 addr64_t src_paddr;
1752 upl_t upl;
1753 vm_offset_t upl_offset;
1754 int tail_size;
1755 int io_size;
1756 int upl_size;
1757 int upl_needed_size;
1758 int pages_in_pl;
1759 int upl_flags;
1760 kern_return_t kret;
1761 int error = 0;
1762 u_int64_t iov_base;
1763 int devblocksize;
1764 struct cl_writebehind *wbp;
1765
1766 devblocksize = vp->v_mount->mnt_devblocksize;
1767 /*
1768 * When we enter this routine, we know
1769 * -- the resid will not exceed iov_len
1770 * -- the vector target address is physically contiguous
1771 */
1772 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1773
1774 cluster_try_push(wbp, vp, newEOF, 0, 1);
1775
1776 lck_mtx_unlock(&wbp->cl_lockw);
1777 }
1778 #if LP64_DEBUG
1779 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1780 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1781 }
1782 #endif /* LP64_DEBUG */
1783
1784 // LP64todo - fix this!
1785 io_size = uio_iov_len(uio);
1786 iov_base = uio_iov_base(uio);
1787 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1788 upl_needed_size = upl_offset + io_size;
1789
1790 pages_in_pl = 0;
1791 upl_size = upl_needed_size;
1792 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1793 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1794
1795 // LP64todo - fix this!
1796 kret = vm_map_get_upl(current_map(),
1797 CAST_DOWN(upl_offset_t, iov_base) & ~PAGE_MASK,
1798 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1799
1800 if (kret != KERN_SUCCESS) {
1801 /*
1802 * cluster_phys_write: failed to get pagelist
1803 * note: return kret here
1804 */
1805 return(EINVAL);
1806 }
1807 /*
1808 * Consider the possibility that upl_size wasn't satisfied.
1809 * This is a failure in the physical memory case.
1810 */
1811 if (upl_size < upl_needed_size) {
1812 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1813 return(EINVAL);
1814 }
1815 pl = ubc_upl_pageinfo(upl);
1816
1817 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK));
1818
1819 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1820 int head_size;
1821
1822 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1823
1824 if (head_size > io_size)
1825 head_size = io_size;
1826
1827 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1828
1829 if (error) {
1830 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1831
1832 return(EINVAL);
1833 }
1834 upl_offset += head_size;
1835 src_paddr += head_size;
1836 io_size -= head_size;
1837 }
1838 tail_size = io_size & (devblocksize - 1);
1839 io_size -= tail_size;
1840
1841 if (io_size) {
1842 /*
1843 * issue a synchronous write to cluster_io
1844 */
1845 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1846 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1847 }
1848 if (error == 0) {
1849 /*
1850 * The cluster_io write completed successfully,
1851 * update the uio structure
1852 */
1853 uio_setresid(uio, (uio_resid(uio) - io_size));
1854 uio_iov_len_add(uio, -io_size);
1855 uio_iov_base_add(uio, io_size);
1856 uio->uio_offset += io_size;
1857 src_paddr += io_size;
1858
1859 if (tail_size)
1860 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1861 }
1862 /*
1863 * just release our hold on the physically contiguous
1864 * region without changing any state
1865 */
1866 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1867
1868 return (error);
1869 }
1870
1871
1872 static int
1873 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1874 {
1875 upl_page_info_t *pl;
1876 upl_t upl;
1877 vm_offset_t upl_offset = 0;
1878 int upl_size;
1879 off_t upl_f_offset;
1880 int pages_in_upl;
1881 int start_offset;
1882 int xfer_resid;
1883 int io_size;
1884 int io_offset;
1885 int bytes_to_zero;
1886 int bytes_to_move;
1887 kern_return_t kret;
1888 int retval = 0;
1889 int io_resid;
1890 long long total_size;
1891 long long zero_cnt;
1892 off_t zero_off;
1893 long long zero_cnt1;
1894 off_t zero_off1;
1895 struct cl_extent cl;
1896 int intersection;
1897 struct cl_writebehind *wbp;
1898
1899 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1900 {
1901 if (wbp->cl_hasbeenpaged) {
1902 /*
1903 * this vnode had pages cleaned to it by
1904 * the pager which indicates that either
1905 * it's not very 'hot', or the system is
1906 * being overwhelmed by a lot of dirty
1907 * data being delayed in the VM cache...
1908 * in either event, we'll push our remaining
1909 * delayed data at this point... this will
1910 * be more efficient than paging out 1 page at
1911 * a time, and will also act as a throttle
1912 * by delaying this client from writing any
1913 * more data until all its delayed data has
1914 * at least been queued to the underlying driver.
1915 */
1916 if (wbp->cl_number || wbp->cl_scmap)
1917 cluster_push_EOF(vp, newEOF);
1918
1919 wbp->cl_hasbeenpaged = 0;
1920 }
1921 }
1922 if (uio) {
1923 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1924 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1925
1926 // LP64todo - fix this
1927 io_resid = uio_resid(uio);
1928 } else {
1929 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1930 0, 0, (int)oldEOF, (int)newEOF, 0);
1931
1932 io_resid = 0;
1933 }
1934 zero_cnt = 0;
1935 zero_cnt1 = 0;
1936 zero_off = 0;
1937 zero_off1 = 0;
1938
1939 if (flags & IO_HEADZEROFILL) {
1940 /*
1941 * some filesystems (HFS is one) don't support unallocated holes within a file...
1942 * so we zero fill the intervening space between the old EOF and the offset
1943 * where the next chunk of real data begins.... ftruncate will also use this
1944 * routine to zero fill to the new EOF when growing a file... in this case, the
1945 * uio structure will not be provided
1946 */
1947 if (uio) {
1948 if (headOff < uio->uio_offset) {
1949 zero_cnt = uio->uio_offset - headOff;
1950 zero_off = headOff;
1951 }
1952 } else if (headOff < newEOF) {
1953 zero_cnt = newEOF - headOff;
1954 zero_off = headOff;
1955 }
1956 }
1957 if (flags & IO_TAILZEROFILL) {
1958 if (uio) {
1959 // LP64todo - fix this
1960 zero_off1 = uio->uio_offset + uio_resid(uio);
1961
1962 if (zero_off1 < tailOff)
1963 zero_cnt1 = tailOff - zero_off1;
1964 }
1965 }
1966 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1967 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1968 retval, 0, 0, 0, 0);
1969 return (0);
1970 }
1971
1972 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1973 /*
1974 * for this iteration of the loop, figure out where our starting point is
1975 */
1976 if (zero_cnt) {
1977 start_offset = (int)(zero_off & PAGE_MASK_64);
1978 upl_f_offset = zero_off - start_offset;
1979 } else if (io_resid) {
1980 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1981 upl_f_offset = uio->uio_offset - start_offset;
1982 } else {
1983 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1984 upl_f_offset = zero_off1 - start_offset;
1985 }
1986 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1987 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1988
1989 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1990 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1991
1992 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1993
1994 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1995 /*
1996 * assumption... total_size <= io_resid
1997 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1998 */
1999 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
2000 total_size -= start_offset;
2001 xfer_resid = total_size;
2002
2003 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2004
2005 if (retval)
2006 break;
2007
2008 io_resid -= (total_size - xfer_resid);
2009 total_size = xfer_resid;
2010 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2011 upl_f_offset = uio->uio_offset - start_offset;
2012
2013 if (total_size == 0) {
2014 if (start_offset) {
2015 /*
2016 * the write did not finish on a page boundary
2017 * which will leave upl_f_offset pointing to the
2018 * beginning of the last page written instead of
2019 * the page beyond it... bump it in this case
2020 * so that the cluster code records the last page
2021 * written as dirty
2022 */
2023 upl_f_offset += PAGE_SIZE_64;
2024 }
2025 upl_size = 0;
2026
2027 goto check_cluster;
2028 }
2029 }
2030 /*
2031 * compute the size of the upl needed to encompass
2032 * the requested write... limit each call to cluster_io
2033 * to the maximum UPL size... cluster_io will clip if
2034 * this exceeds the maximum io_size for the device,
2035 * make sure to account for
2036 * a starting offset that's not page aligned
2037 */
2038 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2039
2040 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2041 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2042
2043 pages_in_upl = upl_size / PAGE_SIZE;
2044 io_size = upl_size - start_offset;
2045
2046 if ((long long)io_size > total_size)
2047 io_size = total_size;
2048
2049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2050
2051
2052 /*
2053 * Gather the pages from the buffer cache.
2054 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2055 * that we intend to modify these pages.
2056 */
2057 kret = ubc_create_upl(vp,
2058 upl_f_offset,
2059 upl_size,
2060 &upl,
2061 &pl,
2062 UPL_SET_LITE | UPL_WILL_MODIFY);
2063 if (kret != KERN_SUCCESS)
2064 panic("cluster_write: failed to get pagelist");
2065
2066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2067 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2068
2069 if (start_offset && !upl_valid_page(pl, 0)) {
2070 int read_size;
2071
2072 /*
2073 * we're starting in the middle of the first page of the upl
2074 * and the page isn't currently valid, so we're going to have
2075 * to read it in first... this is a synchronous operation
2076 */
2077 read_size = PAGE_SIZE;
2078
2079 if ((upl_f_offset + read_size) > newEOF)
2080 read_size = newEOF - upl_f_offset;
2081
2082 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2083 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2084 if (retval) {
2085 /*
2086 * we had an error during the read which causes us to abort
2087 * the current cluster_write request... before we do, we need
2088 * to release the rest of the pages in the upl without modifying
2089 * their state and mark the failed page in error
2090 */
2091 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2092
2093 if (upl_size > PAGE_SIZE)
2094 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2095
2096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2097 (int)upl, 0, 0, retval, 0);
2098 break;
2099 }
2100 }
2101 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2102 /*
2103 * the last offset we're writing to in this upl does not end on a page
2104 * boundary... if it's not beyond the old EOF, then we'll also need to
2105 * pre-read this page in if it isn't already valid
2106 */
2107 upl_offset = upl_size - PAGE_SIZE;
2108
2109 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2110 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2111 int read_size;
2112
2113 read_size = PAGE_SIZE;
2114
2115 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2116 read_size = newEOF - (upl_f_offset + upl_offset);
2117
2118 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2119 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2120 if (retval) {
2121 /*
2122 * we had an error during the read which causes us to abort
2123 * the current cluster_write request... before we do, we
2124 * need to release the rest of the pages in the upl without
2125 * modifying their state and mark the failed page in error
2126 */
2127 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2128
2129 if (upl_size > PAGE_SIZE)
2130 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2131
2132 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2133 (int)upl, 0, 0, retval, 0);
2134 break;
2135 }
2136 }
2137 }
2138 xfer_resid = io_size;
2139 io_offset = start_offset;
2140
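/*
 * zero fill as much of the head region (zero_cnt bytes starting at
 * zero_off) as falls within this upl... with IO_NOZEROVALID, pages
 * that are already valid are skipped; with IO_NOZERODIRTY alone,
 * only pages that are already dirty are skipped
 */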
2141 while (zero_cnt && xfer_resid) {
2142
2143 if (zero_cnt < (long long)xfer_resid)
2144 bytes_to_zero = zero_cnt;
2145 else
2146 bytes_to_zero = xfer_resid;
2147
2148 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2149 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2150 } else {
2151 int zero_pg_index;
2152
2153 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2154 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2155
2156 if ( !upl_valid_page(pl, zero_pg_index)) {
2157 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2158
2159 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2160 !upl_dirty_page(pl, zero_pg_index)) {
2161 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2162 }
2163 }
2164 xfer_resid -= bytes_to_zero;
2165 zero_cnt -= bytes_to_zero;
2166 zero_off += bytes_to_zero;
2167 io_offset += bytes_to_zero;
2168 }
2169 if (xfer_resid && io_resid) {
2170 bytes_to_move = min(io_resid, xfer_resid);
2171
2172 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2173
2174 if (retval) {
2175
2176 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2177
2178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2179 (int)upl, 0, 0, retval, 0);
2180 } else {
2181 io_resid -= bytes_to_move;
2182 xfer_resid -= bytes_to_move;
2183 io_offset += bytes_to_move;
2184 }
2185 }
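/*
 * now zero fill any tail region (zero_cnt1 bytes starting at
 * zero_off1) that falls within this upl, applying the same
 * IO_NOZEROVALID / IO_NOZERODIRTY rules as the head zero fill above
 */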
2186 while (xfer_resid && zero_cnt1 && retval == 0) {
2187
2188 if (zero_cnt1 < (long long)xfer_resid)
2189 bytes_to_zero = zero_cnt1;
2190 else
2191 bytes_to_zero = xfer_resid;
2192
2193 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2194 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2195 } else {
2196 int zero_pg_index;
2197
2198 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2199 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2200
2201 if ( !upl_valid_page(pl, zero_pg_index)) {
2202 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2203 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2204 !upl_dirty_page(pl, zero_pg_index)) {
2205 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2206 }
2207 }
2208 xfer_resid -= bytes_to_zero;
2209 zero_cnt1 -= bytes_to_zero;
2210 zero_off1 += bytes_to_zero;
2211 io_offset += bytes_to_zero;
2212 }
2213
2214 if (retval == 0) {
2215 int cl_index;
2216 int can_delay;
2217
2218 io_size += start_offset;
2219
2220 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2221 /*
2222 * if we're extending the file with this write
2223 * we'll zero fill the rest of the page so that
2224 * if the file gets extended again in such a way as to leave a
2225 * hole starting at this EOF, we'll have zeros in the correct spot
2226 */
2227 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2228 }
2229 if (flags & IO_SYNC)
2230 /*
2231 * if the IO_SYNC flag is set then we need to
2232 * bypass any clusters and immediately issue
2233 * the I/O
2234 */
2235 goto issue_io;
2236 check_cluster:
2237 /*
2238 * take the lock to protect our accesses
2239 * of the writebehind and sparse cluster state
2240 */
2241 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2242
2243 /*
2244 * calculate the last logical block number
2245 * that this delayed I/O encompassed
2246 */
2247 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2248
2249 if (wbp->cl_scmap) {
2250
2251 if ( !(flags & IO_NOCACHE)) {
2252 /*
2253 * we've fallen into the sparse
2254 * cluster method of delaying dirty pages
2255 * first, we need to release the upl if we hold one
2256 * since pages in it may be present in the sparse cluster map
2257 * and may span 2 separate buckets there... if they do and
2258 * we happen to have to flush a bucket to make room and it intersects
2259 * this upl, a deadlock may result on page BUSY
2260 */
2261 if (upl_size)
2262 ubc_upl_commit_range(upl, 0, upl_size,
2263 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2264
2265 sparse_cluster_add(wbp, vp, &cl, newEOF);
2266
2267 lck_mtx_unlock(&wbp->cl_lockw);
2268
2269 continue;
2270 }
2271 /*
2272 * must have done cached writes that fell into
2273 * the sparse cluster mechanism... we've switched
2274 * to uncached writes on the file, so go ahead
2275 * and push whatever's in the sparse map
2276 * and switch back to normal clustering
2277 *
2278 * see the comment above concerning a possible deadlock...
2279 */
2280 if (upl_size) {
2281 ubc_upl_commit_range(upl, 0, upl_size,
2282 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2283 /*
2284 * setting upl_size to 0 keeps us from committing a
2285 * second time in the start_new_cluster path
2286 */
2287 upl_size = 0;
2288 }
2289 sparse_cluster_push(wbp, vp, newEOF, 1);
2290
2291 wbp->cl_number = 0;
2292 /*
2293 * no clusters of either type present at this point
2294 * so just go directly to start_new_cluster since
2295 * we know we need to delay this I/O since we've
2296 * already released the pages back into the cache
2297 * to avoid the deadlock with sparse_cluster_push
2298 */
2299 goto start_new_cluster;
2300 }
2301 upl_offset = 0;
2302
2303 if (wbp->cl_number == 0)
2304 /*
2305 * no clusters currently present
2306 */
2307 goto start_new_cluster;
2308
2309 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2310 /*
2311 * check each cluster that we currently hold
2312 * try to merge some or all of this write into
2313 * one or more of the existing clusters... if
2314 * any portion of the write remains, start a
2315 * new cluster
2316 */
2317 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2318 /*
2319 * the current write starts at or after the current cluster
2320 */
2321 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2322 /*
2323 * we have a write that fits entirely
2324 * within the existing cluster limits
2325 */
2326 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2327 /*
2328 * update our idea of where the cluster ends
2329 */
2330 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2331 break;
2332 }
2333 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2334 /*
2335 * we have a write that starts in the middle of the current cluster
2336 * but extends beyond the cluster's limit... we know this because
2337 * of the previous checks
2338 * we'll extend the current cluster to the max
2339 * and update the b_addr for the current write to reflect that
2340 * the head of it was absorbed into this cluster...
2341 * note that we'll always have a leftover tail in this case since
2342 * full absorption would have occurred in the clause above
2343 */
2344 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2345
2346 if (upl_size) {
2347 daddr64_t start_pg_in_upl;
2348
2349 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2350
2351 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2352 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2353
2354 ubc_upl_commit_range(upl, upl_offset, intersection,
2355 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2356 upl_f_offset += intersection;
2357 upl_offset += intersection;
2358 upl_size -= intersection;
2359 }
2360 }
2361 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2362 }
2363 /*
2364 * we come here for the case where the current write starts
2365 * beyond the limit of the existing cluster or we have a leftover
2366 * tail after a partial absorption
2367 *
2368 * in either case, we'll check the remaining clusters before
2369 * starting a new one
2370 */
2371 } else {
2372 /*
2373 * the current write starts in front of the cluster we're currently considering
2374 */
2375 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2376 /*
2377 * we can just merge the new request into
2378 * this cluster and leave it in the cache
2379 * since the resulting cluster is still
2380 * less than the maximum allowable size
2381 */
2382 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2383
2384 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2385 /*
2386 * the current write completely
2387 * envelops the existing cluster and since
2388 * each write is limited to at most MAX_UPL_TRANSFER bytes
2389 * we can just use the start and last blocknos of the write
2390 * to generate the cluster limits
2391 */
2392 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2393 }
2394 break;
2395 }
2396
2397 /*
2398 * if we were to combine this write with the current cluster
2399 * we would exceed the cluster size limit.... so,
2400 * let's see if there's any overlap of the new I/O with
2401 * the cluster we're currently considering... in fact, we'll
2402 * stretch the cluster out to its full limit and see if we
2403 * get an intersection with the current write
2404 *
2405 */
2406 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2407 /*
2408 * the current write extends into the proposed cluster
2409 * clip the length of the current write after first combining its
2410 * tail with the newly shaped cluster
2411 */
2412 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2413
2414 if (upl_size) {
2415 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2416
2417 if (intersection > upl_size)
2418 /*
2419 * because the current write may consist of a number of pages found in the cache
2420 * which are not part of the UPL, we may have an intersection that exceeds
2421 * the size of the UPL that is also part of this write
2422 */
2423 intersection = upl_size;
2424
2425 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2426 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2427 upl_size -= intersection;
2428 }
2429 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2430 }
2431 /*
2432 * if we get here, there was no way to merge
2433 * any portion of this write with this cluster
2434 * or we could only merge part of it which
2435 * will leave a tail...
2436 * we'll check the remaining clusters before starting a new one
2437 */
2438 }
2439 }
2440 if (cl_index < wbp->cl_number)
2441 /*
2442 * we found an existing cluster(s) that we
2443 * could entirely merge this I/O into
2444 */
2445 goto delay_io;
2446
2447 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2448 /*
2449 * we didn't find an existing cluster to
2450 * merge into, but there's room to start
2451 * a new one
2452 */
2453 goto start_new_cluster;
2454
2455 /*
2456 * no existing cluster to merge with and no
2457 * room to start a new one... we'll try
2458 * pushing one of the existing ones... if none of
2459 * them are able to be pushed, we'll switch
2460 * to the sparse cluster mechanism
2461 * cluster_try_push updates cl_number to the
2462 * number of remaining clusters... and
2463 * returns the number of currently unused clusters
2464 */
2465 int ret_cluster_try_push = 0;
2466 /* if writes are not deferred, call cluster push immediately */
2467 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2468 if (flags & IO_NOCACHE)
2469 can_delay = 0;
2470 else
2471 can_delay = 1;
2472
2473 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2474 }
2475
2476 /* execute the following regardless of whether writes are deferred or not */
2477 if (ret_cluster_try_push == 0) {
2478 /*
2479 * no more room in the normal cluster mechanism
2480 * so let's switch to the more expansive but expensive
2481 * sparse mechanism....
2482 * first, we need to release the upl if we hold one
2483 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2484 * and may span 2 separate buckets there... if they do and
2485 * we happen to have to flush a bucket to make room and it intersects
2486 * this upl, a deadlock may result on page BUSY
2487 */
2488 if (upl_size)
2489 ubc_upl_commit_range(upl, upl_offset, upl_size,
2490 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2491
2492 sparse_cluster_switch(wbp, vp, newEOF);
2493 sparse_cluster_add(wbp, vp, &cl, newEOF);
2494
2495 lck_mtx_unlock(&wbp->cl_lockw);
2496
2497 continue;
2498 }
2499 /*
2500 * we pushed one cluster successfully, so we must be sequentially writing this file
2501 * otherwise, we would have failed and fallen into the sparse cluster support
2502 * so let's take the opportunity to push out additional clusters as long as we
2503 * remain below the throttle... this will give us better I/O locality if we're
2504 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
2505 * however, we don't want to push so much out that the write throttle kicks in and
2506 * hangs this thread up until some of the I/O completes...
2507 */
2508 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2509 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2510 cluster_try_push(wbp, vp, newEOF, 0, 0);
2511 }
2512
2513 start_new_cluster:
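/*
 * record this write as a brand new delayed cluster... io_nocache
 * remembers whether it came from an IO_NOCACHE write (presumably so
 * the push path can flush it with the matching cache behavior)
 */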
2514 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2515 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2516
2517 if (flags & IO_NOCACHE)
2518 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2519 else
2520 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2521 wbp->cl_number++;
2522 delay_io:
2523 if (upl_size)
2524 ubc_upl_commit_range(upl, upl_offset, upl_size,
2525 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2526
2527 lck_mtx_unlock(&wbp->cl_lockw);
2528
2529 continue;
2530 issue_io:
2531 /*
2532 * we don't hold the vnode lock at this point
2533 *
2534 * because we had to ask for a UPL that provides currently non-present pages, the
2535 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2536 * upon committing it... this is not the behavior we want since it's possible for
2537 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2538 * in order to maintain some semblance of coherency with mapped writes
2539 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2540 * so that we correctly deal with a change in state of the hardware modify bit...
2541 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2542 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2543 * responsible for generating the correct sized I/O(s)
2544 */
2545 ubc_upl_commit_range(upl, 0, upl_size,
2546 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2547
2548 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2549
2550 retval = cluster_push_x(vp, &cl, newEOF, flags);
2551 }
2552 }
2553 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2554 retval, 0, io_resid, 0, 0);
2555
2556 return (retval);
2557 }
2558
2559 int
2560 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2561 {
2562 int prev_resid;
2563 u_int clip_size;
2564 off_t max_io_size;
2565 int upl_size;
2566 int upl_flags;
2567 upl_t upl;
2568 int retval = 0;
2569 int flags;
2570
2571 flags = xflags;
2572
2573 if (vp->v_flag & VNOCACHE_DATA)
2574 flags |= IO_NOCACHE;
2575 if (vp->v_flag & VRAOFF)
2576 flags |= IO_RAOFF;
2577
2578 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2579 /*
2580 * go do a read through the cache if one of the following is true....
2581 * NOCACHE is not true
2582 * the uio request doesn't target USERSPACE
2583 */
2584 return (cluster_read_x(vp, uio, filesize, flags));
2585 }
2586
2587 #if LP64_DEBUG
2588 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2589 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2590 }
2591 #endif /* LP64_DEBUG */
2592
2593 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2594 u_int64_t iov_len;
2595 u_int64_t iov_base;
2596
2597 /*
2598 * we know we have a resid, so this is safe
2599 * skip over any empty vectors
2600 */
2601 iov_len = uio_iov_len(uio);
2602
2603 while (iov_len == 0) {
2604 uio_next_iov(uio);
2605 uio->uio_iovcnt--;
2606 iov_len = uio_iov_len(uio);
2607 }
2608 iov_base = uio_iov_base(uio);
2609 upl_size = PAGE_SIZE;
2610 upl_flags = UPL_QUERY_OBJECT_TYPE;
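/*
 * probe a single page of the target buffer... the returned upl_flags
 * tell us what kind of memory object backs it, which determines below
 * whether we can take the physically contiguous path or must fall
 * back to the cached / nocopy paths
 */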
2611
2612 // LP64todo - fix this!
2613 if ((vm_map_get_upl(current_map(),
2614 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
2615 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2616 /*
2617 * the user app must have passed in an invalid address
2618 */
2619 return (EFAULT);
2620 }
2621
2622 /*
2623 * We check every vector target but if it is physically
2624 * contiguous space, we skip the sanity checks.
2625 */
2626 if (upl_flags & UPL_PHYS_CONTIG) {
2627 retval = cluster_phys_read(vp, uio, filesize);
2628 }
2629 else if (uio_resid(uio) < PAGE_SIZE) {
2630 /*
2631 * we're here because we don't have a physically contiguous target buffer
2632 * go do a read through the cache if
2633 * the total xfer size is less than a page...
2634 */
2635 return (cluster_read_x(vp, uio, filesize, flags));
2636 }
2637 // LP64todo - fix this!
2638 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2639 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2640 /*
2641 * Bring the file offset read up to a pagesize boundary
2642 * this will also bring the base address to a page boundary
2643 * since they both are currently on the same offset within a page
2644 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2645 * so the computed clip_size must always be less than the current uio_resid
2646 */
2647 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2648
2649 /*
2650 * Fake the resid going into the cluster_read_x call
2651 * and restore it on the way out.
2652 */
2653 prev_resid = uio_resid(uio);
2654 // LP64todo - fix this
2655 uio_setresid(uio, clip_size);
2656
2657 retval = cluster_read_x(vp, uio, filesize, flags);
2658
2659 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2660 } else {
2661 /*
2662 * can't get both the file offset and the buffer offset aligned to a page boundary
2663 * so fire an I/O through the cache for this entire vector
2664 */
2665 // LP64todo - fix this!
2666 clip_size = iov_len;
2667 prev_resid = uio_resid(uio);
2668 uio_setresid(uio, clip_size);
2669
2670 retval = cluster_read_x(vp, uio, filesize, flags);
2671
2672 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2673 }
2674 } else {
2675 /*
2676 * If we come in here, we know the offset into
2677 * the file is on a pagesize boundary
2678 */
2679 max_io_size = filesize - uio->uio_offset;
2680 // LP64todo - fix this
2681 clip_size = uio_resid(uio);
2682 if (iov_len < clip_size)
2683 clip_size = iov_len;
2684 if (max_io_size < clip_size)
2685 clip_size = (int)max_io_size;
2686
2687 if (clip_size < PAGE_SIZE) {
2688 /*
2689 * Take care of the tail end of the read in this vector.
2690 */
2691 // LP64todo - fix this
2692 prev_resid = uio_resid(uio);
2693 uio_setresid(uio, clip_size);
2694
2695 retval = cluster_read_x(vp, uio, filesize, flags);
2696
2697 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2698 } else {
2699 /* round clip_size down to a multiple of pagesize */
2700 clip_size = clip_size & ~(PAGE_MASK);
2701 // LP64todo - fix this
2702 prev_resid = uio_resid(uio);
2703 uio_setresid(uio, clip_size);
2704
2705 retval = cluster_nocopy_read(vp, uio, filesize);
2706
2707 if ((retval==0) && uio_resid(uio))
2708 retval = cluster_read_x(vp, uio, filesize, flags);
2709
2710 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2711 }
2712 } /* end else */
2713 } /* end while */
2714
2715 return(retval);
2716 }
2717
2718 static int
2719 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2720 {
2721 upl_page_info_t *pl;
2722 upl_t upl;
2723 vm_offset_t upl_offset;
2724 int upl_size;
2725 off_t upl_f_offset;
2726 int start_offset;
2727 int start_pg;
2728 int last_pg;
2729 int uio_last = 0;
2730 int pages_in_upl;
2731 off_t max_size;
2732 off_t last_ioread_offset;
2733 off_t last_request_offset;
2734 u_int size_of_prefetch;
2735 u_int io_size;
2736 kern_return_t kret;
2737 int error = 0;
2738 int retval = 0;
2739 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2740 u_int rd_ahead_enabled = 1;
2741 u_int prefetch_enabled = 1;
2742 struct cl_readahead * rap;
2743 struct clios iostate;
2744 struct cl_extent extent;
2745
2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2747 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2748
2749 // LP64todo - fix this
2750 last_request_offset = uio->uio_offset + uio_resid(uio);
2751
2752 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2753 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2754 rd_ahead_enabled = 0;
2755 rap = NULL;
2756 } else {
2757 if (cluster_hard_throttle_on(vp)) {
2758 rd_ahead_enabled = 0;
2759 prefetch_enabled = 0;
2760
2761 max_rd_size = HARD_THROTTLE_MAXSIZE;
2762 }
2763 if ((rap = cluster_get_rap(vp)) == NULL)
2764 rd_ahead_enabled = 0;
2765 }
2766 if (last_request_offset > filesize)
2767 last_request_offset = filesize;
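/*
 * express the request as a range of file-relative page indices...
 * the read-ahead code below works in these units
 */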
2768 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2769 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2770
2771 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2772 /*
2773 * determine if we already have a read-ahead in the pipe courtesy of the
2774 * last read system call that was issued...
2775 * if so, pick up its extent to determine where we should start
2776 * with respect to any read-ahead that might be necessary to
2777 * garner all the data needed to complete this read system call
2778 */
2779 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2780
2781 if (last_ioread_offset < uio->uio_offset)
2782 last_ioread_offset = (off_t)0;
2783 else if (last_ioread_offset > last_request_offset)
2784 last_ioread_offset = last_request_offset;
2785 } else
2786 last_ioread_offset = (off_t)0;
2787
2788 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2789 /*
2790 * compute the size of the upl needed to encompass
2791 * the requested read... limit each call to cluster_io
2792 * to the maximum UPL size... cluster_io will clip if
2793 * this exceeds the maximum io_size for the device,
2794 * make sure to account for
2795 * a starting offset that's not page aligned
2796 */
2797 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2798 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2799 max_size = filesize - uio->uio_offset;
2800
2801 // LP64todo - fix this!
2802 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2803 io_size = uio_resid(uio);
2804 else
2805 io_size = max_size;
2806
2807 if (!(flags & IO_NOCACHE)) {
2808
2809 while (io_size) {
2810 u_int io_resid;
2811 u_int io_requested;
2812
2813 /*
2814 * if we keep finding the pages we need already in the cache, then
2815 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2816 * to determine that we have all the pages we need... once we miss in
2817 * the cache and have issued an I/O, then we'll assume that we're likely
2818 * to continue to miss in the cache and it's to our advantage to try and prefetch
2819 */
2820 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2821 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2822 /*
2823 * we've already issued I/O for this request and
2824 * there's still work to do and
2825 * our prefetch stream is running dry, so issue a
2826 * pre-fetch I/O... the I/O latency will overlap
2827 * with the copying of the data
2828 */
2829 if (size_of_prefetch > max_rd_size)
2830 size_of_prefetch = max_rd_size;
2831
2832 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2833
2834 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2835
2836 if (last_ioread_offset > last_request_offset)
2837 last_ioread_offset = last_request_offset;
2838 }
2839 }
2840 /*
2841 * limit the size of the copy we're about to do so that
2842 * we can notice that our I/O pipe is running dry and
2843 * get the next I/O issued before it does go dry
2844 */
2845 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2846 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2847 else
2848 io_resid = io_size;
2849
2850 io_requested = io_resid;
2851
2852 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2853
2854 io_size -= (io_requested - io_resid);
2855
2856 if (retval || io_resid)
2857 /*
2858 * if we run into a real error or
2859 * a page that is not in the cache
2860 * we need to leave streaming mode
2861 */
2862 break;
2863
2864 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2865 /*
2866 * we've already finished the I/O for this read request
2867 * let's see if we should do a read-ahead
2868 */
2869 cluster_rd_ahead(vp, &extent, filesize, rap);
2870 }
2871 }
2872 if (retval)
2873 break;
2874 if (io_size == 0) {
2875 if (rap != NULL) {
2876 if (extent.e_addr < rap->cl_lastr)
2877 rap->cl_maxra = 0;
2878 rap->cl_lastr = extent.e_addr;
2879 }
2880 break;
2881 }
2882 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2883 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2884 max_size = filesize - uio->uio_offset;
2885 }
2886 if (io_size > max_rd_size)
2887 io_size = max_rd_size;
2888
2889 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2890
2891 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2892 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2893 pages_in_upl = upl_size / PAGE_SIZE;
2894
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2896 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2897
2898 kret = ubc_create_upl(vp,
2899 upl_f_offset,
2900 upl_size,
2901 &upl,
2902 &pl,
2903 UPL_SET_LITE);
2904 if (kret != KERN_SUCCESS)
2905 panic("cluster_read: failed to get pagelist");
2906
2907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2908 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2909
2910 /*
2911 * scan from the beginning of the upl looking for the first
2912 * non-valid page.... this will become the first page in
2913 * the request we're going to make to 'cluster_io'... if all
2914 * of the pages are valid, we won't call through to 'cluster_io'
2915 */
2916 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2917 if (!upl_valid_page(pl, start_pg))
2918 break;
2919 }
2920
2921 /*
2922 * scan from the starting invalid page looking for a valid
2923 * page before the end of the upl is reached, if we
2924 * find one, then it will be the last page of the request to
2925 * 'cluster_io'
2926 */
2927 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2928 if (upl_valid_page(pl, last_pg))
2929 break;
2930 }
2931 iostate.io_completed = 0;
2932 iostate.io_issued = 0;
2933 iostate.io_error = 0;
2934 iostate.io_wanted = 0;
2935
2936 if (start_pg < last_pg) {
2937 /*
2938 * we found a range of 'invalid' pages that must be filled
2939 * if the last page in this range is the last page of the file
2940 * we may have to clip the size of it to keep from reading past
2941 * the end of the last physical block associated with the file
2942 */
2943 upl_offset = start_pg * PAGE_SIZE;
2944 io_size = (last_pg - start_pg) * PAGE_SIZE;
2945
2946 if ((upl_f_offset + upl_offset + io_size) > filesize)
2947 io_size = filesize - (upl_f_offset + upl_offset);
2948
2949 /*
2950 * issue an asynchronous read to cluster_io
2951 */
2952
2953 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2954 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2955 }
2956 if (error == 0) {
2957 /*
2958 * if the read completed successfully, or there was no I/O request
2959 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
2960 * we'll first add on any 'valid'
2961 * pages that were present in the upl when we acquired it.
2962 */
2963 u_int val_size;
2964
2965 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2966 if (!upl_valid_page(pl, uio_last))
2967 break;
2968 }
2969 /*
2970 * compute size to transfer this round, if uio->uio_resid is
2971 * still non-zero after this attempt, we'll loop around and
2972 * set up for another I/O.
2973 */
2974 val_size = (uio_last * PAGE_SIZE) - start_offset;
2975
2976 if (val_size > max_size)
2977 val_size = max_size;
2978
2979 if (val_size > uio_resid(uio))
2980 // LP64todo - fix this
2981 val_size = uio_resid(uio);
2982
2983 if (last_ioread_offset == 0)
2984 last_ioread_offset = uio->uio_offset + val_size;
2985
2986 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2987 /*
2988 * if there's still I/O left to do for this request, and...
2989 * we're not in hard throttle mode, then issue a
2990 * pre-fetch I/O... the I/O latency will overlap
2991 * with the copying of the data
2992 */
2993 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2994
2995 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2996
2997 if (last_ioread_offset > last_request_offset)
2998 last_ioread_offset = last_request_offset;
2999
3000 } else if ((uio->uio_offset + val_size) == last_request_offset) {
3001 /*
3002 * this transfer will finish this request, so...
3003 * let's try to read ahead if we're in
3004 * a sequential access pattern and we haven't
3005 * explicitly disabled it
3006 */
3007 if (rd_ahead_enabled)
3008 cluster_rd_ahead(vp, &extent, filesize, rap);
3009
3010 if (rap != NULL) {
3011 if (extent.e_addr < rap->cl_lastr)
3012 rap->cl_maxra = 0;
3013 rap->cl_lastr = extent.e_addr;
3014 }
3015 }
3016 lck_mtx_lock(cl_mtxp);
3017
3018 while (iostate.io_issued != iostate.io_completed) {
3019 iostate.io_wanted = 1;
3020 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3021 }
3022 lck_mtx_unlock(cl_mtxp);
3023
3024 if (iostate.io_error)
3025 error = iostate.io_error;
3026 else
3027 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3028 }
3029 if (start_pg < last_pg) {
3030 /*
3031 * compute the range of pages that we actually issued an I/O for
3032 * and either commit them as valid if the I/O succeeded
3033 * or abort them if the I/O failed
3034 */
3035 io_size = (last_pg - start_pg) * PAGE_SIZE;
3036
3037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3038 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3039
3040 if (error || (flags & IO_NOCACHE))
3041 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3042 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3043 else
3044 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3045 UPL_COMMIT_CLEAR_DIRTY |
3046 UPL_COMMIT_FREE_ON_EMPTY |
3047 UPL_COMMIT_INACTIVATE);
3048
3049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3050 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3051 }
3052 if ((last_pg - start_pg) < pages_in_upl) {
3053 int cur_pg;
3054 int commit_flags;
3055
3056 /*
3057 * the set of pages that we issued an I/O for did not encompass
3058 * the entire upl... so just release these without modifying
3059 * their state
3060 */
3061 if (error)
3062 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3063 else {
3064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3065 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3066
3067 if (start_pg) {
3068 /*
3069 * we found some already valid pages at the beginning of
3070 * the upl... commit these back to the inactive list with
3071 * reference cleared
3072 */
3073 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3074 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3075 | UPL_COMMIT_INACTIVATE;
3076
3077 if (upl_dirty_page(pl, cur_pg))
3078 commit_flags |= UPL_COMMIT_SET_DIRTY;
3079
3080 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3081 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3082 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3083 else
3084 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3085 PAGE_SIZE, commit_flags);
3086 }
3087 }
3088 if (last_pg < uio_last) {
3089 /*
3090 * we found some already valid pages immediately after the
3091 * pages we issued I/O for, commit these back to the
3092 * inactive list with reference cleared
3093 */
3094 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3095 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3096 | UPL_COMMIT_INACTIVATE;
3097
3098 if (upl_dirty_page(pl, cur_pg))
3099 commit_flags |= UPL_COMMIT_SET_DIRTY;
3100
3101 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3102 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3103 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3104 else
3105 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3106 PAGE_SIZE, commit_flags);
3107 }
3108 }
3109 if (uio_last < pages_in_upl) {
3110 /*
3111 * there were some invalid pages beyond the valid pages
3112 * that we didn't issue an I/O for, just release them
3113 * unchanged
3114 */
3115 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3116 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3117 }
3118
3119 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3120 (int)upl, -1, -1, 0, 0);
3121 }
3122 }
3123 if (retval == 0)
3124 retval = error;
3125
3126 if ( uio_resid(uio) ) {
3127 if (cluster_hard_throttle_on(vp)) {
3128 rd_ahead_enabled = 0;
3129 prefetch_enabled = 0;
3130
3131 max_rd_size = HARD_THROTTLE_MAXSIZE;
3132 } else {
3133 if (rap != NULL)
3134 rd_ahead_enabled = 1;
3135 prefetch_enabled = 1;
3136
3137 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3138 }
3139 }
3140 }
3141 if (rap != NULL) {
3142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3143 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3144
3145 lck_mtx_unlock(&rap->cl_lockr);
3146 } else {
3147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3148 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3149 }
3150
3151 return (retval);
3152 }
3153
3154
3155 static int
3156 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3157 {
3158 upl_t upl;
3159 upl_page_info_t *pl;
3160 vm_offset_t upl_offset;
3161 off_t max_io_size;
3162 int io_size;
3163 int upl_size;
3164 int upl_needed_size;
3165 int pages_in_pl;
3166 int upl_flags;
3167 kern_return_t kret;
3168 struct iovec *iov;
3169 int i;
3170 int force_data_sync;
3171 int retval = 0;
3172 int no_zero_fill = 0;
3173 int abort_flag = 0;
3174 struct clios iostate;
3175 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3176 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3177
3178
3179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3180 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3181
3182 /*
3183 * When we enter this routine, we know
3184 * -- the offset into the file is on a pagesize boundary
3185 * -- the resid is a page multiple
3186 * -- the resid will not exceed iov_len
3187 */
3188
3189 iostate.io_completed = 0;
3190 iostate.io_issued = 0;
3191 iostate.io_error = 0;
3192 iostate.io_wanted = 0;
3193
3194 iov = uio->uio_iov;
3195
3196 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3197
3198 if (cluster_hard_throttle_on(vp)) {
3199 max_rd_size = HARD_THROTTLE_MAXSIZE;
3200 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3201 } else {
3202 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3203 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3204 }
3205 max_io_size = filesize - uio->uio_offset;
3206
3207 // LP64todo - fix this
3208 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3209 io_size = max_io_size;
3210 else
3211 io_size = uio_resid(uio);
3212
3213 /*
3214 * First look for pages already in the cache
3215 * and move them to user space.
3216 */
3217 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3218
3219 if (retval) {
3220 /*
3221 * we may have already spun some portion of this request
3222 * off as async requests... we need to wait for the I/O
3223 * to complete before returning
3224 */
3225 goto wait_for_reads;
3226 }
3227 /*
3228 * If we are already finished with this read, then return
3229 */
3230 if (io_size == 0) {
3231 /*
3232 * we may have already spun some portion of this request
3233 * off as async requests... we need to wait for the I/O
3234 * to complete before returning
3235 */
3236 goto wait_for_reads;
3237 }
3238 max_io_size = io_size;
3239
3240 if (max_io_size > max_rd_size)
3241 max_io_size = max_rd_size;
3242
3243 io_size = 0;
3244
3245 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
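/*
 * UPL_ROP_ABSENT returns (in io_size) the length of the run of pages
 * starting at uio_offset that are NOT currently present in the
 * cache... only that run is issued as uncached I/O below; anything
 * already resident is better served through the cached path
 */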
3246
3247 if (io_size == 0)
3248 /*
3249 * we may have already spun some portion of this request
3250 * off as async requests... we need to wait for the I/O
3251 * to complete before returning
3252 */
3253 goto wait_for_reads;
3254
3255 // LP64todo - fix this!
3256 upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK;
3257 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3258
3259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3260 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
3261
3262 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3263 no_zero_fill = 1;
3264 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3265 } else {
3266 no_zero_fill = 0;
3267 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3268 }
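/*
 * try up to 3 times to build a UPL whose pages are all valid...
 * each retry adds UPL_FORCE_DATA_SYNC, which (presumably) pushes out
 * pending modifications so the pages come back valid before we wire
 * them for the device read
 */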
3269 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3270 pages_in_pl = 0;
3271 upl_size = upl_needed_size;
3272 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3273
3274 if (no_zero_fill)
3275 upl_flags |= UPL_NOZEROFILL;
3276 if (force_data_sync)
3277 upl_flags |= UPL_FORCE_DATA_SYNC;
3278
3279 // LP64todo - fix this!
3280 kret = vm_map_create_upl(current_map(),
3281 (vm_map_offset_t)(CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK),
3282 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3283
3284 if (kret != KERN_SUCCESS) {
3285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3286 (int)upl_offset, upl_size, io_size, kret, 0);
3287 /*
3288 * cluster_nocopy_read: failed to get pagelist
3289 *
3290 * we may have already spun some portion of this request
3291 * off as async requests... we need to wait for the I/O
3292 * to complete before returning
3293 */
3294 goto wait_for_reads;
3295 }
3296 pages_in_pl = upl_size / PAGE_SIZE;
3297 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3298
3299 for (i = 0; i < pages_in_pl; i++) {
3300 if (!upl_valid_page(pl, i))
3301 break;
3302 }
3303 if (i == pages_in_pl)
3304 break;
3305
3306 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3307 }
3308 if (force_data_sync >= 3) {
3309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3310 (int)upl_offset, upl_size, io_size, kret, 0);
3311
3312 goto wait_for_reads;
3313 }
3314 /*
3315 * Consider the possibility that upl_size wasn't satisfied.
3316 */
3317 if (upl_size != upl_needed_size)
3318 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3319
3320 if (io_size == 0) {
3321 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3322 goto wait_for_reads;
3323 }
3324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3325 (int)upl_offset, upl_size, io_size, kret, 0);
3326
3327 /*
3328 * request asynchronously so that we can overlap
3329 * the preparation of the next I/O...
3330 * if there are already too many outstanding reads,
3331 * wait until some have completed before issuing the next read
3332 */
3333 lck_mtx_lock(cl_mtxp);
3334
3335 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3336 iostate.io_wanted = 1;
3337 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3338 }
3339 lck_mtx_unlock(cl_mtxp);
3340
3341 if (iostate.io_error) {
3342 /*
3343 * one of the earlier reads we issued ran into a hard error
3344 * don't issue any more reads, cleanup the UPL
3345 * that was just created but not used, then
3346 * go wait for any other reads to complete before
3347 * returning the error to the caller
3348 */
3349 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3350
3351 goto wait_for_reads;
3352 }
3353 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3354 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3355
3356 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3357 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3358 (buf_t)NULL, &iostate);
3359
3360 /*
3361 * update the uio structure
3362 */
3363 ((u_int32_t)iov->iov_base) += io_size;
3364 iov->iov_len -= io_size;
3365 uio_setresid(uio, (uio_resid(uio) - io_size));
3366 uio->uio_offset += io_size;
3367
3368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3369 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3370
3371 } /* end while */
3372
3373 wait_for_reads:
3374 /*
3375 * make sure all async reads that are part of this stream
3376 * have completed before we return
3377 */
3378 lck_mtx_lock(cl_mtxp);
3379
3380 while (iostate.io_issued != iostate.io_completed) {
3381 iostate.io_wanted = 1;
3382 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3383 }
3384 lck_mtx_unlock(cl_mtxp);
3385
3386 if (iostate.io_error)
3387 retval = iostate.io_error;
3388
3389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3390 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3391
3392 return (retval);
3393 }
3394
3395
3396 static int
3397 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3398 {
3399 upl_page_info_t *pl;
3400 upl_t upl;
3401 vm_offset_t upl_offset;
3402 addr64_t dst_paddr;
3403 off_t max_size;
3404 #if LP64KERN
3405 int64_t io_size;
3406 u_int64_t iov_len;
3407 u_int64_t iov_base;
3408 #else
3409 int io_size;
3410 uint iov_len;
3411 uint iov_base;
3412 #endif
3413 int tail_size;
3414 int upl_size;
3415 int upl_needed_size;
3416 int pages_in_pl;
3417 int upl_flags;
3418 kern_return_t kret;
3419 struct clios iostate;
3420 int error;
3421 int devblocksize;
3422
3423 devblocksize = vp->v_mount->mnt_devblocksize;
3424 /*
3425 * When we enter this routine, we know
3426 * -- the resid will not exceed iov_len
3427 * -- the target address is physically contiguous
3428 */
3429
3430 #if LP64_DEBUG
3431 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3432 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3433 }
3434 #endif /* LP64_DEBUG */
3435
3436 iov_len = uio_iov_len(uio);
3437 iov_base = uio_iov_base(uio);
3438
3439 max_size = filesize - uio->uio_offset;
3440
3441 // LP64todo - fix this!
3442 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3443 io_size = iov_len;
3444 else
3445 io_size = max_size;
3446
3447 // LP64todo - fix this!
3448 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3449 upl_needed_size = upl_offset + io_size;
3450
3451 error = 0;
3452 pages_in_pl = 0;
3453 upl_size = upl_needed_size;
3454 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3455
3456 kret = vm_map_get_upl(current_map(),
3457 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
3458 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3459
3460 if (kret != KERN_SUCCESS) {
3461 /*
3462 * cluster_phys_read: failed to get pagelist
3463 */
3464 return(EINVAL);
3465 }
3466 if (upl_size < upl_needed_size) {
3467 /*
3468 * The upl_size wasn't satisfied.
3469 */
3470 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3471
3472 return(EINVAL);
3473 }
3474 pl = ubc_upl_pageinfo(upl);
3475
3476 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK));
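/*
 * upl_phys_page() returns a physical page number, so shifting it
 * left by 12 (4K pages) yields the page's physical byte address;
 * the low 12 bits of iov_base supply the offset within that page
 */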
3477
3478 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3479 int head_size;
3480
3481 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3482
3483 if (head_size > io_size)
3484 head_size = io_size;
3485
3486 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3487
3488 if (error) {
3489 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3490
3491 return(EINVAL);
3492 }
3493 upl_offset += head_size;
3494 dst_paddr += head_size;
3495 io_size -= head_size;
3496 }
3497 tail_size = io_size & (devblocksize - 1);
3498 io_size -= tail_size;
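/*
 * whatever remains that isn't a whole device block (tail_size) is
 * deferred to cluster_align_phys_io once the block-aligned portion
 * below has been issued and has completed
 */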
3499
3500 iostate.io_completed = 0;
3501 iostate.io_issued = 0;
3502 iostate.io_error = 0;
3503 iostate.io_wanted = 0;
3504
3505 while (io_size && error == 0) {
3506 int xsize;
3507
3508 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3509 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3510 else
3511 xsize = io_size;
3512 /*
3513 * request asynchronously so that we can overlap
3514 * the preparation of the next I/O... we'll do
3515 * the commit after all the I/O has completed
3516 * since it's all issued against the same UPL...
3517 * if there are already too many outstanding reads,
3518 * wait until some have completed before issuing the next
3519 */
3520 lck_mtx_lock(cl_mtxp);
3521
3522 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3523 iostate.io_wanted = 1;
3524 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3525 }
3526 lck_mtx_unlock(cl_mtxp);
3527
3528 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3529 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3530 (buf_t)NULL, &iostate);
3531 /*
3532 * The cluster_io read was issued successfully,
3533 * update the uio structure
3534 */
3535 if (error == 0) {
3536 uio_setresid(uio, (uio_resid(uio) - xsize));
3537 uio_iov_base_add(uio, xsize);
3538 uio_iov_len_add(uio, -xsize);
3539 uio->uio_offset += xsize;
3540 dst_paddr += xsize;
3541 upl_offset += xsize;
3542 io_size -= xsize;
3543 }
3544 }
3545 /*
3546 * make sure all async reads that are part of this stream
3547 * have completed before we proceed
3548 */
3549 lck_mtx_lock(cl_mtxp);
3550
3551 while (iostate.io_issued != iostate.io_completed) {
3552 iostate.io_wanted = 1;
3553 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3554 }
3555 lck_mtx_unlock(cl_mtxp);
3556
3557 if (iostate.io_error)
3558 error = iostate.io_error;
3559
3560 if (error == 0 && tail_size)
3561 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3562
3563 /*
3564 * just release our hold on the physically contiguous
3565 * region without changing any state
3566 */
3567 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3568
3569 return (error);
3570 }
3571
3572
3573 /*
3574 * generate advisory I/O's in the largest chunks possible
3575 * the completed pages will be released into the VM cache
3576 */
3577 int
3578 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3579 {
3580 upl_page_info_t *pl;
3581 upl_t upl;
3582 vm_offset_t upl_offset;
3583 int upl_size;
3584 off_t upl_f_offset;
3585 int start_offset;
3586 int start_pg;
3587 int last_pg;
3588 int pages_in_upl;
3589 off_t max_size;
3590 int io_size;
3591 kern_return_t kret;
3592 int retval = 0;
3593 int issued_io;
3594 int skip_range;
3595
3596 if ( !UBCINFOEXISTS(vp))
3597 return(EINVAL);
3598
3599 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3600 (int)f_offset, resid, (int)filesize, 0, 0);
3601
3602 while (resid && f_offset < filesize && retval == 0) {
3603 /*
3604 * compute the size of the upl needed to encompass
3605 * the requested read... limit each call to cluster_io
3606 * to the maximum UPL size... cluster_io will clip if
3607 * this exceeds the maximum io_size for the device...
3608 * make sure to account for
3609 * a starting offset that's not page aligned
3610 */
3611 start_offset = (int)(f_offset & PAGE_MASK_64);
3612 upl_f_offset = f_offset - (off_t)start_offset;
3613 max_size = filesize - f_offset;
3614
3615 if (resid < max_size)
3616 io_size = resid;
3617 else
3618 io_size = max_size;
3619
3620 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3621 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3622 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3623
3624 skip_range = 0;
3625 /*
3626 * return the number of contiguously present pages in the cache
3627 * starting at upl_f_offset within the file
3628 */
3629 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3630
3631 if (skip_range) {
3632 /*
3633 * skip over pages already present in the cache
3634 */
3635 io_size = skip_range - start_offset;
3636
3637 f_offset += io_size;
3638 resid -= io_size;
3639
3640 if (skip_range == upl_size)
3641 continue;
3642 /*
3643 * have to issue some real I/O
3644 * at this point, we know it's starting on a page boundary
3645 * because we've skipped over at least the first page in the request
3646 */
3647 start_offset = 0;
3648 upl_f_offset += skip_range;
3649 upl_size -= skip_range;
3650 }
3651 pages_in_upl = upl_size / PAGE_SIZE;
3652
3653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3654 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3655
3656 kret = ubc_create_upl(vp,
3657 upl_f_offset,
3658 upl_size,
3659 &upl,
3660 &pl,
3661 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3662 if (kret != KERN_SUCCESS)
3663 return(retval);
3664 issued_io = 0;
3665
3666 /*
3667 * before we start marching forward, we must make sure we end on
3668 * a present page, otherwise we will be working with a freed
3669 * upl
3670 */
3671 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3672 if (upl_page_present(pl, last_pg))
3673 break;
3674 }
3675 pages_in_upl = last_pg + 1;
3676
3677
3678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3679 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3680
3681
3682 for (last_pg = 0; last_pg < pages_in_upl; ) {
3683 /*
3684 * scan from the beginning of the upl looking for the first
3685 * page that is present.... this will become the first page in
3686 * the request we're going to make to 'cluster_io'... if all
3687 * of the pages are absent, we won't call through to 'cluster_io'
3688 */
3689 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3690 if (upl_page_present(pl, start_pg))
3691 break;
3692 }
3693
3694 /*
3695 * scan from the starting present page looking for an absent
3696 * page before the end of the upl is reached, if we
3697 * find one, then it will terminate the range of pages being
3698 * presented to 'cluster_io'
3699 */
3700 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3701 if (!upl_page_present(pl, last_pg))
3702 break;
3703 }
3704
3705 if (last_pg > start_pg) {
3706 /*
3707 * we found a range of pages that must be filled
3708 * if the last page in this range is the last page of the file
3709 * we may have to clip the size of it to keep from reading past
3710 * the end of the last physical block associated with the file
3711 */
3712 upl_offset = start_pg * PAGE_SIZE;
3713 io_size = (last_pg - start_pg) * PAGE_SIZE;
3714
3715 if ((upl_f_offset + upl_offset + io_size) > filesize)
3716 io_size = filesize - (upl_f_offset + upl_offset);
3717
3718 /*
3719 * issue an asynchronous read to cluster_io
3720 */
3721 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3722 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3723
3724 issued_io = 1;
3725 }
3726 }
3727 if (issued_io == 0)
3728 ubc_upl_abort(upl, 0);
3729
3730 io_size = upl_size - start_offset;
3731
3732 if (io_size > resid)
3733 io_size = resid;
3734 f_offset += io_size;
3735 resid -= io_size;
3736 }
3737
3738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3739 (int)f_offset, resid, retval, 0, 0);
3740
3741 return(retval);
3742 }
3743
3744
3745 int
3746 cluster_push(vnode_t vp, int flags)
3747 {
3748 int retval;
3749 struct cl_writebehind *wbp;
3750
3751 if ( !UBCINFOEXISTS(vp)) {
3752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3753 return (0);
3754 }
3755 /* return if deferred write is set */
3756 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3757 return (0);
3758 }
3759 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3761 return (0);
3762 }
3763 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3764 lck_mtx_unlock(&wbp->cl_lockw);
3765
3766 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3767 return(0);
3768 }
3769 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3770 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3771
3772 if (wbp->cl_scmap) {
3773 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3774
3775 retval = 1;
3776 } else
3777 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3778
3779 lck_mtx_unlock(&wbp->cl_lockw);
3780
3781 if (flags & IO_SYNC)
3782 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3783
3784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3785 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3786
3787 return (retval);
3788 }
3789
3790
3791 __private_extern__ void
3792 cluster_release(struct ubc_info *ubc)
3793 {
3794 struct cl_writebehind *wbp;
3795 struct cl_readahead *rap;
3796
3797 if ((wbp = ubc->cl_wbehind)) {
3798
3799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3800
3801 if (wbp->cl_scmap)
3802 vfs_drt_control(&(wbp->cl_scmap), 0);
3803 } else {
3804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3805 }
3806
3807 rap = ubc->cl_rahead;
3808
3809 if (wbp != NULL) {
3810 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3811 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3812 }
3813 if ((rap = ubc->cl_rahead)) {
3814 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3815 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3816 }
3817 ubc->cl_rahead = NULL;
3818 ubc->cl_wbehind = NULL;
3819
3820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3821 }
3822
3823
3824 static void
3825 cluster_push_EOF(vnode_t vp, off_t EOF)
3826 {
3827 struct cl_writebehind *wbp;
3828
3829 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3830
3831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3832 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3833
3834 if (wbp->cl_scmap)
3835 sparse_cluster_push(wbp, vp, EOF, 1);
3836 else
3837 cluster_try_push(wbp, vp, EOF, 0, 1);
3838
3839 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3840 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3841
3842 lck_mtx_unlock(&wbp->cl_lockw);
3843 }
3844
3845
3846 static int
3847 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3848 {
3849 int cl_index;
3850 int cl_index1;
3851 int min_index;
3852 int cl_len;
3853 int cl_pushed = 0;
3854 struct cl_wextent l_clusters[MAX_CLUSTERS];
3855
3856 /*
3857 * the write behind context exists and has
3858 * already been locked...
3859 *
3860 * make a local 'sorted' copy of the clusters
3861 * and clear wbp->cl_number so that new clusters can
3862 * be developed
3863 */
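/*
 * note: the nested loops below amount to a simple selection sort on
 * b_addr... each outer pass copies the lowest remaining cluster into
 * l_clusters[] and marks the source entry empty by setting its b_addr
 * equal to its e_addr (the same "empty" test is used again when the
 * unpushed leftovers are merged back in near the end of this routine)
 */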
3864 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3865 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3866 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3867 continue;
3868 if (min_index == -1)
3869 min_index = cl_index1;
3870 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3871 min_index = cl_index1;
3872 }
3873 if (min_index == -1)
3874 break;
3875 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3876 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3877 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3878
3879 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3880 }
3881 wbp->cl_number = 0;
3882
3883 cl_len = cl_index;
3884
3885 if (can_delay && cl_len == MAX_CLUSTERS) {
3886 int i;
3887
3888 /*
3889 * determine if we appear to be writing the file sequentially
3890 * if not, by returning without having pushed any clusters
3891 * we will cause this vnode to be pushed into the sparse cluster mechanism
3892 * used for managing more random I/O patterns
3893 *
3894 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3895 * that's why we're in try_push with can_delay true...
3896 *
3897 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3898 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
3899 * so we can just make a simple pass through, up to, but not including the last one...
3900 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3901 * are sequential
3902 *
3903 * we let the last one be partial as long as it was adjacent to the previous one...
3904 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3905 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3906 */
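/*
 * for illustration (assuming MAX_UPL_TRANSFER is 256 pages): a sorted run
 * of clusters [0,256) [256,512) [512,768) ... passes this check, since
 * every cluster but the last spans exactly MAX_UPL_TRANSFER pages and
 * each e_addr equals the b_addr of the cluster that follows... a gap or
 * a short interior cluster sends us to 'dont_try' instead
 */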
3907 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3908 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3909 goto dont_try;
3910 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3911 goto dont_try;
3912 }
3913 }
3914 /*
3915 * drop the lock while we're firing off the I/Os...
3916 * this is safe since I'm working off of a private sorted copy
3917 * of the clusters, and I'm going to re-evaluate the public
3918 * state after I retake the lock
3919 */
3920 lck_mtx_unlock(&wbp->cl_lockw);
3921
3922 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3923 int flags;
3924 struct cl_extent cl;
3925
3926 /*
3927 * try to push each cluster in turn...
3928 */
3929 if (l_clusters[cl_index].io_nocache)
3930 flags = IO_NOCACHE;
3931 else
3932 flags = 0;
3933 cl.b_addr = l_clusters[cl_index].b_addr;
3934 cl.e_addr = l_clusters[cl_index].e_addr;
3935
3936 cluster_push_x(vp, &cl, EOF, flags);
3937
3938 l_clusters[cl_index].b_addr = 0;
3939 l_clusters[cl_index].e_addr = 0;
3940
3941 cl_pushed++;
3942
3943 if (push_all == 0)
3944 break;
3945 }
3946 lck_mtx_lock(&wbp->cl_lockw);
3947
3948 dont_try:
3949 if (cl_len > cl_pushed) {
3950 /*
3951 * we didn't push all of the clusters, so
3952 * let's try to merge them back into the vnode
3953 */
3954 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3955 /*
3956 * we picked up some new clusters while we were trying to
3957 * push the old ones... this can happen because I've dropped
3958 * the write behind lock... the sum of the
3959 * leftovers plus the new cluster count exceeds our ability
3960 * to represent them, so switch to the sparse cluster mechanism
3961 *
3962 * collect the active public clusters...
3963 */
3964 sparse_cluster_switch(wbp, vp, EOF);
3965
3966 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3967 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3968 continue;
3969 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3970 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3971 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3972
3973 cl_index1++;
3974 }
3975 /*
3976 * update the cluster count
3977 */
3978 wbp->cl_number = cl_index1;
3979
3980 /*
3981 * and collect the original clusters that were moved into the
3982 * local storage for sorting purposes
3983 */
3984 sparse_cluster_switch(wbp, vp, EOF);
3985
3986 } else {
3987 /*
3988 * we've got room to merge the leftovers back in
3989 * just append them starting at the next 'hole'
3990 * represented by wbp->cl_number
3991 */
3992 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3993 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3994 continue;
3995
3996 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3997 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3998 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3999
4000 cl_index1++;
4001 }
4002 /*
4003 * update the cluster count
4004 */
4005 wbp->cl_number = cl_index1;
4006 }
4007 }
4008 return(MAX_CLUSTERS - wbp->cl_number);
4009 }
4010
4011
4012
4013 static int
4014 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
4015 {
4016 upl_page_info_t *pl;
4017 upl_t upl;
4018 vm_offset_t upl_offset;
4019 int upl_size;
4020 off_t upl_f_offset;
4021 int pages_in_upl;
4022 int start_pg;
4023 int last_pg;
4024 int io_size;
4025 int io_flags;
4026 int upl_flags;
4027 int size;
4028 int error = 0;
4029 int retval;
4030 kern_return_t kret;
4031
4032
4033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4034 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4035
4036 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4038
4039 return (0);
4040 }
4041 upl_size = pages_in_upl * PAGE_SIZE;
4042 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4043
4044 if (upl_f_offset + upl_size >= EOF) {
4045
4046 if (upl_f_offset >= EOF) {
4047 /*
4048 * must have truncated the file and missed
4049 * clearing a dangling cluster (i.e. it's completely
4050 * beyond the new EOF)
4051 */
4052 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4053
4054 return(0);
4055 }
4056 size = EOF - upl_f_offset;
4057
4058 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4059 pages_in_upl = upl_size / PAGE_SIZE;
4060 } else
4061 size = upl_size;
4062
4063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4064
4065 /*
4066 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4067 *
4068 * - only pages that are currently dirty are returned... these are the ones we need to clean
4069 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4070 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4071 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4072 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4073 *
4074 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4075 */
4076
4077 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4078 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4079 else
4080 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4081
4082 kret = ubc_create_upl(vp,
4083 upl_f_offset,
4084 upl_size,
4085 &upl,
4086 &pl,
4087 upl_flags);
4088 if (kret != KERN_SUCCESS)
4089 panic("cluster_push: failed to get pagelist");
4090
4091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4092
4093 /*
4094 * since we only asked for the dirty pages back
4095 * it's possible that we may only get a few or even none, so...
4096 * before we start marching forward, we must make sure we know
4097 * where the last present page is in the UPL, otherwise we could
4098 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4099 * employed by commit_range and abort_range.
4100 */
4101 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4102 if (upl_page_present(pl, last_pg))
4103 break;
4104 }
4105 pages_in_upl = last_pg + 1;
4106
4107 if (pages_in_upl == 0) {
4108 ubc_upl_abort(upl, 0);
4109
4110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4111 return(0);
4112 }
4113
4114 for (last_pg = 0; last_pg < pages_in_upl; ) {
4115 /*
4116 * find the next dirty page in the UPL
4117 * this will become the first page in the
4118 * next I/O to generate
4119 */
4120 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4121 if (upl_dirty_page(pl, start_pg))
4122 break;
4123 if (upl_page_present(pl, start_pg))
4124 /*
4125 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4126 * just release these unchanged since we're not going
4127 * to steal them or change their state
4128 */
4129 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4130 }
4131 if (start_pg >= pages_in_upl)
4132 /*
4133 * done... no more dirty pages to push
4134 */
4135 break;
4136 if (start_pg > last_pg)
4137 /*
4138 * skipped over some non-dirty pages
4139 */
4140 size -= ((start_pg - last_pg) * PAGE_SIZE);
4141
4142 /*
4143 * find a range of dirty pages to write
4144 */
4145 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4146 if (!upl_dirty_page(pl, last_pg))
4147 break;
4148 }
4149 upl_offset = start_pg * PAGE_SIZE;
4150
4151 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4152
4153 io_flags = CL_THROTTLE | CL_COMMIT;
4154
4155 if ( !(flags & IO_SYNC))
4156 io_flags |= CL_ASYNC;
4157
4158 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4159 io_flags, (buf_t)NULL, (struct clios *)NULL);
4160
4161 if (error == 0 && retval)
4162 error = retval;
4163
4164 size -= io_size;
4165 }
4166 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4167
4168 return(error);
4169 }
4170
4171
4172 /*
4173 * sparse_cluster_switch is called with the write behind lock held
4174 */
4175 static void
4176 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4177 {
4178 int cl_index;
4179
4180 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4181
4182 if (wbp->cl_scmap == NULL)
4183 wbp->cl_scdirty = 0;
4184
4185 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4186 int flags;
4187 struct cl_extent cl;
4188
4189 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4190
4191 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4192 if (flags & UPL_POP_DIRTY) {
4193 cl.e_addr = cl.b_addr + 1;
4194
4195 sparse_cluster_add(wbp, vp, &cl, EOF);
4196 }
4197 }
4198 }
4199 }
4200 wbp->cl_number = 0;
4201
4202 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4203 }
4204
4205
4206 /*
4207 * sparse_cluster_push is called with the write behind lock held
4208 */
4209 static void
4210 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4211 {
4212 struct cl_extent cl;
4213 off_t offset;
4214 u_int length;
4215
4216 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4217
4218 if (push_all)
4219 vfs_drt_control(&(wbp->cl_scmap), 1);
4220
4221 for (;;) {
4222 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4223 break;
4224
4225 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4226 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4227
4228 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4229
4230 cluster_push_x(vp, &cl, EOF, 0);
4231
4232 if (push_all == 0)
4233 break;
4234 }
4235 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4236 }
4237
4238
4239 /*
4240 * sparse_cluster_add is called with the write behind lock held
4241 */
4242 static void
4243 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4244 {
4245 u_int new_dirty;
4246 u_int length;
4247 off_t offset;
4248
4249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4250
4251 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4252 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4253
4254 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4255 /*
4256 * no room left in the map
4257 * only a partial update was done
4258 * push out some pages and try again
4259 */
4260 wbp->cl_scdirty += new_dirty;
4261
4262 sparse_cluster_push(wbp, vp, EOF, 0);
4263
4264 offset += (new_dirty * PAGE_SIZE_64);
4265 length -= (new_dirty * PAGE_SIZE);
4266 }
4267 wbp->cl_scdirty += new_dirty;
4268
4269 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4270 }
4271
4272
4273 static int
4274 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4275 {
4276 struct iovec *iov;
4277 upl_page_info_t *pl;
4278 upl_t upl;
4279 addr64_t ubc_paddr;
4280 kern_return_t kret;
4281 int error = 0;
4282 int did_read = 0;
4283 int abort_flags;
4284 int upl_flags;
4285
4286 iov = uio->uio_iov;
4287
4288 upl_flags = UPL_SET_LITE;
4289 if (! (flags & CL_READ)) {
4290 /*
4291 * "write" operation: let the UPL subsystem know
4292 * that we intend to modify the buffer cache pages
4293 * we're gathering.
4294 */
4295 upl_flags |= UPL_WILL_MODIFY;
4296 }
4297
4298 kret = ubc_create_upl(vp,
4299 uio->uio_offset & ~PAGE_MASK_64,
4300 PAGE_SIZE,
4301 &upl,
4302 &pl,
4303 upl_flags);
4304
4305 if (kret != KERN_SUCCESS)
4306 return(EINVAL);
4307
4308 if (!upl_valid_page(pl, 0)) {
4309 /*
4310 * issue a synchronous read to cluster_io
4311 */
4312 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4313 CL_READ, (buf_t)NULL, (struct clios *)NULL);
4314 if (error) {
4315 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4316
4317 return(error);
4318 }
4319 did_read = 1;
4320 }
4321 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4322
4323 /*
4324 * NOTE: There is no prototype for the following in BSD. It, and the definitions
4325 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4326 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4327 * way to do so without exporting them to kexts as well.
4328 */
4329 if (flags & CL_READ)
4330 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4331 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
4332 else
4333 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4334 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
4335
4336 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4337 /*
4338 * issue a synchronous write to cluster_io
4339 */
4340 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4341 0, (buf_t)NULL, (struct clios *)NULL);
4342 }
4343 if (error == 0) {
4344 uio->uio_offset += xsize;
4345 uio_iov_base_add(uio, xsize);
4346 uio_iov_len_add(uio, -xsize);
4347 uio_setresid(uio, (uio_resid(uio) - xsize));
4348 }
4349 if (did_read)
4350 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4351 else
4352 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4353
4354 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4355
4356 return (error);
4357 }
4358
4359
4360
4361 int
4362 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4363 {
4364 int pg_offset;
4365 int pg_index;
4366 int csize;
4367 int segflg;
4368 int retval = 0;
4369 upl_page_info_t *pl;
4370
4371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4372 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4373
4374 segflg = uio->uio_segflg;
4375
4376 switch(segflg) {
4377
4378 case UIO_USERSPACE32:
4379 case UIO_USERISPACE32:
4380 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4381 break;
4382
4383 case UIO_USERSPACE:
4384 case UIO_USERISPACE:
4385 uio->uio_segflg = UIO_PHYS_USERSPACE;
4386 break;
4387
4388 case UIO_USERSPACE64:
4389 case UIO_USERISPACE64:
4390 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4391 break;
4392
4393 case UIO_SYSSPACE32:
4394 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4395 break;
4396
4397 case UIO_SYSSPACE:
4398 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4399 break;
4400
4401 case UIO_SYSSPACE64:
4402 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4403 break;
4404 }
4405 pl = ubc_upl_pageinfo(upl);
4406
4407 pg_index = upl_offset / PAGE_SIZE;
4408 pg_offset = upl_offset & PAGE_MASK;
4409 csize = min(PAGE_SIZE - pg_offset, xsize);
4410
4411 while (xsize && retval == 0) {
4412 addr64_t paddr;
4413
4414 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4415
4416 retval = uiomove64(paddr, csize, uio);
4417
4418 pg_index += 1;
4419 pg_offset = 0;
4420 xsize -= csize;
4421 csize = min(PAGE_SIZE, xsize);
4422 }
4423 uio->uio_segflg = segflg;
4424
4425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4426 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4427
4428 return (retval);
4429 }
4430
4431
4432 int
4433 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4434 {
4435 int segflg;
4436 int io_size;
4437 int xsize;
4438 int start_offset;
4439 int retval = 0;
4440 memory_object_control_t control;
4441
4442
4443 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4444 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4445
4446 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4447 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4448 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4449 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4450
4451 return(0);
4452 }
4453 segflg = uio->uio_segflg;
4454
4455 switch(segflg) {
4456
4457 case UIO_USERSPACE32:
4458 case UIO_USERISPACE32:
4459 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4460 break;
4461
4462 case UIO_USERSPACE64:
4463 case UIO_USERISPACE64:
4464 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4465 break;
4466
4467 case UIO_SYSSPACE32:
4468 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4469 break;
4470
4471 case UIO_SYSSPACE64:
4472 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4473 break;
4474
4475 case UIO_USERSPACE:
4476 case UIO_USERISPACE:
4477 uio->uio_segflg = UIO_PHYS_USERSPACE;
4478 break;
4479
4480 case UIO_SYSSPACE:
4481 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4482 break;
4483 }
4484
4485 if ( (io_size = *io_resid) ) {
4486 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4487 xsize = uio_resid(uio);
4488
4489 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4490 uio, start_offset, io_size, mark_dirty);
4491 xsize -= uio_resid(uio);
4492 io_size -= xsize;
4493 }
4494 uio->uio_segflg = segflg;
4495 *io_resid = io_size;
4496
4497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4498 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4499
4500 return(retval);
4501 }
4502
4503
4504 int
4505 is_file_clean(vnode_t vp, off_t filesize)
4506 {
4507 off_t f_offset;
4508 int flags;
4509 int total_dirty = 0;
4510
4511 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4512 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4513 if (flags & UPL_POP_DIRTY) {
4514 total_dirty++;
4515 }
4516 }
4517 }
4518 if (total_dirty)
4519 return(EINVAL);
4520
4521 return (0);
4522 }
4523
4524
4525
4526 /*
4527 * Dirty region tracking/clustering mechanism.
4528 *
4529 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4530 * dirty regions within a larger space (file). It is primarily intended to
4531 * support clustering in large files with many dirty areas.
4532 *
4533 * The implementation assumes that the dirty regions are pages.
4534 *
4535 * To represent dirty pages within the file, we store bit vectors in a
4536 * variable-size circular hash.
4537 */
4538
4539 /*
4540 * Bitvector size. This determines the number of pages we group in a
4541 * single hashtable entry. Each hashtable entry is aligned to this
4542 * size within the file.
4543 */
4544 #define DRT_BITVECTOR_PAGES 256
4545
4546 /*
4547 * File offset handling.
4548 *
4549 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4550 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
4551 */
4552 #define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4553 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
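/*
 * worked example (assuming 4K pages, so each hash entry spans
 * 256 * 4096 bytes == 1MB of the file): for a file offset of 0x00123456,
 * DRT_ALIGN_ADDRESS() yields 0x00100000, and the page in question is
 * page (0x00123456 - 0x00100000) / PAGE_SIZE == 0x23 (35) within that
 * entry's bitvector
 */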
4554
4555 /*
4556 * Hashtable address field handling.
4557 *
4558 * The low-order bits of the hashtable address are used to conserve
4559 * space.
4560 *
4561 * DRT_HASH_COUNT_MASK must be large enough to store the range
4562 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4563 * to indicate that the bucket is actually unoccupied.
4564 */
4565 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4566 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
4567 do { \
4568 (scm)->scm_hashtable[(i)].dhe_control = \
4569 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4570 } while (0)
4571 #define DRT_HASH_COUNT_MASK 0x1ff
4572 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4573 #define DRT_HASH_SET_COUNT(scm, i, c) \
4574 do { \
4575 (scm)->scm_hashtable[(i)].dhe_control = \
4576 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4577 } while (0)
4578 #define DRT_HASH_CLEAR(scm, i) \
4579 do { \
4580 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4581 } while (0)
4582 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4583 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4584 #define DRT_HASH_COPY(oscm, oi, scm, i) \
4585 do { \
4586 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4587 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4588 } while (0)
4589
4590
4591 /*
4592 * Hash table moduli.
4593 *
4594 * Since the hashtable entry's size is dependent on the size of
4595 * the bitvector, and since the hashtable size is constrained to
4596 * both being prime and fitting within the desired allocation
4597 * size, these values need to be manually determined.
4598 *
4599 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4600 *
4601 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4602 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4603 */
4604 #define DRT_HASH_SMALL_MODULUS 23
4605 #define DRT_HASH_LARGE_MODULUS 401
4606
4607 #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4608 #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
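/*
 * sanity arithmetic for the values above: each hash entry is
 * 8 (dhe_control) + (256 / 32) * 4 (dhe_bitvector) == 40 bytes, so the
 * entries occupy 23 * 40 == 920 and 401 * 40 == 16040 bytes, leaving the
 * quoted 104 and 344 spare bytes of the 1024 and 16384 byte allocations
 * (that spare space also has to cover the vfs_drt_clustermap header)...
 * both moduli are prime, as the hash scheme requires
 */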
4609
4610 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4611
4612 /*
4613 * Hashtable bitvector handling.
4614 *
4615 * Bitvector fields are 32 bits long.
4616 */
4617
4618 #define DRT_HASH_SET_BIT(scm, i, bit) \
4619 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4620
4621 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4622 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4623
4624 #define DRT_HASH_TEST_BIT(scm, i, bit) \
4625 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4626
4627 #define DRT_BITVECTOR_CLEAR(scm, i) \
4628 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4629
4630 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4631 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4632 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4633 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
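/*
 * continuing the worked example above: page index 35 within an entry is
 * tracked in dhe_bitvector[35 / 32] == dhe_bitvector[1], under the mask
 * (1 << (35 % 32)) == (1 << 3)
 */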
4634
4635
4636
4637 /*
4638 * Hashtable entry.
4639 */
4640 struct vfs_drt_hashentry {
4641 u_int64_t dhe_control;
4642 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4643 };
4644
4645 /*
4646 * Dirty Region Tracking structure.
4647 *
4648 * The hashtable is allocated entirely inside the DRT structure.
4649 *
4650 * The hash is a simple circular prime modulus arrangement, the structure
4651 * is resized from small to large if it overflows.
4652 */
4653
4654 struct vfs_drt_clustermap {
4655 u_int32_t scm_magic; /* sanity/detection */
4656 #define DRT_SCM_MAGIC 0x12020003
4657 u_int32_t scm_modulus; /* current ring size */
4658 u_int32_t scm_buckets; /* number of occupied buckets */
4659 u_int32_t scm_lastclean; /* last entry we cleaned */
4660 u_int32_t scm_iskips; /* number of slot skips */
4661
4662 struct vfs_drt_hashentry scm_hashtable[0];
4663 };
4664
4665
4666 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4667 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
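/*
 * lookups hash the DRT_ALIGN_ADDRESS()-aligned offset with DRT_HASH() and
 * then probe linearly with DRT_HASH_NEXT() until either a matching entry
 * or a vacant slot is found (see vfs_drt_search_index() and
 * vfs_drt_get_index() below)
 */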
4668
4669 /*
4670 * Debugging codes and arguments.
4671 */
4672 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4673 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4674 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4675 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4676 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4677 * dirty */
4678 /* 0, setcount */
4679 /* 1 (clean, no map) */
4680 /* 2 (map alloc fail) */
4681 /* 3, resid (partial) */
4682 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4683 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4684 * lastclean, iskips */
4685
4686
4687 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4688 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4689 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4690 u_int64_t offset, int *indexp);
4691 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4692 u_int64_t offset,
4693 int *indexp,
4694 int recursed);
4695 static kern_return_t vfs_drt_do_mark_pages(
4696 void **cmapp,
4697 u_int64_t offset,
4698 u_int length,
4699 int *setcountp,
4700 int dirty);
4701 static void vfs_drt_trace(
4702 struct vfs_drt_clustermap *cmap,
4703 int code,
4704 int arg1,
4705 int arg2,
4706 int arg3,
4707 int arg4);
4708
4709
4710 /*
4711 * Allocate and initialise a sparse cluster map.
4712 *
4713 * Will allocate a new map, resize or compact an existing map.
4714 *
4715 * XXX we should probably have at least one intermediate map size,
4716 * as the 1:16 ratio seems a bit drastic.
4717 */
4718 static kern_return_t
4719 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4720 {
4721 struct vfs_drt_clustermap *cmap, *ocmap;
4722 kern_return_t kret;
4723 u_int64_t offset;
4724 int nsize, i, active_buckets, index, copycount;
4725
4726 ocmap = NULL;
4727 if (cmapp != NULL)
4728 ocmap = *cmapp;
4729
4730 /*
4731 * Decide on the size of the new map.
4732 */
4733 if (ocmap == NULL) {
4734 nsize = DRT_HASH_SMALL_MODULUS;
4735 } else {
4736 /* count the number of active buckets in the old map */
4737 active_buckets = 0;
4738 for (i = 0; i < ocmap->scm_modulus; i++) {
4739 if (!DRT_HASH_VACANT(ocmap, i) &&
4740 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4741 active_buckets++;
4742 }
4743 /*
4744 * If we're currently using the small allocation, check to
4745 * see whether we should grow to the large one.
4746 */
4747 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4748 /* if the ring is nearly full */
4749 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4750 nsize = DRT_HASH_LARGE_MODULUS;
4751 } else {
4752 nsize = DRT_HASH_SMALL_MODULUS;
4753 }
4754 } else {
4755 /* already using the large modulus */
4756 nsize = DRT_HASH_LARGE_MODULUS;
4757 /*
4758 * If the ring is completely full, there's
4759 * nothing useful for us to do. Behave as
4760 * though we had compacted into the new
4761 * array and return.
4762 */
4763 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4764 return(KERN_SUCCESS);
4765 }
4766 }
4767
4768 /*
4769 * Allocate and initialise the new map.
4770 */
4771
4772 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4773 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4774 if (kret != KERN_SUCCESS)
4775 return(kret);
4776 cmap->scm_magic = DRT_SCM_MAGIC;
4777 cmap->scm_modulus = nsize;
4778 cmap->scm_buckets = 0;
4779 cmap->scm_lastclean = 0;
4780 cmap->scm_iskips = 0;
4781 for (i = 0; i < cmap->scm_modulus; i++) {
4782 DRT_HASH_CLEAR(cmap, i);
4783 DRT_HASH_VACATE(cmap, i);
4784 DRT_BITVECTOR_CLEAR(cmap, i);
4785 }
4786
4787 /*
4788 * If there's an old map, re-hash entries from it into the new map.
4789 */
4790 copycount = 0;
4791 if (ocmap != NULL) {
4792 for (i = 0; i < ocmap->scm_modulus; i++) {
4793 /* skip empty buckets */
4794 if (DRT_HASH_VACANT(ocmap, i) ||
4795 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4796 continue;
4797 /* get new index */
4798 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4799 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4800 if (kret != KERN_SUCCESS) {
4801 /* XXX need to bail out gracefully here */
4802 panic("vfs_drt: new cluster map mysteriously too small");
4803 }
4804 /* copy */
4805 DRT_HASH_COPY(ocmap, i, cmap, index);
4806 copycount++;
4807 }
4808 }
4809
4810 /* log what we've done */
4811 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4812
4813 /*
4814 * It's important to ensure that *cmapp always points to
4815 * a valid map, so we must overwrite it before freeing
4816 * the old map.
4817 */
4818 *cmapp = cmap;
4819 if (ocmap != NULL) {
4820 /* emit stats into trace buffer */
4821 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4822 ocmap->scm_modulus,
4823 ocmap->scm_buckets,
4824 ocmap->scm_lastclean,
4825 ocmap->scm_iskips);
4826
4827 vfs_drt_free_map(ocmap);
4828 }
4829 return(KERN_SUCCESS);
4830 }
4831
4832
4833 /*
4834 * Free a sparse cluster map.
4835 */
4836 static kern_return_t
4837 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4838 {
4839 kmem_free(kernel_map, (vm_offset_t)cmap,
4840 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4841 return(KERN_SUCCESS);
4842 }
4843
4844
4845 /*
4846 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4847 */
4848 static kern_return_t
4849 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4850 {
4851 int index, i;
4852
4853 offset = DRT_ALIGN_ADDRESS(offset);
4854 index = DRT_HASH(cmap, offset);
4855
4856 /* traverse the hashtable */
4857 for (i = 0; i < cmap->scm_modulus; i++) {
4858
4859 /*
4860 * If the slot is vacant, we can stop.
4861 */
4862 if (DRT_HASH_VACANT(cmap, index))
4863 break;
4864
4865 /*
4866 * If the address matches our offset, we have success.
4867 */
4868 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4869 *indexp = index;
4870 return(KERN_SUCCESS);
4871 }
4872
4873 /*
4874 * Move to the next slot, try again.
4875 */
4876 index = DRT_HASH_NEXT(cmap, index);
4877 }
4878 /*
4879 * It's not there.
4880 */
4881 return(KERN_FAILURE);
4882 }
4883
4884 /*
4885 * Find the hashtable slot for the supplied offset. If we haven't allocated
4886 * one yet, allocate one and populate the address field. Note that it will
4887 * not have a nonzero page count and thus will still technically be free, so
4888 * in the case where we are called to clean pages, the slot will remain free.
4889 */
4890 static kern_return_t
4891 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4892 {
4893 struct vfs_drt_clustermap *cmap;
4894 kern_return_t kret;
4895 int index, i;
4896
4897 cmap = *cmapp;
4898
4899 /* look for an existing entry */
4900 kret = vfs_drt_search_index(cmap, offset, indexp);
4901 if (kret == KERN_SUCCESS)
4902 return(kret);
4903
4904 /* need to allocate an entry */
4905 offset = DRT_ALIGN_ADDRESS(offset);
4906 index = DRT_HASH(cmap, offset);
4907
4908 /* scan from the index forwards looking for a vacant slot */
4909 for (i = 0; i < cmap->scm_modulus; i++) {
4910 /* slot vacant? */
4911 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4912 cmap->scm_buckets++;
4913 if (index < cmap->scm_lastclean)
4914 cmap->scm_lastclean = index;
4915 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4916 DRT_HASH_SET_COUNT(cmap, index, 0);
4917 DRT_BITVECTOR_CLEAR(cmap, index);
4918 *indexp = index;
4919 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4920 return(KERN_SUCCESS);
4921 }
4922 cmap->scm_iskips += i;
4923 index = DRT_HASH_NEXT(cmap, index);
4924 }
4925
4926 /*
4927 * We haven't found a vacant slot, so the map is full. If we haven't
4928 * already recursed, try reallocating/compacting it.
4929 */
4930 if (recursed)
4931 return(KERN_FAILURE);
4932 kret = vfs_drt_alloc_map(cmapp);
4933 if (kret == KERN_SUCCESS) {
4934 /* now try to insert again */
4935 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4936 }
4937 return(kret);
4938 }
4939
4940 /*
4941 * Implementation of set dirty/clean.
4942 *
4943 * In the 'clean' case, not finding a map is OK.
4944 */
4945 static kern_return_t
4946 vfs_drt_do_mark_pages(
4947 void **private,
4948 u_int64_t offset,
4949 u_int length,
4950 int *setcountp,
4951 int dirty)
4952 {
4953 struct vfs_drt_clustermap *cmap, **cmapp;
4954 kern_return_t kret;
4955 int i, index, pgoff, pgcount, setcount, ecount;
4956
4957 cmapp = (struct vfs_drt_clustermap **)private;
4958 cmap = *cmapp;
4959
4960 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4961
4962 if (setcountp != NULL)
4963 *setcountp = 0;
4964
4965 /* allocate a cluster map if we don't already have one */
4966 if (cmap == NULL) {
4967 /* no cluster map, nothing to clean */
4968 if (!dirty) {
4969 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4970 return(KERN_SUCCESS);
4971 }
4972 kret = vfs_drt_alloc_map(cmapp);
4973 if (kret != KERN_SUCCESS) {
4974 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4975 return(kret);
4976 }
4977 }
4978 setcount = 0;
4979
4980 /*
4981 * Iterate over the length of the region.
4982 */
4983 while (length > 0) {
4984 /*
4985 * Get the hashtable index for this offset.
4986 *
4987 * XXX this will add blank entries if we are clearing a range
4988 * that hasn't been dirtied.
4989 */
4990 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4991 cmap = *cmapp; /* may have changed! */
4992 /* this may be a partial-success return */
4993 if (kret != KERN_SUCCESS) {
4994 if (setcountp != NULL)
4995 *setcountp = setcount;
4996 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4997
4998 return(kret);
4999 }
5000
5001 /*
5002 * Work out how many pages we're modifying in this
5003 * hashtable entry.
5004 */
5005 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
5006 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
5007
5008 /*
5009 * Iterate over pages, dirtying or clearing as we go.
5010 */
5011 ecount = DRT_HASH_GET_COUNT(cmap, index);
5012 for (i = 0; i < pgcount; i++) {
5013 if (dirty) {
5014 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5015 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
5016 ecount++;
5017 setcount++;
5018 }
5019 } else {
5020 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5021 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5022 ecount--;
5023 setcount++;
5024 }
5025 }
5026 }
5027 DRT_HASH_SET_COUNT(cmap, index, ecount);
5028
5029 offset += pgcount * PAGE_SIZE;
5030 length -= pgcount * PAGE_SIZE;
5031 }
5032 if (setcountp != NULL)
5033 *setcountp = setcount;
5034
5035 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5036
5037 return(KERN_SUCCESS);
5038 }
5039
5040 /*
5041 * Mark a set of pages as dirty/clean.
5042 *
5043 * This is a public interface.
5044 *
5045 * cmapp
5046 * Pointer to storage suitable for holding a pointer. Note that
5047 * this must either be NULL or a value set by this function.
5048 *
5049 * size
5050 * Current file size in bytes.
5051 *
5052 * offset
5053 * Offset of the first page to be marked as dirty, in bytes. Must be
5054 * page-aligned.
5055 *
5056 * length
5057 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5058 *
5059 * setcountp
5060 * Number of pages newly marked dirty by this call (optional).
5061 *
5062 * Returns KERN_SUCCESS if all the pages were successfully marked.
5063 */
5064 static kern_return_t
5065 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5066 {
5067 /* XXX size unused, drop from interface */
5068 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5069 }
5070
5071 #if 0
5072 static kern_return_t
5073 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5074 {
5075 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5076 }
5077 #endif
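/*
 * illustrative sketch (not compiled) of how the two public entry points
 * fit together... the real callers are sparse_cluster_add() and
 * sparse_cluster_push() earlier in this file; 'scmap', 'offset', 'length'
 * and the push step are hypothetical stand-ins
 */
#if 0
static void
vfs_drt_example(off_t offset, u_int length)
{
	void	*scmap = NULL;		/* private storage managed by the map code */
	off_t	cl_offset;
	u_int	cl_length;
	int	new_dirty;

	/* record a page-aligned dirty range (typically called repeatedly) */
	(void) vfs_drt_mark_pages(&scmap, offset, length, &new_dirty);

	/* drain the map, one cluster of contiguous dirty pages at a time */
	while (vfs_drt_get_cluster(&scmap, &cl_offset, &cl_length) == KERN_SUCCESS) {
		/* push pages [cl_offset, cl_offset + cl_length) to disk here */
	}
	/*
	 * once no dirty pages remain, vfs_drt_get_cluster() frees the map
	 * and resets scmap to NULL
	 */
}
#endif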
5078
5079 /*
5080 * Get a cluster of dirty pages.
5081 *
5082 * This is a public interface.
5083 *
5084 * cmapp
5085 * Pointer to storage managed by drt_mark_pages. Note that this must
5086 * be NULL or a value set by drt_mark_pages.
5087 *
5088 * offsetp
5089 * Returns the byte offset into the file of the first page in the cluster.
5090 *
5091 * lengthp
5092 * Returns the length in bytes of the cluster of dirty pages.
5093 *
5094 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
5095 * are no dirty pages meeting the minimum size criteria. Private storage will
5096 * be released if there are no more dirty pages left in the map.
5097 *
5098 */
5099 static kern_return_t
5100 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5101 {
5102 struct vfs_drt_clustermap *cmap;
5103 u_int64_t offset;
5104 u_int length;
5105 int index, i, j, fs, ls;
5106
5107 /* sanity */
5108 if ((cmapp == NULL) || (*cmapp == NULL))
5109 return(KERN_FAILURE);
5110 cmap = *cmapp;
5111
5112 /* walk the hashtable */
5113 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5114 index = DRT_HASH(cmap, offset);
5115
5116 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5117 continue;
5118
5119 /* scan the bitfield for a string of bits */
5120 fs = -1;
5121
5122 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5123 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5124 fs = i;
5125 break;
5126 }
5127 }
5128 if (fs == -1) {
5129 /* didn't find any bits set */
5130 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5131 }
5132 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5133 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5134 break;
5135 }
5136
5137 /* compute offset and length, mark pages clean */
5138 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5139 length = ls * PAGE_SIZE;
5140 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5141 cmap->scm_lastclean = index;
5142
5143 /* return successful */
5144 *offsetp = (off_t)offset;
5145 *lengthp = length;
5146
5147 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5148 return(KERN_SUCCESS);
5149 }
5150 /*
5151 * We didn't find anything... hashtable is empty
5152 * emit stats into trace buffer and
5153 * then free it
5154 */
5155 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5156 cmap->scm_modulus,
5157 cmap->scm_buckets,
5158 cmap->scm_lastclean,
5159 cmap->scm_iskips);
5160
5161 vfs_drt_free_map(cmap);
5162 *cmapp = NULL;
5163
5164 return(KERN_FAILURE);
5165 }
5166
5167
5168 static kern_return_t
5169 vfs_drt_control(void **cmapp, int op_type)
5170 {
5171 struct vfs_drt_clustermap *cmap;
5172
5173 /* sanity */
5174 if ((cmapp == NULL) || (*cmapp == NULL))
5175 return(KERN_FAILURE);
5176 cmap = *cmapp;
5177
5178 switch (op_type) {
5179 case 0:
5180 /* emit stats into trace buffer */
5181 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5182 cmap->scm_modulus,
5183 cmap->scm_buckets,
5184 cmap->scm_lastclean,
5185 cmap->scm_iskips);
5186
5187 vfs_drt_free_map(cmap);
5188 *cmapp = NULL;
5189 break;
5190
5191 case 1:
5192 cmap->scm_lastclean = 0;
5193 break;
5194 }
5195 return(KERN_SUCCESS);
5196 }
5197
5198
5199
5200 /*
5201 * Emit a summary of the state of the clustermap into the trace buffer
5202 * along with some caller-provided data.
5203 */
5204 #if KDEBUG
5205 static void
5206 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5207 {
5208 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5209 }
5210 #else
5211 static void
5212 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5213 __unused int arg1, __unused int arg2, __unused int arg3,
5214 __unused int arg4)
5215 {
5216 }
5217 #endif
5218
5219 #if 0
5220 /*
5221 * Perform basic sanity check on the hash entry summary count
5222 * vs. the actual bits set in the entry.
5223 */
5224 static void
5225 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5226 {
5227 int index, i;
5228 int bits_on;
5229
5230 for (index = 0; index < cmap->scm_modulus; index++) {
5231 if (DRT_HASH_VACANT(cmap, index))
5232 continue;
5233
5234 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5235 if (DRT_HASH_TEST_BIT(cmap, index, i))
5236 bits_on++;
5237 }
5238 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5239 panic("bits_on = %d, index = %d\n", bits_on, index);
5240 }
5241 }
5242 #endif