bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/buf_internal.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/vnode_internal.h>
  69 #include <sys/trace.h>
  70 #include <sys/malloc.h>
  71 #include <sys/time.h>
  72 #include <sys/kernel.h>
  73 #include <sys/resourcevar.h>
  74 #include <sys/uio_internal.h>
  75 #include <libkern/libkern.h>
  76 #include <machine/machine_routines.h>
  77
  78 #include <sys/ubc_internal.h>
  79 #include <vm/vnode_pager.h>
  80
  81 #include <mach/mach_types.h>
  82 #include <mach/memory_object_types.h>
  83 #include <mach/vm_map.h>
  84 #include <mach/upl.h>
  85
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_map.h>
  88 #include <vm/vm_pageout.h>
  89
  90 #include <sys/kdebug.h>
  91
  92 #define CL_READ         0x01
  93 #define CL_ASYNC        0x02
  94 #define CL_COMMIT       0x04
  95 #define CL_PAGEOUT      0x10
  96 #define CL_AGE          0x20
  97 #define CL_NOZERO       0x40
  98 #define CL_PAGEIN       0x80
  99 #define CL_DEV_MEMORY   0x100
 100 #define CL_PRESERVE     0x200
 101 #define CL_THROTTLE     0x400
 102 #define CL_KEEPCACHED   0x800
 103 #define CL_DIRECT_IO    0x1000
 104 #define CL_PASSIVE      0x2000
 105
 106
 107 struct clios {
 108         u_int  io_completed;       /* amount of io that has currently completed */
 109         u_int  io_issued;          /* amount of io that was successfully issued */
 110         int    io_error;           /* error code of first error encountered */
 111         int    io_wanted;          /* someone is sleeping waiting for a change in state */
 112 };
 113
 114 static lck_grp_t        *cl_mtx_grp;
 115 static lck_attr_t       *cl_mtx_attr;
 116 static lck_grp_attr_t   *cl_mtx_grp_attr;
 117 static lck_mtx_t        *cl_mtxp;
 118
 119
 120 #define IO_UNKNOWN      0
 121 #define IO_DIRECT       1
 122 #define IO_CONTIG       2
 123 #define IO_COPY         3
 124
 125 #define PUSH_DELAY      0x01
 126 #define PUSH_ALL        0x02
 127 #define PUSH_SYNC       0x04
 128
 129
 130 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
 131 static void cluster_wait_IO(buf_t cbp_head, int async);
 132 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
 133
 134 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
 135
 136 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 137                       int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
 138 static int cluster_iodone(buf_t bp, void *callback_arg);
 139 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
 140 static int cluster_hard_throttle_on(vnode_t vp);
 141
 142 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);
 143
 144 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags);
 145 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
 146
 147 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size,  off_t filesize, int flags,
 148                              int (*)(buf_t, void *), void *callback_arg);
 149 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
 150                                int flags, int (*)(buf_t, void *), void *callback_arg);
 151 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
 152                                int (*)(buf_t, void *), void *callback_arg, int flags);
 153
 154 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
 155                               off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
 156 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
 157                                 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
 158 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
 159                                 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
 160
 161 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
 162
 163 static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
 164 static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
 165
 166 static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);
 167
 168 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);
 169
 170 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
 171 static void     sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);
 172 static void     sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
 173
 174 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
 175 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 176 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 177
 178 int     is_file_clean(vnode_t, off_t);
 179
 180 /*
 181  * limit the internal I/O size so that we
 182  * can represent it in a 32 bit int
 183  */
 184 #define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 256)
 185 #define MAX_IO_CONTIG_SIZE      (1024 * 1024 * 8)
 186 #define MAX_VECTS       16
 187
 188 /*
 189  * note:  MAX_CLUSTER_SIZE CANNOT be larger than MAX_UPL_TRANSFER
 190  */
 191 #define MAX_CLUSTER_SIZE        (MAX_UPL_TRANSFER)
 192 #define MAX_PREFETCH            (MAX_CLUSTER_SIZE * PAGE_SIZE * 2)
 193 #define MIN_DIRECT_WRITE_SIZE   (4 * PAGE_SIZE)
 194
 195
 196 int speculative_reads_disabled = 0;
 197
 198 /*
 199  * throttle the number of async writes that
 200  * can be outstanding on a single vnode
 201  * before we issue a synchronous write
 202  */
 203 #define HARD_THROTTLE_MAXCNT    0
 204 #define HARD_THROTTLE_MAXSIZE   (64 * 1024)
 205
 206 int hard_throttle_on_root = 0;
 207 struct timeval priority_IO_timestamp_for_root;
 208
 209
 210 void
 211 cluster_init(void) {
 212         /*
 213          * allocate lock group attribute and group
 214          */
 215         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 216         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 217
 218         /*
 219          * allocate the lock attribute
 220          */
 221         cl_mtx_attr = lck_attr_alloc_init();
 222
 223         /*
 224          * allocate and initialize mutex's used to protect updates and waits
 225          * on the cluster_io context
 226          */
 227         cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 228
 229         if (cl_mtxp == NULL)
 230                 panic("cluster_init: failed to allocate cl_mtxp");
 231 }
 232
 233
 234
 235 #define CLW_ALLOCATE            0x01
 236 #define CLW_RETURNLOCKED        0x02
 237 #define CLW_IONOCACHE           0x04
 238 #define CLW_IOPASSIVE   0x08
 239
 240 /*
 241  * if the read ahead context doesn't yet exist,
 242  * allocate and initialize it...
 243  * the vnode lock serializes multiple callers
 244  * during the actual assignment... first one
 245  * to grab the lock wins... the other callers
 246  * will release the now unnecessary storage
 247  *
 248  * once the context is present, try to grab (but don't block on)
 249  * the lock associated with it... if someone
 250  * else currently owns it, than the read
 251  * will run without read-ahead.  this allows
 252  * multiple readers to run in parallel and
 253  * since there's only 1 read ahead context,
 254  * there's no real loss in only allowing 1
 255  * reader to have read-ahead enabled.
 256  */
 257 static struct cl_readahead *
 258 cluster_get_rap(vnode_t vp)
 259 {
 260         struct ubc_info         *ubc;
 261         struct cl_readahead     *rap;
 262
 263         ubc = vp->v_ubcinfo;
 264
 265         if ((rap = ubc->cl_rahead) == NULL) {
 266                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 267
 268                 bzero(rap, sizeof *rap);
 269                 rap->cl_lastr = -1;
 270                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 271
 272                 vnode_lock(vp);
 273
 274                 if (ubc->cl_rahead == NULL)
 275                         ubc->cl_rahead = rap;
 276                 else {
 277                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 278                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 279                         rap = ubc->cl_rahead;
 280                 }
 281                 vnode_unlock(vp);
 282         }
 283         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 284                 return(rap);
 285
 286         return ((struct cl_readahead *)NULL);
 287 }
 288
 289
 290 /*
 291  * if the write behind context doesn't yet exist,
 292  * and CLW_ALLOCATE is specified, allocate and initialize it...
 293  * the vnode lock serializes multiple callers
 294  * during the actual assignment... first one
 295  * to grab the lock wins... the other callers
 296  * will release the now unnecessary storage
 297  *
 298  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 299  * the lock associated with the write behind context before
 300  * returning
 301  */
 302
 303 static struct cl_writebehind *
 304 cluster_get_wbp(vnode_t vp, int flags)
 305 {
 306         struct ubc_info *ubc;
 307         struct cl_writebehind *wbp;
 308
 309         ubc = vp->v_ubcinfo;
 310
 311         if ((wbp = ubc->cl_wbehind) == NULL) {
 312
 313                 if ( !(flags & CLW_ALLOCATE))
 314                         return ((struct cl_writebehind *)NULL);
 315
 316                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 317
 318                 bzero(wbp, sizeof *wbp);
 319                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 320
 321                 vnode_lock(vp);
 322
 323                 if (ubc->cl_wbehind == NULL)
 324                         ubc->cl_wbehind = wbp;
 325                 else {
 326                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 327                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 328                         wbp = ubc->cl_wbehind;
 329                 }
 330                 vnode_unlock(vp);
 331         }
 332         if (flags & CLW_RETURNLOCKED)
 333                 lck_mtx_lock(&wbp->cl_lockw);
 334
 335         return (wbp);
 336 }
 337
 338
 339 static void
 340 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
 341 {
 342         struct cl_writebehind *wbp;
 343
 344         if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
 345
 346                 if (wbp->cl_number) {
 347                         lck_mtx_lock(&wbp->cl_lockw);
 348
 349                         cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg);
 350
 351                         lck_mtx_unlock(&wbp->cl_lockw);
 352                 }
 353         }
 354 }
 355
 356
 357 static int
 358 cluster_hard_throttle_on(vnode_t vp)
 359 {
 360         static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
 361
 362         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 363                 struct timeval elapsed;
 364
 365                 if (hard_throttle_on_root)
 366                         return(1);
 367
 368                 microuptime(&elapsed);
 369                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 370
 371                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 372                         return(1);
 373         }
 374         return(0);
 375 }
 376
 377
 378 static int
 379 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags)
 380 {
 381         int upl_abort_code = 0;
 382         int page_in  = 0;
 383         int page_out = 0;
 384
 385         if (io_flags & B_PHYS)
 386                 /*
 387                  * direct write of any flavor, or a direct read that wasn't aligned
 388                  */
 389                 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
 390         else {
 391                 if (io_flags & B_PAGEIO) {
 392                         if (io_flags & B_READ)
 393                                 page_in  = 1;
 394                         else
 395                                 page_out = 1;
 396                 }
 397                 if (io_flags & B_CACHE)
 398                         /*
 399                          * leave pages in the cache unchanged on error
 400                          */
 401                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 402                 else if (page_out && (error != ENXIO))
 403                         /*
 404                          * transient error... leave pages unchanged
 405                          */
 406                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 407                 else if (page_in)
 408                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 409                 else
 410                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 411
 412                 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
 413         }
 414         return (upl_abort_code);
 415 }
 416
 417
 418 static int
 419 cluster_iodone(buf_t bp, void *callback_arg)
 420 {
 421         int     b_flags;
 422         int     error;
 423         int     total_size;
 424         int     total_resid;
 425         int     upl_offset;
 426         int     zero_offset;
 427         int     pg_offset = 0;
 428         int     commit_size = 0;
 429         int     upl_flags = 0;
 430         int     transaction_size = 0;
 431         upl_t   upl;
 432         buf_t   cbp;
 433         buf_t   cbp_head;
 434         buf_t   cbp_next;
 435         buf_t   real_bp;
 436         struct  clios *iostate;
 437         boolean_t       transaction_complete = FALSE;
 438
 439         cbp_head = (buf_t)(bp->b_trans_head);
 440
 441         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 442                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 443
 444         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 445                 /*
 446                  * all I/O requests that are part of this transaction
 447                  * have to complete before we can process it
 448                  */
 449                 if ( !(cbp->b_flags & B_DONE)) {
 450
 451                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 452                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
 453
 454                         return 0;
 455                 }
 456                 if (cbp->b_flags & B_EOT)
 457                         transaction_complete = TRUE;
 458         }
 459         if (transaction_complete == FALSE) {
 460                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 461                              (int)cbp_head, 0, 0, 0, 0);
 462
 463                 return 0;
 464         }
 465         error       = 0;
 466         total_size  = 0;
 467         total_resid = 0;
 468
 469         cbp        = cbp_head;
 470         upl_offset = cbp->b_uploffset;
 471         upl        = cbp->b_upl;
 472         b_flags    = cbp->b_flags;
 473         real_bp    = cbp->b_real_bp;
 474         zero_offset= cbp->b_validend;
 475         iostate    = (struct clios *)cbp->b_iostate;
 476
 477         if (real_bp)
 478                 real_bp->b_dev = cbp->b_dev;
 479
 480         while (cbp) {
 481                 if ((cbp->b_flags & B_ERROR) && error == 0)
 482                         error = cbp->b_error;
 483
 484                 total_resid += cbp->b_resid;
 485                 total_size  += cbp->b_bcount;
 486
 487                 cbp_next = cbp->b_trans_next;
 488
 489                 if (cbp_next == NULL)
 490                         /*
 491                          * compute the overall size of the transaction
 492                          * in case we created one that has 'holes' in it
 493                          * 'total_size' represents the amount of I/O we
 494                          * did, not the span of the transaction w/r to the UPL
 495                          */
 496                         transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
 497
 498                 if (cbp != cbp_head)
 499                         free_io_buf(cbp);
 500
 501                 cbp = cbp_next;
 502         }
 503         if (error == 0 && total_resid)
 504                 error = EIO;
 505
 506         if (error == 0) {
 507                 int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
 508
 509                 if (cliodone_func != NULL) {
 510                         cbp_head->b_bcount = transaction_size;
 511
 512                         error = (*cliodone_func)(cbp_head, callback_arg);
 513                 }
 514         }
 515         if (zero_offset)
 516                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 517
 518         free_io_buf(cbp_head);
 519
 520         if (iostate) {
 521                 int need_wakeup = 0;
 522
 523                 /*
 524                  * someone has issued multiple I/Os asynchrounsly
 525                  * and is waiting for them to complete (streaming)
 526                  */
 527                 lck_mtx_lock_spin(cl_mtxp);
 528
 529                 if (error && iostate->io_error == 0)
 530                         iostate->io_error = error;
 531
 532                 iostate->io_completed += total_size;
 533
 534                 if (iostate->io_wanted) {
 535                         /*
 536                          * someone is waiting for the state of
 537                          * this io stream to change
 538                          */
 539                         iostate->io_wanted = 0;
 540                         need_wakeup = 1;
 541                 }
 542                 lck_mtx_unlock(cl_mtxp);
 543
 544                 if (need_wakeup)
 545                         wakeup((caddr_t)&iostate->io_wanted);
 546         }
 547
 548         if (b_flags & B_COMMIT_UPL) {
 549
 550                 pg_offset   = upl_offset & PAGE_MASK;
 551                 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 552
 553                 if (error)
 554                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags);
 555                 else {
 556                         upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
 557
 558                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
 559                                 upl_flags |= UPL_COMMIT_SET_DIRTY;
 560
 561                         if (b_flags & B_AGE)
 562                                 upl_flags |= UPL_COMMIT_INACTIVATE;
 563
 564                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
 565                 }
 566         }
 567         if ((b_flags & B_NEED_IODONE) && real_bp) {
 568                 if (error) {
 569                         real_bp->b_flags |= B_ERROR;
 570                         real_bp->b_error = error;
 571                 }
 572                 real_bp->b_resid = total_resid;
 573
 574                 buf_biodone(real_bp);
 575         }
 576         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 577                      (int)upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
 578
 579         return (error);
 580 }
 581
 582
 583 void
 584 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
 585 {
 586
 587         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 588                      upl_offset, size, (int)bp, 0, 0);
 589
 590         if (bp == NULL || bp->b_datap == 0) {
 591                 upl_page_info_t *pl;
 592                 addr64_t        zero_addr;
 593
 594                 pl = ubc_upl_pageinfo(upl);
 595
 596                 if (upl_device_page(pl) == TRUE) {
 597                         zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;
 598
 599                         bzero_phys_nc(zero_addr, size);
 600                 } else {
 601                         while (size) {
 602                                 int     page_offset;
 603                                 int     page_index;
 604                                 int     zero_cnt;
 605
 606                                 page_index  = upl_offset / PAGE_SIZE;
 607                                 page_offset = upl_offset & PAGE_MASK;
 608
 609                                 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
 610                                 zero_cnt  = min(PAGE_SIZE - page_offset, size);
 611
 612                                 bzero_phys(zero_addr, zero_cnt);
 613
 614                                 size       -= zero_cnt;
 615                                 upl_offset += zero_cnt;
 616                         }
 617                 }
 618         } else
 619                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 620
 621         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 622                      upl_offset, size, 0, 0, 0);
 623 }
 624
 625
 626 static void
 627 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
 628 {
 629         cbp_head->b_validend = zero_offset;
 630         cbp_tail->b_flags |= B_EOT;
 631 }
 632
 633 static void
 634 cluster_wait_IO(buf_t cbp_head, int async)
 635 {
 636         buf_t   cbp;
 637
 638         if (async) {
 639                 /*
 640                  * async callback completion will not normally
 641                  * generate a wakeup upon I/O completion...
 642                  * by setting BL_WANTED, we will force a wakeup
 643                  * to occur as any outstanding I/Os complete...
 644                  * I/Os already completed will have BL_CALLDONE already
 645                  * set and we won't block in buf_biowait_callback..
 646                  * note that we're actually waiting for the bp to have
 647                  * completed the callback function... only then
 648                  * can we safely take back ownership of the bp
 649                  * need the main buf mutex in order to safely
 650                  * update b_lflags
 651                  */
 652                 buf_list_lock();
 653
 654                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 655                       cbp->b_lflags |= BL_WANTED;
 656
 657                 buf_list_unlock();
 658         }
 659         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 660                 if (async)
 661                         buf_biowait_callback(cbp);
 662                 else
 663                         buf_biowait(cbp);
 664         }
 665 }
 666
 667 static void
 668 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
 669 {
 670         buf_t   cbp;
 671         int     error;
 672
 673         /*
 674          * cluster_complete_transaction will
 675          * only be called if we've issued a complete chain in synchronous mode
 676          * or, we've already done a cluster_wait_IO on an incomplete chain
 677          */
 678         if (needwait) {
 679                 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
 680                         buf_biowait(cbp);
 681         }
 682         error = cluster_iodone(*cbp_head, callback_arg);
 683
 684         if ( !(flags & CL_ASYNC) && error && *retval == 0) {
 685                 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
 686                         *retval = error;
 687         }
 688         *cbp_head = (buf_t)NULL;
 689 }
 690
 691
 692 static int
 693 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 694            int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
 695 {
 696         buf_t   cbp;
 697         u_int   size;
 698         u_int   io_size;
 699         int     io_flags;
 700         int     bmap_flags;
 701         int     error = 0;
 702         int     retval = 0;
 703         buf_t   cbp_head = NULL;
 704         buf_t   cbp_tail = NULL;
 705         int     trans_count = 0;
 706         int     max_trans_count;
 707         u_int   pg_count;
 708         int     pg_offset;
 709         u_int   max_iosize;
 710         u_int   max_vectors;
 711         int     priv;
 712         int     zero_offset = 0;
 713         int     async_throttle = 0;
 714         mount_t mp;
 715         vm_offset_t upl_end_offset;
 716         boolean_t   need_EOT = FALSE;
 717
 718         /*
 719          * we currently don't support buffers larger than a page
 720          */
 721         if (real_bp && non_rounded_size > PAGE_SIZE)
 722                 panic("%s(): Called with real buffer of size %d bytes which "
 723                                 "is greater than the maximum allowed size of "
 724                                 "%d bytes (the system PAGE_SIZE).\n",
 725                                 __FUNCTION__, non_rounded_size, PAGE_SIZE);
 726
 727         mp = vp->v_mount;
 728
 729         /*
 730          * we don't want to do any funny rounding of the size for IO requests
 731          * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
 732          * belong to us... we can't extend (nor do we need to) the I/O to fill
 733          * out a page
 734          */
 735         if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
 736                 /*
 737                  * round the requested size up so that this I/O ends on a
 738                  * page boundary in case this is a 'write'... if the filesystem
 739                  * has blocks allocated to back the page beyond the EOF, we want to
 740                  * make sure to write out the zero's that are sitting beyond the EOF
 741                  * so that in case the filesystem doesn't explicitly zero this area
 742                  * if a hole is created via a lseek/write beyond the current EOF,
 743                  * it will return zeros when it's read back from the disk.  If the
 744                  * physical allocation doesn't extend for the whole page, we'll
 745                  * only write/read from the disk up to the end of this allocation
 746                  * via the extent info returned from the VNOP_BLOCKMAP call.
 747                  */
 748                 pg_offset = upl_offset & PAGE_MASK;
 749
 750                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
 751         } else {
 752                 /*
 753                  * anyone advertising a blocksize of 1 byte probably
 754                  * can't deal with us rounding up the request size
 755                  * AFP is one such filesystem/device
 756                  */
 757                 size = non_rounded_size;
 758         }
 759         upl_end_offset = upl_offset + size;
 760
 761         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
 762
 763         /*
 764          * Set the maximum transaction size to the maximum desired number of
 765          * buffers.
 766          */
 767         max_trans_count = 8;
 768         if (flags & CL_DEV_MEMORY)
 769                 max_trans_count = 16;
 770
 771         if (flags & CL_READ) {
 772                 io_flags = B_READ;
 773                 bmap_flags = VNODE_READ;
 774
 775                 max_iosize  = mp->mnt_maxreadcnt;
 776                 max_vectors = mp->mnt_segreadcnt;
 777         } else {
 778                 io_flags = B_WRITE;
 779                 bmap_flags = VNODE_WRITE;
 780
 781                 max_iosize  = mp->mnt_maxwritecnt;
 782                 max_vectors = mp->mnt_segwritecnt;
 783         }
 784         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
 785
 786         /*
 787          * make sure the maximum iosize is a
 788          * multiple of the page size
 789          */
 790         max_iosize  &= ~PAGE_MASK;
 791
 792         /*
 793          * Ensure the maximum iosize is sensible.
 794          */
 795         if (!max_iosize)
 796                 max_iosize = PAGE_SIZE;
 797
 798         if (flags & CL_THROTTLE) {
 799                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 800                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 801                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 802                         async_throttle = HARD_THROTTLE_MAXCNT;
 803                 } else {
 804                         if ( (flags & CL_DEV_MEMORY) )
 805                                 async_throttle = VNODE_ASYNC_THROTTLE;
 806                         else {
 807                                 u_int max_cluster;
 808
 809                                 if (max_iosize > (MAX_CLUSTER_SIZE * PAGE_SIZE))
 810                                         max_cluster = (MAX_CLUSTER_SIZE * PAGE_SIZE);
 811                                 else
 812                                         max_cluster = max_iosize;
 813
 814                                 if (size < max_cluster)
 815                                         max_cluster = size;
 816
 817                                 async_throttle = min(VNODE_ASYNC_THROTTLE, (MAX_PREFETCH / max_cluster) - 1);
 818                         }
 819                 }
 820         }
 821         if (flags & CL_AGE)
 822                 io_flags |= B_AGE;
 823         if (flags & (CL_PAGEIN | CL_PAGEOUT))
 824                 io_flags |= B_PAGEIO;
 825         if (flags & CL_COMMIT)
 826                 io_flags |= B_COMMIT_UPL;
 827         if (flags & CL_PRESERVE)
 828                 io_flags |= B_PHYS;
 829         if (flags & CL_KEEPCACHED)
 830                 io_flags |= B_CACHE;
 831         if (flags & CL_PASSIVE)
 832                 io_flags |= B_PASSIVE;
 833         if (vp->v_flag & VSYSTEM)
 834                 io_flags |= B_META;
 835
 836         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 837                 /*
 838                  * then we are going to end up
 839                  * with a page that we can't complete (the file size wasn't a multiple
 840                  * of PAGE_SIZE and we're trying to read to the end of the file
 841                  * so we'll go ahead and zero out the portion of the page we can't
 842                  * read in from the file
 843                  */
 844                 zero_offset = upl_offset + non_rounded_size;
 845         }
 846         while (size) {
 847                 daddr64_t blkno;
 848                 daddr64_t lblkno;
 849                 u_int   io_size_wanted;
 850
 851                 if (size > max_iosize)
 852                         io_size = max_iosize;
 853                 else
 854                         io_size = size;
 855
 856                 io_size_wanted = io_size;
 857
 858                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL)))
 859                         break;
 860
 861                 if (io_size > io_size_wanted)
 862                         io_size = io_size_wanted;
 863
 864                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
 865                         real_bp->b_blkno = blkno;
 866
 867                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 868                              (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
 869
 870                 if (io_size == 0) {
 871                         /*
 872                          * vnop_blockmap didn't return an error... however, it did
 873                          * return an extent size of 0 which means we can't
 874                          * make forward progress on this I/O... a hole in the
 875                          * file would be returned as a blkno of -1 with a non-zero io_size
 876                          * a real extent is returned with a blkno != -1 and a non-zero io_size
 877                          */
 878                         error = EINVAL;
 879                         break;
 880                 }
 881                 if ( !(flags & CL_READ) && blkno == -1) {
 882                         off_t   e_offset;
 883                         int     pageout_flags;
 884
 885                         /*
 886                          * we're writing into a 'hole'
 887                          */
 888                         if (flags & CL_PAGEOUT) {
 889                                 /*
 890                                  * if we got here via cluster_pageout
 891                                  * then just error the request and return
 892                                  * the 'hole' should already have been covered
 893                                  */
 894                                 error = EINVAL;
 895                                 break;
 896                         }
 897                         /*
 898                          * we can get here if the cluster code happens to
 899                          * pick up a page that was dirtied via mmap vs
 900                          * a 'write' and the page targets a 'hole'...
 901                          * i.e. the writes to the cluster were sparse
 902                          * and the file was being written for the first time
 903                          *
 904                          * we can also get here if the filesystem supports
 905                          * 'holes' that are less than PAGE_SIZE.... because
 906                          * we can't know if the range in the page that covers
 907                          * the 'hole' has been dirtied via an mmap or not,
 908                          * we have to assume the worst and try to push the
 909                          * entire page to storage.
 910                          *
 911                          * Try paging out the page individually before
 912                          * giving up entirely and dumping it (the pageout
 913                          * path will ensure that the zero extent accounting
 914                          * has been taken care of before we get back into cluster_io)
 915                          *
 916                          * go direct to vnode_pageout so that we don't have to
 917                          * unbusy the page from the UPL... we used to do this
 918                          * so that we could call ubc_sync_range, but that results
 919                          * in a potential deadlock if someone else races us to acquire
 920                          * that page and wins and in addition needs one of the pages
 921                          * we're continuing to hold in the UPL
 922                          */
 923                         pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
 924
 925                         if ( !(flags & CL_ASYNC))
 926                                 pageout_flags |= UPL_IOSYNC;
 927                         if ( !(flags & CL_COMMIT))
 928                                 pageout_flags |= UPL_NOCOMMIT;
 929
 930                         if (cbp_head) {
 931                                 buf_t last_cbp;
 932
 933                                 /*
 934                                  * first we have to wait for the current outstanding I/Os
 935                                  * to complete... EOT hasn't been set yet on this transaction
 936                                  * so the pages won't be released just because all of the current
 937                                  * I/O linked to this transaction has completed...
 938                                  */
 939                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
 940
 941                                 /*
 942                                  * we've got a transaction that
 943                                  * includes the page we're about to push out through vnode_pageout...
 944                                  * find the last bp in the list which will be the one that
 945                                  * includes the head of this page and round its iosize down
 946                                  * to a page boundary...
 947                                  */
 948                                 for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
 949                                         last_cbp = cbp;
 950
 951                                 cbp->b_bcount &= ~PAGE_MASK;
 952
 953                                 if (cbp->b_bcount == 0) {
 954                                         /*
 955                                          * this buf no longer has any I/O associated with it
 956                                          */
 957                                         free_io_buf(cbp);
 958
 959                                         if (cbp == cbp_head) {
 960                                                 /*
 961                                                  * the buf we just freed was the only buf in
 962                                                  * this transaction... so there's no I/O to do
 963                                                  */
 964                                                 cbp_head = NULL;
 965                                         } else {
 966                                                 /*
 967                                                  * remove the buf we just freed from
 968                                                  * the transaction list
 969                                                  */
 970                                                 last_cbp->b_trans_next = NULL;
 971                                                 cbp_tail = last_cbp;
 972                                         }
 973                                 }
 974                                 if (cbp_head) {
 975                                         /*
 976                                          * there was more to the current transaction
 977                                          * than just the page we are pushing out via vnode_pageout...
 978                                          * mark it as finished and complete it... we've already
 979                                          * waited for the I/Os to complete above in the call to cluster_wait_IO
 980                                          */
 981                                         cluster_EOT(cbp_head, cbp_tail, 0);
 982
 983                                         cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
 984
 985                                         trans_count = 0;
 986                                 }
 987                         }
 988                         if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
 989                                 error = EINVAL;
 990                                 break;
 991                         }
 992                         e_offset = round_page_64(f_offset + 1);
 993                         io_size = e_offset - f_offset;
 994
 995                         f_offset   += io_size;
 996                         upl_offset += io_size;
 997
 998                         if (size >= io_size)
 999                                 size -= io_size;
1000                         else
1001                                 size = 0;
1002                         /*
1003                          * keep track of how much of the original request
1004                          * that we've actually completed... non_rounded_size
1005                          * may go negative due to us rounding the request
1006                          * to a page size multiple (i.e.  size > non_rounded_size)
1007                          */
1008                         non_rounded_size -= io_size;
1009
1010                         if (non_rounded_size <= 0) {
1011                                 /*
1012                                  * we've transferred all of the data in the original
1013                                  * request, but we were unable to complete the tail
1014                                  * of the last page because the file didn't have
1015                                  * an allocation to back that portion... this is ok.
1016                                  */
1017                                 size = 0;
1018                         }
1019                         continue;
1020                 }
1021                 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
1022                 /*
1023                  * we have now figured out how much I/O we can do - this is in 'io_size'
1024                  * pg_offset is the starting point in the first page for the I/O
1025                  * pg_count is the number of full and partial pages that 'io_size' encompasses
1026                  */
1027                 pg_offset = upl_offset & PAGE_MASK;
1028
1029                 if (flags & CL_DEV_MEMORY) {
1030                         /*
1031                          * treat physical requests as one 'giant' page
1032                          */
1033                         pg_count = 1;
1034                 } else
1035                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1036
1037                 if ((flags & CL_READ) && blkno == -1) {
1038                         vm_offset_t  commit_offset;
1039                         int bytes_to_zero;
1040                         int complete_transaction_now = 0;
1041
1042                         /*
1043                          * if we're reading and blkno == -1, then we've got a
1044                          * 'hole' in the file that we need to deal with by zeroing
1045                          * out the affected area in the upl
1046                          */
1047                         if (io_size >= (u_int)non_rounded_size) {
1048                                 /*
1049                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1050                                  * then 'zero_offset' will be non-zero
1051                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1052                                  * (indicated by the io_size finishing off the I/O request for this UPL)
1053                                  * then we're not going to issue an I/O for the
1054                                  * last page in this upl... we need to zero both the hole and the tail
1055                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
1056                                  */
1057                                 bytes_to_zero = non_rounded_size;
1058                                 if (!(flags & CL_NOZERO))
1059                                         bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1060
1061                                 zero_offset = 0;
1062                         } else
1063                                 bytes_to_zero = io_size;
1064
1065                         pg_count = 0;
1066
1067                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1068
1069                         if (cbp_head) {
1070                                 int     pg_resid;
1071
1072                                 /*
1073                                  * if there is a current I/O chain pending
1074                                  * then the first page of the group we just zero'd
1075                                  * will be handled by the I/O completion if the zero
1076                                  * fill started in the middle of the page
1077                                  */
1078                                 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1079
1080                                 pg_resid = commit_offset - upl_offset;
1081
1082                                 if (bytes_to_zero >= pg_resid) {
1083                                         /*
1084                                          * the last page of the current I/O
1085                                          * has been completed...
1086                                          * compute the number of fully zero'd
1087                                          * pages that are beyond it
1088                                          * plus the last page if its partial
1089                                          * and we have no more I/O to issue...
1090                                          * otherwise a partial page is left
1091                                          * to begin the next I/O
1092                                          */
1093                                         if ((int)io_size >= non_rounded_size)
1094                                                 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1095                                         else
1096                                                 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1097
1098                                         complete_transaction_now = 1;
1099                                 }
1100                         } else {
1101                                 /*
1102                                  * no pending I/O to deal with
1103                                  * so, commit all of the fully zero'd pages
1104                                  * plus the last page if its partial
1105                                  * and we have no more I/O to issue...
1106                                  * otherwise a partial page is left
1107                                  * to begin the next I/O
1108                                  */
1109                                 if ((int)io_size >= non_rounded_size)
1110                                         pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1111                                 else
1112                                         pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1113
1114                                 commit_offset = upl_offset & ~PAGE_MASK;
1115                         }
1116                         if ( (flags & CL_COMMIT) && pg_count) {
1117                                 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1118                                                      UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1119                         }
1120                         upl_offset += io_size;
1121                         f_offset   += io_size;
1122                         size       -= io_size;
1123
1124                         /*
1125                          * keep track of how much of the original request
1126                          * that we've actually completed... non_rounded_size
1127                          * may go negative due to us rounding the request
1128                          * to a page size multiple (i.e.  size > non_rounded_size)
1129                          */
1130                         non_rounded_size -= io_size;
1131
1132                         if (non_rounded_size <= 0) {
1133                                 /*
1134                                  * we've transferred all of the data in the original
1135                                  * request, but we were unable to complete the tail
1136                                  * of the last page because the file didn't have
1137                                  * an allocation to back that portion... this is ok.
1138                                  */
1139                                 size = 0;
1140                         }
1141                         if (cbp_head && (complete_transaction_now || size == 0))  {
1142                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1143
1144                                 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1145
1146                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1147
1148                                 trans_count = 0;
1149                         }
1150                         continue;
1151                 }
1152                 if (pg_count > max_vectors) {
1153                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1154                                 io_size = PAGE_SIZE - pg_offset;
1155                                 pg_count = 1;
1156                         } else {
1157                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1158                                 pg_count = max_vectors;
1159                         }
1160                 }
1161                 /*
1162                  * If the transaction is going to reach the maximum number of
1163                  * desired elements, truncate the i/o to the nearest page so
1164                  * that the actual i/o is initiated after this buffer is
1165                  * created and added to the i/o chain.
1166                  *
1167                  * I/O directed to physically contiguous memory
1168                  * doesn't have a requirement to make sure we 'fill' a page
1169                  */
1170                 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1171                                 ((upl_offset + io_size) & PAGE_MASK)) {
1172                         vm_offset_t aligned_ofs;
1173
1174                         aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1175                         /*
1176                          * If the io_size does not actually finish off even a
1177                          * single page we have to keep adding buffers to the
1178                          * transaction despite having reached the desired limit.
1179                          *
1180                          * Eventually we get here with the page being finished
1181                          * off (and exceeded) and then we truncate the size of
1182                          * this i/o request so that it is page aligned so that
1183                          * we can finally issue the i/o on the transaction.
1184                          */
1185                         if (aligned_ofs > upl_offset) {
1186                                 io_size = aligned_ofs - upl_offset;
1187                                 pg_count--;
1188                         }
1189                 }
1190
1191                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
1192                         /*
1193                          * if we're not targeting a virtual device i.e. a disk image
1194                          * it's safe to dip into the reserve pool since real devices
1195                          * can complete this I/O request without requiring additional
1196                          * bufs from the alloc_io_buf pool
1197                          */
1198                         priv = 1;
1199                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
1200                         /*
1201                          * Throttle the speculative IO
1202                          */
1203                         priv = 0;
1204                 else
1205                         priv = 1;
1206
1207                 cbp = alloc_io_buf(vp, priv);
1208
1209                 if (flags & CL_PAGEOUT) {
1210                         u_int i;
1211
1212                         for (i = 0; i < pg_count; i++) {
1213                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
1214                                         panic("BUSY bp found in cluster_io");
1215                         }
1216                 }
1217                 if (flags & CL_ASYNC) {
1218                         if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
1219                                 panic("buf_setcallback failed\n");
1220                 }
1221                 cbp->b_cliodone = (void *)callback;
1222                 cbp->b_flags |= io_flags;
1223
1224                 cbp->b_lblkno = lblkno;
1225                 cbp->b_blkno  = blkno;
1226                 cbp->b_bcount = io_size;
1227
1228                 if (buf_setupl(cbp, upl, upl_offset))
1229                         panic("buf_setupl failed\n");
1230
1231                 cbp->b_trans_next = (buf_t)NULL;
1232
1233                 if ((cbp->b_iostate = (void *)iostate))
1234                         /*
1235                          * caller wants to track the state of this
1236                          * io... bump the amount issued against this stream
1237                          */
1238                         iostate->io_issued += io_size;
1239
1240                 if (flags & CL_READ) {
1241                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1242                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1243                 }
1244                 else {
1245                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1246                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1247                 }
1248
1249                 if (cbp_head) {
1250                         cbp_tail->b_trans_next = cbp;
1251                         cbp_tail = cbp;
1252                 } else {
1253                         cbp_head = cbp;
1254                         cbp_tail = cbp;
1255
1256                         if ( (cbp_head->b_real_bp = real_bp) ) {
1257                                 cbp_head->b_flags |= B_NEED_IODONE;
1258                                 real_bp = (buf_t)NULL;
1259                         }
1260                 }
1261                 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1262
1263                 trans_count++;
1264
1265                 upl_offset += io_size;
1266                 f_offset   += io_size;
1267                 size       -= io_size;
1268                 /*
1269                  * keep track of how much of the original request
1270                  * that we've actually completed... non_rounded_size
1271                  * may go negative due to us rounding the request
1272                  * to a page size multiple (i.e.  size > non_rounded_size)
1273                  */
1274                 non_rounded_size -= io_size;
1275
1276                 if (non_rounded_size <= 0) {
1277                         /*
1278                          * we've transferred all of the data in the original
1279                          * request, but we were unable to complete the tail
1280                          * of the last page because the file didn't have
1281                          * an allocation to back that portion... this is ok.
1282                          */
1283                         size = 0;
1284                 }
1285                 if (size == 0) {
1286                         /*
1287                          * we have no more I/O to issue, so go
1288                          * finish the final transaction
1289                          */
1290                         need_EOT = TRUE;
1291                 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1292                             ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
1293                         /*
1294                          * I/O directed to physically contiguous memory...
1295                          * which doesn't have a requirement to make sure we 'fill' a page
1296                          * or...
1297                          * the current I/O we've prepared fully
1298                          * completes the last page in this request
1299                          * and ...
1300                          * it's either an ASYNC request or
1301                          * we've already accumulated more than 8 I/O's into
1302                          * this transaction so mark it as complete so that
1303                          * it can finish asynchronously or via the cluster_complete_transaction
1304                          * below if the request is synchronous
1305                          */
1306                         need_EOT = TRUE;
1307                 }
1308                 if (need_EOT == TRUE)
1309                         cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1310
1311                 if (flags & CL_THROTTLE)
1312                         (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1313
1314                 if ( !(io_flags & B_READ))
1315                         vnode_startwrite(vp);
1316
1317                 (void) VNOP_STRATEGY(cbp);
1318
1319                 if (need_EOT == TRUE) {
1320                         if ( !(flags & CL_ASYNC))
1321                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1322
1323                         need_EOT = FALSE;
1324                         trans_count = 0;
1325                         cbp_head = NULL;
1326                 }
1327         }
1328         if (error) {
1329                 int abort_size;
1330
1331                 io_size = 0;
1332
1333                 if (cbp_head) {
1334                          /*
1335                           * first wait until all of the outstanding I/O
1336                           * for this partial transaction has completed
1337                           */
1338                         cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1339
1340                         /*
1341                          * Rewind the upl offset to the beginning of the
1342                          * transaction.
1343                          */
1344                         upl_offset = cbp_head->b_uploffset;
1345
1346                         for (cbp = cbp_head; cbp;) {
1347                                 buf_t   cbp_next;
1348
1349                                 size       += cbp->b_bcount;
1350                                 io_size    += cbp->b_bcount;
1351
1352                                 cbp_next = cbp->b_trans_next;
1353                                 free_io_buf(cbp);
1354                                 cbp = cbp_next;
1355                         }
1356                 }
1357                 if (iostate) {
1358                         int need_wakeup = 0;
1359
1360                         /*
1361                          * update the error condition for this stream
1362                          * since we never really issued the io
1363                          * just go ahead and adjust it back
1364                          */
1365                         lck_mtx_lock_spin(cl_mtxp);
1366
1367                         if (iostate->io_error == 0)
1368                                 iostate->io_error = error;
1369                         iostate->io_issued -= io_size;
1370
1371                         if (iostate->io_wanted) {
1372                                 /*
1373                                  * someone is waiting for the state of
1374                                  * this io stream to change
1375                                  */
1376                                 iostate->io_wanted = 0;
1377                                 need_wakeup = 1;
1378                         }
1379                         lck_mtx_unlock(cl_mtxp);
1380
1381                         if (need_wakeup)
1382                                 wakeup((caddr_t)&iostate->io_wanted);
1383                 }
1384                 if (flags & CL_COMMIT) {
1385                         int     upl_flags;
1386
1387                         pg_offset  = upl_offset & PAGE_MASK;
1388                         abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1389
1390                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags);
1391
1392                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1393                                      (int)upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1394                 }
1395                 if (retval == 0)
1396                         retval = error;
1397         } else if (cbp_head)
1398                         panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1399
1400         if (real_bp) {
1401                 /*
1402                  * can get here if we either encountered an error
1403                  * or we completely zero-filled the request and
1404                  * no I/O was issued
1405                  */
1406                 if (error) {
1407                         real_bp->b_flags |= B_ERROR;
1408                         real_bp->b_error = error;
1409                 }
1410                 buf_biodone(real_bp);
1411         }
1412         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1413
1414         return (retval);
1415 }
1416
1417
1418 static int
1419 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1420 {
1421         int           pages_in_prefetch;
1422
1423         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1424                      (int)f_offset, size, (int)filesize, 0, 0);
1425
1426         if (f_offset >= filesize) {
1427                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1428                              (int)f_offset, 0, 0, 0, 0);
1429                 return(0);
1430         }
1431         if ((off_t)size > (filesize - f_offset))
1432                 size = filesize - f_offset;
1433         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1434
1435         advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1436
1437         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1438                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1439
1440         return (pages_in_prefetch);
1441 }
1442
1443
1444
1445 static void
1446 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1447                    int bflag)
1448 {
1449         daddr64_t       r_addr;
1450         off_t           f_offset;
1451         int             size_of_prefetch;
1452
1453
1454         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1455                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1456
1457         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1458                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1459                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1460                 return;
1461         }
1462         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1463                 rap->cl_ralen = 0;
1464                 rap->cl_maxra = 0;
1465
1466                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1467                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1468
1469                 return;
1470         }
1471         if (extent->e_addr < rap->cl_maxra) {
1472                 if ((rap->cl_maxra - extent->e_addr) > ((MAX_PREFETCH / PAGE_SIZE) / 4)) {
1473
1474                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1475                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1476                         return;
1477                 }
1478         }
1479         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1480         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1481
1482         size_of_prefetch = 0;
1483
1484         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1485
1486         if (size_of_prefetch) {
1487                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1488                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1489                 return;
1490         }
1491         if (f_offset < filesize) {
1492                 daddr64_t read_size;
1493
1494                 rap->cl_ralen = rap->cl_ralen ? min(MAX_PREFETCH / PAGE_SIZE, rap->cl_ralen << 1) : 1;
1495
1496                 read_size = (extent->e_addr + 1) - extent->b_addr;
1497
1498                 if (read_size > rap->cl_ralen) {
1499                         if (read_size > MAX_PREFETCH / PAGE_SIZE)
1500                                 rap->cl_ralen = MAX_PREFETCH / PAGE_SIZE;
1501                         else
1502                                 rap->cl_ralen = read_size;
1503                 }
1504                 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
1505
1506                 if (size_of_prefetch)
1507                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1508         }
1509         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1510                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1511 }
1512
1513
1514 int
1515 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1516                 int size, off_t filesize, int flags)
1517 {
1518         return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1519
1520 }
1521
1522
1523 int
1524 cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1525                 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1526 {
1527         int           io_size;
1528         int           rounded_size;
1529         off_t         max_size;
1530         int           local_flags;
1531
1532         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1533                 /*
1534                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1535                  * then we don't want to enforce this throttle... if we do, we can
1536                  * potentially deadlock since we're stalling the pageout thread at a time
1537                  * when the disk image might need additional memory (which won't be available
1538                  * if the pageout thread can't run)... instead we'll just depend on the throttle
1539                  * that the pageout thread now has in place to deal with external files
1540                  */
1541                 local_flags = CL_PAGEOUT;
1542         else
1543                 local_flags = CL_PAGEOUT | CL_THROTTLE;
1544
1545         if ((flags & UPL_IOSYNC) == 0)
1546                 local_flags |= CL_ASYNC;
1547         if ((flags & UPL_NOCOMMIT) == 0)
1548                 local_flags |= CL_COMMIT;
1549         if ((flags & UPL_KEEPCACHED))
1550                 local_flags |= CL_KEEPCACHED;
1551         if (flags & IO_PASSIVE)
1552                 local_flags |= CL_PASSIVE;
1553
1554
1555         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1556                      (int)f_offset, size, (int)filesize, local_flags, 0);
1557
1558         /*
1559          * If they didn't specify any I/O, then we are done...
1560          * we can't issue an abort because we don't know how
1561          * big the upl really is
1562          */
1563         if (size <= 0)
1564                 return (EINVAL);
1565
1566         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1567                 if (local_flags & CL_COMMIT)
1568                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1569                 return (EROFS);
1570         }
1571         /*
1572          * can't page-in from a negative offset
1573          * or if we're starting beyond the EOF
1574          * or if the file offset isn't page aligned
1575          * or the size requested isn't a multiple of PAGE_SIZE
1576          */
1577         if (f_offset < 0 || f_offset >= filesize ||
1578            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1579                 if (local_flags & CL_COMMIT)
1580                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1581                 return (EINVAL);
1582         }
1583         max_size = filesize - f_offset;
1584
1585         if (size < max_size)
1586                 io_size = size;
1587         else
1588                 io_size = max_size;
1589
1590         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1591
1592         if (size > rounded_size) {
1593                 if (local_flags & CL_COMMIT)
1594                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1595                                         UPL_ABORT_FREE_ON_EMPTY);
1596         }
1597         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1598                            local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
1599 }
1600
1601
1602 int
1603 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1604                int size, off_t filesize, int flags)
1605 {
1606         return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1607 }
1608
1609
1610 int
1611 cluster_pagein_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1612                int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1613 {
1614         u_int         io_size;
1615         int           rounded_size;
1616         off_t         max_size;
1617         int           retval;
1618         int           local_flags = 0;
1619
1620         if (upl == NULL || size < 0)
1621                 panic("cluster_pagein: NULL upl passed in");
1622
1623         if ((flags & UPL_IOSYNC) == 0)
1624                 local_flags |= CL_ASYNC;
1625         if ((flags & UPL_NOCOMMIT) == 0)
1626                 local_flags |= CL_COMMIT;
1627         if (flags & IO_PASSIVE)
1628                 local_flags |= CL_PASSIVE;
1629
1630
1631         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1632                      (int)f_offset, size, (int)filesize, local_flags, 0);
1633
1634         /*
1635          * can't page-in from a negative offset
1636          * or if we're starting beyond the EOF
1637          * or if the file offset isn't page aligned
1638          * or the size requested isn't a multiple of PAGE_SIZE
1639          */
1640         if (f_offset < 0 || f_offset >= filesize ||
1641            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1642                 if (local_flags & CL_COMMIT)
1643                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1644                 return (EINVAL);
1645         }
1646         max_size = filesize - f_offset;
1647
1648         if (size < max_size)
1649                 io_size = size;
1650         else
1651                 io_size = max_size;
1652
1653         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1654
1655         if (size > rounded_size && (local_flags & CL_COMMIT))
1656                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1657                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1658
1659         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1660                             local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1661
1662         return (retval);
1663 }
1664
1665
1666 int
1667 cluster_bp(buf_t bp)
1668 {
1669        return cluster_bp_ext(bp, NULL, NULL);
1670 }
1671
1672
1673 int
1674 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
1675 {
1676         off_t  f_offset;
1677         int    flags;
1678
1679         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1680                      (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1681
1682         if (bp->b_flags & B_READ)
1683                 flags = CL_ASYNC | CL_READ;
1684         else
1685                 flags = CL_ASYNC;
1686         if (bp->b_flags & B_PASSIVE)
1687                 flags |= CL_PASSIVE;
1688
1689         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1690
1691         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
1692 }
1693
1694
1695
1696 int
1697 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1698 {
1699         return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
1700 }
1701
1702
1703 int
1704 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
1705                   int xflags, int (*callback)(buf_t, void *), void *callback_arg)
1706 {
1707         user_ssize_t    cur_resid;
1708         int             retval = 0;
1709         int             flags;
1710         int             zflags;
1711         int             bflag;
1712         int             write_type = IO_COPY;
1713         u_int32_t       write_length;
1714
1715         flags = xflags;
1716
1717         if (flags & IO_PASSIVE)
1718             bflag = CL_PASSIVE;
1719         else
1720             bflag = 0;
1721
1722         if (vp->v_flag & VNOCACHE_DATA)
1723                 flags |= IO_NOCACHE;
1724
1725         if (uio == NULL) {
1726                 /*
1727                  * no user data...
1728                  * this call is being made to zero-fill some range in the file
1729                  */
1730                 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
1731
1732                 return(retval);
1733         }
1734         /*
1735          * do a write through the cache if one of the following is true....
1736          *   NOCACHE is not true and
1737          *   the uio request doesn't target USERSPACE
1738          * otherwise, find out if we want the direct or contig variant for
1739          * the first vector in the uio request
1740          */
1741         if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
1742                 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
1743
1744         if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
1745                 /*
1746                  * must go through the cached variant in this case
1747                  */
1748                 write_type = IO_COPY;
1749
1750         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
1751
1752                 switch (write_type) {
1753
1754                 case IO_COPY:
1755                         /*
1756                          * make sure the uio_resid isn't too big...
1757                          * internally, we want to handle all of the I/O in
1758                          * chunk sizes that fit in a 32 bit int
1759                          */
1760                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
1761                                 /*
1762                                  * we're going to have to call cluster_write_copy
1763                                  * more than once...
1764                                  *
1765                                  * only want the last call to cluster_write_copy to
1766                                  * have the IO_TAILZEROFILL flag set and only the
1767                                  * first call should have IO_HEADZEROFILL
1768                                  */
1769                                 zflags = flags & ~IO_TAILZEROFILL;
1770                                 flags &= ~IO_HEADZEROFILL;
1771
1772                                 write_length = MAX_IO_REQUEST_SIZE;
1773                         } else {
1774                                 /*
1775                                  * last call to cluster_write_copy
1776                                  */
1777                                 zflags = flags;
1778
1779                                 write_length = (u_int32_t)cur_resid;
1780                         }
1781                         retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
1782                         break;
1783
1784                 case IO_CONTIG:
1785                         zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
1786
1787                         if (flags & IO_HEADZEROFILL) {
1788                                 /*
1789                                  * only do this once per request
1790                                  */
1791                                 flags &= ~IO_HEADZEROFILL;
1792
1793                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
1794                                                             headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
1795                                 if (retval)
1796                                         break;
1797                         }
1798                         retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
1799
1800                         if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
1801                                 /*
1802                                  * we're done with the data from the user specified buffer(s)
1803                                  * and we've been requested to zero fill at the tail
1804                                  * treat this as an IO_HEADZEROFILL which doesn't require a uio
1805                                  * by rearranging the args and passing in IO_HEADZEROFILL
1806                                  */
1807                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
1808                                                             (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
1809                         }
1810                         break;
1811
1812                 case IO_DIRECT:
1813                         /*
1814                          * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
1815                          */
1816                         retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
1817                         break;
1818
1819                 case IO_UNKNOWN:
1820                         retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
1821                         break;
1822                 }
1823         }
1824         return (retval);
1825 }
1826
1827
1828 static int
1829 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
1830                      int flags, int (*callback)(buf_t, void *), void *callback_arg)
1831 {
1832         upl_t            upl;
1833         upl_page_info_t  *pl;
1834         vm_offset_t      upl_offset;
1835         u_int32_t        io_req_size;
1836         u_int32_t        offset_in_file;
1837         u_int32_t        offset_in_iovbase;
1838         int              io_size;
1839         int              io_flag;
1840         int              bflag;
1841         vm_size_t        upl_size;
1842         vm_size_t        upl_needed_size;
1843         mach_msg_type_number_t  pages_in_pl;
1844         int              upl_flags;
1845         kern_return_t    kret;
1846         mach_msg_type_number_t  i;
1847         int              force_data_sync;
1848         int              retval = 0;
1849         int              first_IO = 1;
1850         struct clios     iostate;
1851         user_addr_t      iov_base;
1852         u_int32_t        mem_alignment_mask;
1853         u_int32_t        devblocksize;
1854
1855         if (flags & IO_PASSIVE)
1856             bflag = CL_PASSIVE;
1857         else
1858             bflag = 0;
1859
1860         /*
1861          * When we enter this routine, we know
1862          *  -- the resid will not exceed iov_len
1863          */
1864         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1865                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
1866
1867         iostate.io_completed = 0;
1868         iostate.io_issued = 0;
1869         iostate.io_error = 0;
1870         iostate.io_wanted = 0;
1871
1872         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
1873         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
1874
1875         if (devblocksize == 1) {
1876                /*
1877                 * the AFP client advertises a devblocksize of 1
1878                 * however, its BLOCKMAP routine maps to physical
1879                 * blocks that are PAGE_SIZE in size...
1880                 * therefore we can't ask for I/Os that aren't page aligned
1881                 * or aren't multiples of PAGE_SIZE in size
1882                 * by setting devblocksize to PAGE_SIZE, we re-instate
1883                 * the old behavior we had before the mem_alignment_mask
1884                 * changes went in...
1885                 */
1886                devblocksize = PAGE_SIZE;
1887         }
1888
1889 next_dwrite:
1890         io_req_size = *write_length;
1891         iov_base = uio_curriovbase(uio);
1892
1893         offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
1894         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
1895
1896         if (offset_in_file || offset_in_iovbase) {
1897                 /*
1898                  * one of the 2 important offsets is misaligned
1899                  * so fire an I/O through the cache for this entire vector
1900                  */
1901                 goto wait_for_dwrites;
1902         }
1903         if (iov_base & (devblocksize - 1)) {
1904                 /*
1905                  * the offset in memory must be on a device block boundary
1906                  * so that we can guarantee that we can generate an
1907                  * I/O that ends on a page boundary in cluster_io
1908                  */
1909                 goto wait_for_dwrites;
1910         }
1911
1912         while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
1913
1914                 if (first_IO) {
1915                         cluster_syncup(vp, newEOF, callback, callback_arg);
1916                         first_IO = 0;
1917                 }
1918                 io_size  = io_req_size & ~PAGE_MASK;
1919                 iov_base = uio_curriovbase(uio);
1920
1921                 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1922                         io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1923
1924                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
1925                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1926
1927                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1928                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1929
1930                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1931                         pages_in_pl = 0;
1932                         upl_size = upl_needed_size;
1933                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1934                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1935
1936                         kret = vm_map_get_upl(current_map(),
1937                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1938                                               &upl_size,
1939                                               &upl,
1940                                               NULL,
1941                                               &pages_in_pl,
1942                                               &upl_flags,
1943                                               force_data_sync);
1944
1945                         if (kret != KERN_SUCCESS) {
1946                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1947                                              0, 0, 0, kret, 0);
1948                                 /*
1949                                  * failed to get pagelist
1950                                  *
1951                                  * we may have already spun some portion of this request
1952                                  * off as async requests... we need to wait for the I/O
1953                                  * to complete before returning
1954                                  */
1955                                 goto wait_for_dwrites;
1956                         }
1957                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1958                         pages_in_pl = upl_size / PAGE_SIZE;
1959
1960                         for (i = 0; i < pages_in_pl; i++) {
1961                                 if (!upl_valid_page(pl, i))
1962                                         break;
1963                         }
1964                         if (i == pages_in_pl)
1965                                 break;
1966
1967                         /*
1968                          * didn't get all the pages back that we
1969                          * needed... release this upl and try again
1970                          */
1971                         ubc_upl_abort(upl, 0);
1972                 }
1973                 if (force_data_sync >= 3) {
1974                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1975                                      i, pages_in_pl, upl_size, kret, 0);
1976                         /*
1977                          * for some reason, we couldn't acquire a hold on all
1978                          * the pages needed in the user's address space
1979                          *
1980                          * we may have already spun some portion of this request
1981                          * off as async requests... we need to wait for the I/O
1982                          * to complete before returning
1983                          */
1984                         goto wait_for_dwrites;
1985                 }
1986
1987                 /*
1988                  * Consider the possibility that upl_size wasn't satisfied.
1989                  */
1990                 if (upl_size < upl_needed_size) {
1991                         if (upl_size && upl_offset == 0)
1992                                 io_size = upl_size;
1993                         else
1994                                 io_size = 0;
1995                 }
1996                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1997                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1998
1999                 if (io_size == 0) {
2000                         ubc_upl_abort(upl, 0);
2001                         /*
2002                          * we may have already spun some portion of this request
2003                          * off as async requests... we need to wait for the I/O
2004                          * to complete before returning
2005                          */
2006                         goto wait_for_dwrites;
2007                 }
2008
2009                 /*
2010                  * Now look for pages already in the cache
2011                  * and throw them away.
2012                  * uio->uio_offset is page aligned within the file
2013                  * io_size is a multiple of PAGE_SIZE
2014                  */
2015                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
2016
2017                 /*
2018                  * we want push out these writes asynchronously so that we can overlap
2019                  * the preparation of the next I/O
2020                  * if there are already too many outstanding writes
2021                  * wait until some complete before issuing the next
2022                  */
2023                 lck_mtx_lock(cl_mtxp);
2024
2025                 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2026                         iostate.io_wanted = 1;
2027                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL);
2028                 }
2029                 lck_mtx_unlock(cl_mtxp);
2030
2031                 if (iostate.io_error) {
2032                         /*
2033                          * one of the earlier writes we issued ran into a hard error
2034                          * don't issue any more writes, cleanup the UPL
2035                          * that was just created but not used, then
2036                          * go wait for all writes that are part of this stream
2037                          * to complete before returning the error to the caller
2038                          */
2039                         ubc_upl_abort(upl, 0);
2040
2041                         goto wait_for_dwrites;
2042                 }
2043                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO | bflag;
2044
2045                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2046                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2047
2048                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2049                                    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2050
2051                 /*
2052                  * update the uio structure to
2053                  * reflect the I/O that we just issued
2054                  */
2055                 uio_update(uio, (user_size_t)io_size);
2056
2057                 io_req_size -= io_size;
2058
2059                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2060                              (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2061
2062         } /* end while */
2063
2064         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2065
2066                 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2067
2068                 if (retval == 0 && *write_type == IO_DIRECT) {
2069
2070                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2071                                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2072
2073                         goto next_dwrite;
2074                 }
2075         }
2076
2077 wait_for_dwrites:
2078         if (iostate.io_issued) {
2079                 /*
2080                  * make sure all async writes issued as part of this stream
2081                  * have completed before we return
2082                  */
2083                 lck_mtx_lock(cl_mtxp);
2084
2085                 while (iostate.io_issued != iostate.io_completed) {
2086                         iostate.io_wanted = 1;
2087                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL);
2088                 }
2089                 lck_mtx_unlock(cl_mtxp);
2090         }
2091         if (iostate.io_error)
2092                 retval = iostate.io_error;
2093
2094         if (io_req_size && retval == 0) {
2095                 /*
2096                  * we couldn't handle the tail of this request in DIRECT mode
2097                  * so fire it through the copy path
2098                  *
2099                  * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2100                  * so we can just pass 0 in for the headOff and tailOff
2101                  */
2102                 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2103
2104                 *write_type = IO_UNKNOWN;
2105         }
2106         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2107                      (int)uio->uio_offset, io_req_size, retval, 4, 0);
2108
2109         return (retval);
2110 }
2111
2112
2113 static int
2114 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2115                      int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2116 {
2117         upl_page_info_t *pl;
2118         addr64_t         src_paddr = 0;
2119         upl_t            upl[MAX_VECTS];
2120         vm_offset_t      upl_offset;
2121         u_int32_t        tail_size = 0;
2122         u_int32_t        io_size;
2123         u_int32_t        xsize;
2124         vm_size_t        upl_size;
2125         vm_size_t        upl_needed_size;
2126         mach_msg_type_number_t  pages_in_pl;
2127         int              upl_flags;
2128         kern_return_t    kret;
2129         struct clios     iostate;
2130         int              error  = 0;
2131         int              cur_upl = 0;
2132         int              num_upl = 0;
2133         int              n;
2134         user_addr_t      iov_base;
2135         u_int32_t        devblocksize;
2136         u_int32_t        mem_alignment_mask;
2137
2138         /*
2139          * When we enter this routine, we know
2140          *  -- the io_req_size will not exceed iov_len
2141          *  -- the target address is physically contiguous
2142          */
2143         cluster_syncup(vp, newEOF, callback, callback_arg);
2144
2145         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2146         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2147
2148         iostate.io_completed = 0;
2149         iostate.io_issued = 0;
2150         iostate.io_error = 0;
2151         iostate.io_wanted = 0;
2152
2153 next_cwrite:
2154         io_size = *write_length;
2155
2156         iov_base = uio_curriovbase(uio);
2157
2158         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2159         upl_needed_size = upl_offset + io_size;
2160
2161         pages_in_pl = 0;
2162         upl_size = upl_needed_size;
2163         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2164                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2165
2166         kret = vm_map_get_upl(current_map(),
2167                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2168                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
2169
2170         if (kret != KERN_SUCCESS) {
2171                 /*
2172                  * failed to get pagelist
2173                  */
2174                 error = EINVAL;
2175                 goto wait_for_cwrites;
2176         }
2177         num_upl++;
2178
2179         /*
2180          * Consider the possibility that upl_size wasn't satisfied.
2181          */
2182         if (upl_size < upl_needed_size) {
2183                 /*
2184                  * This is a failure in the physical memory case.
2185                  */
2186                 error = EINVAL;
2187                 goto wait_for_cwrites;
2188         }
2189         pl = ubc_upl_pageinfo(upl[cur_upl]);
2190
2191         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
2192
2193         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2194                 u_int32_t   head_size;
2195
2196                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2197
2198                 if (head_size > io_size)
2199                         head_size = io_size;
2200
2201                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2202
2203                 if (error)
2204                         goto wait_for_cwrites;
2205
2206                 upl_offset += head_size;
2207                 src_paddr  += head_size;
2208                 io_size    -= head_size;
2209
2210                 iov_base   += head_size;
2211         }
2212         if ((u_int32_t)iov_base & mem_alignment_mask) {
2213                 /*
2214                  * request doesn't set up on a memory boundary
2215                  * the underlying DMA engine can handle...
2216                  * return an error instead of going through
2217                  * the slow copy path since the intent of this
2218                  * path is direct I/O from device memory
2219                  */
2220                 error = EINVAL;
2221                 goto wait_for_cwrites;
2222         }
2223
2224         tail_size = io_size & (devblocksize - 1);
2225         io_size  -= tail_size;
2226
2227         while (io_size && error == 0) {
2228
2229                 if (io_size > MAX_IO_CONTIG_SIZE)
2230                         xsize = MAX_IO_CONTIG_SIZE;
2231                 else
2232                         xsize = io_size;
2233                 /*
2234                  * request asynchronously so that we can overlap
2235                  * the preparation of the next I/O... we'll do
2236                  * the commit after all the I/O has completed
2237                  * since its all issued against the same UPL
2238                  * if there are already too many outstanding writes
2239                  * wait until some have completed before issuing the next
2240                  */
2241                 if (iostate.io_issued) {
2242                         lck_mtx_lock(cl_mtxp);
2243
2244                         while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) {
2245                                 iostate.io_wanted = 1;
2246                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL);
2247                         }
2248                         lck_mtx_unlock(cl_mtxp);
2249                 }
2250                 if (iostate.io_error) {
2251                         /*
2252                          * one of the earlier writes we issued ran into a hard error
2253                          * don't issue any more writes...
2254                          * go wait for all writes that are part of this stream
2255                          * to complete before returning the error to the caller
2256                          */
2257                         goto wait_for_cwrites;
2258                 }
2259                 /*
2260                  * issue an asynchronous write to cluster_io
2261                  */
2262                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2263                                    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2264
2265                 if (error == 0) {
2266                         /*
2267                          * The cluster_io write completed successfully,
2268                          * update the uio structure
2269                          */
2270                         uio_update(uio, (user_size_t)xsize);
2271
2272                         upl_offset += xsize;
2273                         src_paddr  += xsize;
2274                         io_size    -= xsize;
2275                 }
2276         }
2277         if (error == 0 && iostate.io_error == 0 && tail_size == 0) {
2278
2279                 error = cluster_io_type(uio, write_type, write_length, 0);
2280
2281                 if (error == 0 && *write_type == IO_CONTIG) {
2282                         cur_upl++;
2283                         goto next_cwrite;
2284                 }
2285         } else
2286                 *write_type = IO_UNKNOWN;
2287
2288 wait_for_cwrites:
2289         /*
2290          * make sure all async writes that are part of this stream
2291          * have completed before we proceed
2292          */
2293         lck_mtx_lock(cl_mtxp);
2294
2295         while (iostate.io_issued != iostate.io_completed) {
2296                 iostate.io_wanted = 1;
2297                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL);
2298         }
2299         lck_mtx_unlock(cl_mtxp);
2300
2301         if (iostate.io_error)
2302                 error = iostate.io_error;
2303
2304         if (error == 0 && tail_size)
2305                 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2306
2307         for (n = 0; n < num_upl; n++)
2308                 /*
2309                  * just release our hold on each physically contiguous
2310                  * region without changing any state
2311                  */
2312                 ubc_upl_abort(upl[n], 0);
2313
2314         return (error);
2315 }
2316
2317
2318 static int
2319 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
2320                    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2321 {
2322         upl_page_info_t *pl;
2323         upl_t            upl;
2324         vm_offset_t      upl_offset = 0;
2325         vm_size_t        upl_size;
2326         off_t            upl_f_offset;
2327         int              pages_in_upl;
2328         int              start_offset;
2329         int              xfer_resid;
2330         int              io_size;
2331         int              io_offset;
2332         int              bytes_to_zero;
2333         int              bytes_to_move;
2334         kern_return_t    kret;
2335         int              retval = 0;
2336         int              io_resid;
2337         long long        total_size;
2338         long long        zero_cnt;
2339         off_t            zero_off;
2340         long long        zero_cnt1;
2341         off_t            zero_off1;
2342         struct cl_extent cl;
2343         int              intersection;
2344         struct cl_writebehind *wbp;
2345         int              bflag;
2346
2347         if (flags & IO_PASSIVE)
2348             bflag = CL_PASSIVE;
2349         else
2350             bflag = 0;
2351
2352         if (uio) {
2353                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2354                              (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
2355
2356                 io_resid = io_req_size;
2357         } else {
2358                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2359                              0, 0, (int)oldEOF, (int)newEOF, 0);
2360
2361                 io_resid = 0;
2362         }
2363         zero_cnt  = 0;
2364         zero_cnt1 = 0;
2365         zero_off  = 0;
2366         zero_off1 = 0;
2367
2368         if (flags & IO_HEADZEROFILL) {
2369                 /*
2370                  * some filesystems (HFS is one) don't support unallocated holes within a file...
2371                  * so we zero fill the intervening space between the old EOF and the offset
2372                  * where the next chunk of real data begins.... ftruncate will also use this
2373                  * routine to zero fill to the new EOF when growing a file... in this case, the
2374                  * uio structure will not be provided
2375                  */
2376                 if (uio) {
2377                         if (headOff < uio->uio_offset) {
2378                                 zero_cnt = uio->uio_offset - headOff;
2379                                 zero_off = headOff;
2380                         }
2381                 } else if (headOff < newEOF) {
2382                         zero_cnt = newEOF - headOff;
2383                         zero_off = headOff;
2384                 }
2385         }
2386         if (flags & IO_TAILZEROFILL) {
2387                 if (uio) {
2388                         zero_off1 = uio->uio_offset + io_req_size;
2389
2390                         if (zero_off1 < tailOff)
2391                                 zero_cnt1 = tailOff - zero_off1;
2392                 }
2393         }
2394         if (zero_cnt == 0 && uio == (struct uio *) 0) {
2395                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2396                              retval, 0, 0, 0, 0);
2397                 return (0);
2398         }
2399
2400         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
2401                 /*
2402                  * for this iteration of the loop, figure out where our starting point is
2403                  */
2404                 if (zero_cnt) {
2405                         start_offset = (int)(zero_off & PAGE_MASK_64);
2406                         upl_f_offset = zero_off - start_offset;
2407                 } else if (io_resid) {
2408                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2409                         upl_f_offset = uio->uio_offset - start_offset;
2410                 } else {
2411                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
2412                         upl_f_offset = zero_off1 - start_offset;
2413                 }
2414                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
2415                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
2416
2417                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2418                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2419
2420                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2421
2422                 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
2423                         /*
2424                          * assumption... total_size <= io_resid
2425                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
2426                          */
2427                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
2428                                 total_size -= start_offset;
2429                         xfer_resid = total_size;
2430
2431                         retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
2432
2433                         if (retval)
2434                                 break;
2435
2436                         io_resid    -= (total_size - xfer_resid);
2437                         total_size   = xfer_resid;
2438                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2439                         upl_f_offset = uio->uio_offset - start_offset;
2440
2441                         if (total_size == 0) {
2442                                 if (start_offset) {
2443                                         /*
2444                                          * the write did not finish on a page boundary
2445                                          * which will leave upl_f_offset pointing to the
2446                                          * beginning of the last page written instead of
2447                                          * the page beyond it... bump it in this case
2448                                          * so that the cluster code records the last page
2449                                          * written as dirty
2450                                          */
2451                                         upl_f_offset += PAGE_SIZE_64;
2452                                 }
2453                                 upl_size = 0;
2454
2455                                 goto check_cluster;
2456                         }
2457                 }
2458                 /*
2459                  * compute the size of the upl needed to encompass
2460                  * the requested write... limit each call to cluster_io
2461                  * to the maximum UPL size... cluster_io will clip if
2462                  * this exceeds the maximum io_size for the device,
2463                  * make sure to account for
2464                  * a starting offset that's not page aligned
2465                  */
2466                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2467
2468                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2469                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2470
2471                 pages_in_upl = upl_size / PAGE_SIZE;
2472                 io_size      = upl_size - start_offset;
2473
2474                 if ((long long)io_size > total_size)
2475                         io_size = total_size;
2476
2477                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2478
2479
2480                 /*
2481                  * Gather the pages from the buffer cache.
2482                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2483                  * that we intend to modify these pages.
2484                  */
2485                 kret = ubc_create_upl(vp,
2486                                       upl_f_offset,
2487                                       upl_size,
2488                                       &upl,
2489                                       &pl,
2490                                       UPL_SET_LITE | UPL_WILL_MODIFY);
2491                 if (kret != KERN_SUCCESS)
2492                         panic("cluster_write_copy: failed to get pagelist");
2493
2494                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2495                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2496
2497                 if (start_offset && !upl_valid_page(pl, 0)) {
2498                         int   read_size;
2499
2500                         /*
2501                          * we're starting in the middle of the first page of the upl
2502                          * and the page isn't currently valid, so we're going to have
2503                          * to read it in first... this is a synchronous operation
2504                          */
2505                         read_size = PAGE_SIZE;
2506
2507                         if ((upl_f_offset + read_size) > newEOF)
2508                                 read_size = newEOF - upl_f_offset;
2509
2510                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2511                                             CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2512                         if (retval) {
2513                                 /*
2514                                  * we had an error during the read which causes us to abort
2515                                  * the current cluster_write request... before we do, we need
2516                                  * to release the rest of the pages in the upl without modifying
2517                                  * their state and mark the failed page in error
2518                                  */
2519                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2520
2521                                 if (upl_size > PAGE_SIZE)
2522                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2523
2524                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2525                                              (int)upl, 0, 0, retval, 0);
2526                                 break;
2527                         }
2528                 }
2529                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2530                         /*
2531                          * the last offset we're writing to in this upl does not end on a page
2532                          * boundary... if it's not beyond the old EOF, then we'll also need to
2533                          * pre-read this page in if it isn't already valid
2534                          */
2535                         upl_offset = upl_size - PAGE_SIZE;
2536
2537                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2538                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2539                                 int   read_size;
2540
2541                                 read_size = PAGE_SIZE;
2542
2543                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2544                                         read_size = newEOF - (upl_f_offset + upl_offset);
2545
2546                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2547                                                     CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2548                                 if (retval) {
2549                                         /*
2550                                          * we had an error during the read which causes us to abort
2551                                          * the current cluster_write request... before we do, we
2552                                          * need to release the rest of the pages in the upl without
2553                                          * modifying their state and mark the failed page in error
2554                                          */
2555                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2556
2557                                         if (upl_size > PAGE_SIZE)
2558                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2559
2560                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2561                                                      (int)upl, 0, 0, retval, 0);
2562                                         break;
2563                                 }
2564                         }
2565                 }
2566                 xfer_resid = io_size;
2567                 io_offset = start_offset;
2568
2569                 while (zero_cnt && xfer_resid) {
2570
2571                         if (zero_cnt < (long long)xfer_resid)
2572                                 bytes_to_zero = zero_cnt;
2573                         else
2574                                 bytes_to_zero = xfer_resid;
2575
2576                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2577                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2578                         } else {
2579                                 int zero_pg_index;
2580
2581                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2582                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2583
2584                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2585                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2586
2587                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2588                                            !upl_dirty_page(pl, zero_pg_index)) {
2589                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2590                                 }
2591                         }
2592                         xfer_resid -= bytes_to_zero;
2593                         zero_cnt   -= bytes_to_zero;
2594                         zero_off   += bytes_to_zero;
2595                         io_offset  += bytes_to_zero;
2596                 }
2597                 if (xfer_resid && io_resid) {
2598                         u_int32_t  io_requested;
2599
2600                         bytes_to_move = min(io_resid, xfer_resid);
2601                         io_requested = bytes_to_move;
2602
2603                         retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
2604
2605                         if (retval) {
2606
2607                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2608
2609                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2610                                              (int)upl, 0, 0, retval, 0);
2611                         } else {
2612                                 io_resid   -= bytes_to_move;
2613                                 xfer_resid -= bytes_to_move;
2614                                 io_offset  += bytes_to_move;
2615                         }
2616                 }
2617                 while (xfer_resid && zero_cnt1 && retval == 0) {
2618
2619                         if (zero_cnt1 < (long long)xfer_resid)
2620                                 bytes_to_zero = zero_cnt1;
2621                         else
2622                                 bytes_to_zero = xfer_resid;
2623
2624                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2625                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2626                         } else {
2627                                 int zero_pg_index;
2628
2629                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2630                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2631
2632                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2633                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2634                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2635                                            !upl_dirty_page(pl, zero_pg_index)) {
2636                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2637                                 }
2638                         }
2639                         xfer_resid -= bytes_to_zero;
2640                         zero_cnt1  -= bytes_to_zero;
2641                         zero_off1  += bytes_to_zero;
2642                         io_offset  += bytes_to_zero;
2643                 }
2644
2645                 if (retval == 0) {
2646                         int cl_index;
2647                         int ret_cluster_try_push;
2648
2649                         io_size += start_offset;
2650
2651                         if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
2652                                 /*
2653                                  * if we're extending the file with this write
2654                                  * we'll zero fill the rest of the page so that
2655                                  * if the file gets extended again in such a way as to leave a
2656                                  * hole starting at this EOF, we'll have zero's in the correct spot
2657                                  */
2658                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2659                         }
2660                         if (flags & IO_SYNC)
2661                                 /*
2662                                  * if the IO_SYNC flag is set then we need to
2663                                  * bypass any clusters and immediately issue
2664                                  * the I/O
2665                                  */
2666                                 goto issue_io;
2667 check_cluster:
2668                         /*
2669                          * take the lock to protect our accesses
2670                          * of the writebehind and sparse cluster state
2671                          */
2672                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2673
2674                         /*
2675                          * calculate the last logical block number
2676                          * that this delayed I/O encompassed
2677                          */
2678                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2679
2680                         if (wbp->cl_scmap) {
2681
2682                                 if ( !(flags & IO_NOCACHE)) {
2683                                         /*
2684                                          * we've fallen into the sparse
2685                                          * cluster method of delaying dirty pages
2686                                          * first, we need to release the upl if we hold one
2687                                          * since pages in it may be present in the sparse cluster map
2688                                          * and may span 2 separate buckets there... if they do and
2689                                          * we happen to have to flush a bucket to make room and it intersects
2690                                          * this upl, a deadlock may result on page BUSY
2691                                          */
2692                                         if (upl_size)
2693                                                 ubc_upl_commit_range(upl, 0, upl_size,
2694                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2695
2696                                         sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg);
2697
2698                                         lck_mtx_unlock(&wbp->cl_lockw);
2699
2700                                         continue;
2701                                 }
2702                                 /*
2703                                  * must have done cached writes that fell into
2704                                  * the sparse cluster mechanism... we've switched
2705                                  * to uncached writes on the file, so go ahead
2706                                  * and push whatever's in the sparse map
2707                                  * and switch back to normal clustering
2708                                  *
2709                                  * see the comment above concerning a possible deadlock...
2710                                  */
2711                                 if (upl_size) {
2712                                         ubc_upl_commit_range(upl, 0, upl_size,
2713                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2714                                         /*
2715                                          * setting upl_size to 0 keeps us from committing a
2716                                          * second time in the start_new_cluster path
2717                                          */
2718                                         upl_size = 0;
2719                                 }
2720                                 sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg);
2721
2722                                 wbp->cl_number = 0;
2723                                 /*
2724                                  * no clusters of either type present at this point
2725                                  * so just go directly to start_new_cluster since
2726                                  * we know we need to delay this I/O since we've
2727                                  * already released the pages back into the cache
2728                                  * to avoid the deadlock with sparse_cluster_push
2729                                  */
2730                                 goto start_new_cluster;
2731                         }
2732                         upl_offset = 0;
2733
2734                         if (wbp->cl_number == 0)
2735                                 /*
2736                                  * no clusters currently present
2737                                  */
2738                                 goto start_new_cluster;
2739
2740                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2741                                 /*
2742                                  * check each cluster that we currently hold
2743                                  * try to merge some or all of this write into
2744                                  * one or more of the existing clusters... if
2745                                  * any portion of the write remains, start a
2746                                  * new cluster
2747                                  */
2748                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2749                                         /*
2750                                          * the current write starts at or after the current cluster
2751                                          */
2752                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE)) {
2753                                                 /*
2754                                                  * we have a write that fits entirely
2755                                                  * within the existing cluster limits
2756                                                  */
2757                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2758                                                         /*
2759                                                          * update our idea of where the cluster ends
2760                                                          */
2761                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2762                                                 break;
2763                                         }
2764                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE)) {
2765                                                 /*
2766                                                  * we have a write that starts in the middle of the current cluster
2767                                                  * but extends beyond the cluster's limit... we know this because
2768                                                  * of the previous checks
2769                                                  * we'll extend the current cluster to the max
2770                                                  * and update the b_addr for the current write to reflect that
2771                                                  * the head of it was absorbed into this cluster...
2772                                                  * note that we'll always have a leftover tail in this case since
2773                                                  * full absorbtion would have occurred in the clause above
2774                                                  */
2775                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE;
2776
2777                                                 if (upl_size) {
2778                                                         daddr64_t start_pg_in_upl;
2779
2780                                                         start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2781
2782                                                         if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2783                                                                 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2784
2785                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2786                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2787                                                                 upl_f_offset += intersection;
2788                                                                 upl_offset   += intersection;
2789                                                                 upl_size     -= intersection;
2790                                                         }
2791                                                 }
2792                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2793                                         }
2794                                         /*
2795                                          * we come here for the case where the current write starts
2796                                          * beyond the limit of the existing cluster or we have a leftover
2797                                          * tail after a partial absorbtion
2798                                          *
2799                                          * in either case, we'll check the remaining clusters before
2800                                          * starting a new one
2801                                          */
2802                                 } else {
2803                                         /*
2804                                          * the current write starts in front of the cluster we're currently considering
2805                                          */
2806                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_CLUSTER_SIZE) {
2807                                                 /*
2808                                                  * we can just merge the new request into
2809                                                  * this cluster and leave it in the cache
2810                                                  * since the resulting cluster is still
2811                                                  * less than the maximum allowable size
2812                                                  */
2813                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2814
2815                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2816                                                         /*
2817                                                          * the current write completely
2818                                                          * envelops the existing cluster and since
2819                                                          * each write is limited to at most MAX_CLUSTER_SIZE pages
2820                                                          * we can just use the start and last blocknos of the write
2821                                                          * to generate the cluster limits
2822                                                          */
2823                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2824                                                 }
2825                                                 break;
2826                                         }
2827
2828                                         /*
2829                                          * if we were to combine this write with the current cluster
2830                                          * we would exceed the cluster size limit.... so,
2831                                          * let's see if there's any overlap of the new I/O with
2832                                          * the cluster we're currently considering... in fact, we'll
2833                                          * stretch the cluster out to it's full limit and see if we
2834                                          * get an intersection with the current write
2835                                          *
2836                                          */
2837                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_CLUSTER_SIZE) {
2838                                                 /*
2839                                                  * the current write extends into the proposed cluster
2840                                                  * clip the length of the current write after first combining it's
2841                                                  * tail with the newly shaped cluster
2842                                                  */
2843                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_CLUSTER_SIZE;
2844
2845                                                 if (upl_size) {
2846                                                         intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2847
2848                                                         if ((u_int)intersection > upl_size)
2849                                                                 /*
2850                                                                  * because the current write may consist of a number of pages found in the cache
2851                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2852                                                                  * the size of the UPL that is also part of this write
2853                                                                  */
2854                                                                 intersection = upl_size;
2855
2856                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2857                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2858                                                         upl_size -= intersection;
2859                                                 }
2860                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2861                                         }
2862                                         /*
2863                                          * if we get here, there was no way to merge
2864                                          * any portion of this write with this cluster
2865                                          * or we could only merge part of it which
2866                                          * will leave a tail...
2867                                          * we'll check the remaining clusters before starting a new one
2868                                          */
2869                                 }
2870                         }
2871                         if (cl_index < wbp->cl_number)
2872                                 /*
2873                                  * we found an existing cluster(s) that we
2874                                  * could entirely merge this I/O into
2875                                  */
2876                                 goto delay_io;
2877
2878                         if (wbp->cl_number < MAX_CLUSTERS)
2879                                 /*
2880                                  * we didn't find an existing cluster to
2881                                  * merge into, but there's room to start
2882                                  * a new one
2883                                  */
2884                                 goto start_new_cluster;
2885
2886                         /*
2887                          * no exisitng cluster to merge with and no
2888                          * room to start a new one... we'll try
2889                          * pushing one of the existing ones... if none of
2890                          * them are able to be pushed, we'll switch
2891                          * to the sparse cluster mechanism
2892                          * cluster_try_push updates cl_number to the
2893                          * number of remaining clusters... and
2894                          * returns the number of currently unused clusters
2895                          */
2896                         ret_cluster_try_push = 0;
2897
2898                         /*
2899                          * if writes are not deferred, call cluster push immediately
2900                          */
2901                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2902
2903                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg);
2904                         }
2905
2906                         /*
2907                          * execute following regardless of writes being deferred or not
2908                          */
2909                         if (ret_cluster_try_push == 0) {
2910                                 /*
2911                                  * no more room in the normal cluster mechanism
2912                                  * so let's switch to the more expansive but expensive
2913                                  * sparse mechanism....
2914                                  * first, we need to release the upl if we hold one
2915                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2916                                  * and may span 2 separate buckets there... if they do and
2917                                  * we happen to have to flush a bucket to make room and it intersects
2918                                  * this upl, a deadlock may result on page BUSY
2919                                  */
2920                                 if (upl_size)
2921                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2922                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2923
2924                                 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
2925                                 sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg);
2926
2927                                 lck_mtx_unlock(&wbp->cl_lockw);
2928
2929                                 continue;
2930                         }
2931                         /*
2932                          * we pushed one cluster successfully, so we must be sequentially writing this file
2933                          * otherwise, we would have failed and fallen into the sparse cluster support
2934                          * so let's take the opportunity to push out additional clusters...
2935                          * this will give us better I/O locality if we're in a copy loop
2936                          * (i.e.  we won't jump back and forth between the read and write points
2937                          */
2938                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2939                                 while (wbp->cl_number)
2940                                         cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg);
2941                         }
2942
2943 start_new_cluster:
2944                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2945                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2946
2947                         wbp->cl_clusters[wbp->cl_number].io_flags = 0;
2948
2949                         if (flags & IO_NOCACHE)
2950                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
2951
2952                         if (bflag & CL_PASSIVE)
2953                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
2954
2955                         wbp->cl_number++;
2956 delay_io:
2957                         if (upl_size)
2958                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2959                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2960
2961                         lck_mtx_unlock(&wbp->cl_lockw);
2962
2963                         continue;
2964 issue_io:
2965                         /*
2966                          * we don't hold the vnode lock at this point
2967                          *
2968                          * because we had to ask for a UPL that provides currenty non-present pages, the
2969                          * UPL has been automatically set to clear the dirty flags (both software and hardware)
2970                          * upon committing it... this is not the behavior we want since it's possible for
2971                          * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2972                          * in order to maintain some semblance of coherency with mapped writes
2973                          * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2974                          * so that we correctly deal with a change in state of the hardware modify bit...
2975                          * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
2976                          * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
2977                          * responsible for generating the correct sized I/O(s)
2978                          */
2979                         ubc_upl_commit_range(upl, 0, upl_size,
2980                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2981
2982                         cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2983
2984                         retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
2985                 }
2986         }
2987         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
2988
2989         return (retval);
2990 }
2991
2992
2993
2994 int
2995 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2996 {
2997         return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
2998 }
2999
3000
3001 int
3002 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3003 {
3004         int             retval = 0;
3005         int             flags;
3006         user_ssize_t    cur_resid;
3007         u_int32_t       io_size;
3008         u_int32_t       read_length = 0;
3009         int             read_type = IO_COPY;
3010
3011         flags = xflags;
3012
3013         if (vp->v_flag & VNOCACHE_DATA)
3014                 flags |= IO_NOCACHE;
3015         if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
3016                 flags |= IO_RAOFF;
3017
3018         /*
3019          * do a read through the cache if one of the following is true....
3020          *   NOCACHE is not true
3021          *   the uio request doesn't target USERSPACE
3022          * otherwise, find out if we want the direct or contig variant for
3023          * the first vector in the uio request
3024          */
3025         if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
3026                 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3027
3028         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3029
3030                 switch (read_type) {
3031
3032                 case IO_COPY:
3033                         /*
3034                          * make sure the uio_resid isn't too big...
3035                          * internally, we want to handle all of the I/O in
3036                          * chunk sizes that fit in a 32 bit int
3037                          */
3038                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
3039                                 io_size = MAX_IO_REQUEST_SIZE;
3040                         else
3041                                 io_size = (u_int32_t)cur_resid;
3042
3043                         retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3044                         break;
3045
3046                 case IO_DIRECT:
3047                         retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3048                         break;
3049
3050                 case IO_CONTIG:
3051                         retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3052                         break;
3053
3054                 case IO_UNKNOWN:
3055                         retval = cluster_io_type(uio, &read_type, &read_length, 0);
3056                         break;
3057                 }
3058         }
3059         return (retval);
3060 }
3061
3062
3063
3064 static void
3065 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags)
3066 {
3067         int range;
3068         int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3069
3070         if ((range = last_pg - start_pg)) {
3071                 if ( !(flags & IO_NOCACHE))
3072                         abort_flags |= UPL_ABORT_REFERENCE;
3073
3074                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3075         }
3076 }
3077
3078
3079 static int
3080 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3081 {
3082         upl_page_info_t *pl;
3083         upl_t            upl;
3084         vm_offset_t      upl_offset;
3085         int              upl_size;
3086         off_t            upl_f_offset;
3087         int              start_offset;
3088         int              start_pg;
3089         int              last_pg;
3090         int              uio_last = 0;
3091         int              pages_in_upl;
3092         off_t            max_size;
3093         off_t            last_ioread_offset;
3094         off_t            last_request_offset;
3095         kern_return_t    kret;
3096         int              error  = 0;
3097         int              retval = 0;
3098         u_int32_t        size_of_prefetch;
3099         u_int32_t        xsize;
3100         u_int32_t        io_size;
3101         u_int32_t        max_rd_size = MAX_PREFETCH;
3102         u_int            rd_ahead_enabled = 1;
3103         u_int            prefetch_enabled = 1;
3104         struct cl_readahead *   rap;
3105         struct clios            iostate;
3106         struct cl_extent        extent;
3107         int              bflag;
3108         int              take_reference = 1;
3109         struct uthread  *ut;
3110         int              policy = IOPOL_DEFAULT;
3111
3112         policy = current_proc()->p_iopol_disk;
3113
3114         ut = get_bsdthread_info(current_thread());
3115
3116         if (ut->uu_iopol_disk != IOPOL_DEFAULT)
3117                 policy = ut->uu_iopol_disk;
3118
3119         if (policy == IOPOL_THROTTLE)
3120                 take_reference = 0;
3121
3122         if (flags & IO_PASSIVE)
3123             bflag = CL_PASSIVE;
3124         else
3125             bflag = 0;
3126
3127         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3128                      (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3129
3130         last_request_offset = uio->uio_offset + io_req_size;
3131
3132         if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3133                 rd_ahead_enabled = 0;
3134                 rap = NULL;
3135         } else {
3136                 if (cluster_hard_throttle_on(vp)) {
3137                         rd_ahead_enabled = 0;
3138                         prefetch_enabled = 0;
3139
3140                         max_rd_size = HARD_THROTTLE_MAXSIZE;
3141                 }
3142                 if ((rap = cluster_get_rap(vp)) == NULL)
3143                         rd_ahead_enabled = 0;
3144         }
3145         if (last_request_offset > filesize)
3146                 last_request_offset = filesize;
3147         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
3148         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
3149
3150         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
3151                 /*
3152                  * determine if we already have a read-ahead in the pipe courtesy of the
3153                  * last read systemcall that was issued...
3154                  * if so, pick up it's extent to determine where we should start
3155                  * with respect to any read-ahead that might be necessary to
3156                  * garner all the data needed to complete this read systemcall
3157                  */
3158                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
3159
3160                 if (last_ioread_offset < uio->uio_offset)
3161                         last_ioread_offset = (off_t)0;
3162                 else if (last_ioread_offset > last_request_offset)
3163                         last_ioread_offset = last_request_offset;
3164         } else
3165                 last_ioread_offset = (off_t)0;
3166
3167         while (io_req_size && uio->uio_offset < filesize && retval == 0) {
3168                 /*
3169                  * compute the size of the upl needed to encompass
3170                  * the requested read... limit each call to cluster_io
3171                  * to the maximum UPL size... cluster_io will clip if
3172                  * this exceeds the maximum io_size for the device,
3173                  * make sure to account for
3174                  * a starting offset that's not page aligned
3175                  */
3176                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3177                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
3178                 max_size     = filesize - uio->uio_offset;
3179
3180                 if ((off_t)(io_req_size) < max_size)
3181                         io_size = io_req_size;
3182                 else
3183                         io_size = max_size;
3184
3185                 if (!(flags & IO_NOCACHE)) {
3186
3187                         while (io_size) {
3188                                 u_int32_t io_resid;
3189                                 u_int32_t io_requested;
3190
3191                                 /*
3192                                  * if we keep finding the pages we need already in the cache, then
3193                                  * don't bother to call cluster_read_prefetch since it costs CPU cycles
3194                                  * to determine that we have all the pages we need... once we miss in
3195                                  * the cache and have issued an I/O, than we'll assume that we're likely
3196                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
3197                                  */
3198                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
3199                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
3200                                                 /*
3201                                                  * we've already issued I/O for this request and
3202                                                  * there's still work to do and
3203                                                  * our prefetch stream is running dry, so issue a
3204                                                  * pre-fetch I/O... the I/O latency will overlap
3205                                                  * with the copying of the data
3206                                                  */
3207                                                 if (size_of_prefetch > max_rd_size)
3208                                                         size_of_prefetch = max_rd_size;
3209
3210                                                 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3211
3212                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3213
3214                                                 if (last_ioread_offset > last_request_offset)
3215                                                         last_ioread_offset = last_request_offset;
3216                                         }
3217                                 }
3218                                 /*
3219                                  * limit the size of the copy we're about to do so that
3220                                  * we can notice that our I/O pipe is running dry and
3221                                  * get the next I/O issued before it does go dry
3222                                  */
3223                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
3224                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
3225                                 else
3226                                         io_resid = io_size;
3227
3228                                 io_requested = io_resid;
3229
3230                                 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
3231
3232                                 xsize = io_requested - io_resid;
3233
3234                                 io_size -= xsize;
3235                                 io_req_size -= xsize;
3236
3237                                 if (retval || io_resid)
3238                                         /*
3239                                          * if we run into a real error or
3240                                          * a page that is not in the cache
3241                                          * we need to leave streaming mode
3242                                          */
3243                                         break;
3244
3245                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
3246                                         /*
3247                                          * we're already finished the I/O for this read request
3248                                          * let's see if we should do a read-ahead
3249                                          */
3250                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3251                                 }
3252                         }
3253                         if (retval)
3254                                 break;
3255                         if (io_size == 0) {
3256                                 if (rap != NULL) {
3257                                         if (extent.e_addr < rap->cl_lastr)
3258                                                 rap->cl_maxra = 0;
3259                                         rap->cl_lastr = extent.e_addr;
3260                                 }
3261                                 break;
3262                         }
3263                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3264                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
3265                         max_size     = filesize - uio->uio_offset;
3266                 }
3267                 if (io_size > max_rd_size)
3268                         io_size = max_rd_size;
3269
3270                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3271
3272                 if (flags & IO_NOCACHE) {
3273                         if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3274                                 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
3275                 } else {
3276                         if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
3277                                 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
3278                 }
3279                 pages_in_upl = upl_size / PAGE_SIZE;
3280
3281                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
3282                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3283
3284                 kret = ubc_create_upl(vp,
3285                                       upl_f_offset,
3286                                       upl_size,
3287                                       &upl,
3288                                       &pl,
3289                                       UPL_FILE_IO | UPL_SET_LITE);
3290                 if (kret != KERN_SUCCESS)
3291                         panic("cluster_read_copy: failed to get pagelist");
3292
3293                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
3294                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3295
3296                 /*
3297                  * scan from the beginning of the upl looking for the first
3298                  * non-valid page.... this will become the first page in
3299                  * the request we're going to make to 'cluster_io'... if all
3300                  * of the pages are valid, we won't call through to 'cluster_io'
3301                  */
3302                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3303                         if (!upl_valid_page(pl, start_pg))
3304                                 break;
3305                 }
3306
3307                 /*
3308                  * scan from the starting invalid page looking for a valid
3309                  * page before the end of the upl is reached, if we
3310                  * find one, then it will be the last page of the request to
3311                  * 'cluster_io'
3312                  */
3313                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3314                         if (upl_valid_page(pl, last_pg))
3315                                 break;
3316                 }
3317                 iostate.io_completed = 0;
3318                 iostate.io_issued = 0;
3319                 iostate.io_error = 0;
3320                 iostate.io_wanted = 0;
3321
3322                 if (start_pg < last_pg) {
3323                         /*
3324                          * we found a range of 'invalid' pages that must be filled
3325                          * if the last page in this range is the last page of the file
3326                          * we may have to clip the size of it to keep from reading past
3327                          * the end of the last physical block associated with the file
3328                          */
3329                         upl_offset = start_pg * PAGE_SIZE;
3330                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
3331
3332                         if ((upl_f_offset + upl_offset + io_size) > filesize)
3333                                 io_size = filesize - (upl_f_offset + upl_offset);
3334
3335                         /*
3336                          * issue an asynchronous read to cluster_io
3337                          */
3338
3339                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
3340                                            io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
3341                 }
3342                 if (error == 0) {
3343                         /*
3344                          * if the read completed successfully, or there was no I/O request
3345                          * issued, then copy the data into user land via 'cluster_copy_upl_data'
3346                          * we'll first add on any 'valid'
3347                          * pages that were present in the upl when we acquired it.
3348                          */
3349                         u_int  val_size;
3350
3351                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
3352                                 if (!upl_valid_page(pl, uio_last))
3353                                         break;
3354                         }
3355                         if (uio_last < pages_in_upl) {
3356                                 /*
3357                                  * there were some invalid pages beyond the valid pages
3358                                  * that we didn't issue an I/O for, just release them
3359                                  * unchanged now, so that any prefetch/readahead can
3360                                  * include them
3361                                  */
3362                                 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3363                                                     (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3364                         }
3365
3366                         /*
3367                          * compute size to transfer this round,  if io_req_size is
3368                          * still non-zero after this attempt, we'll loop around and
3369                          * set up for another I/O.
3370                          */
3371                         val_size = (uio_last * PAGE_SIZE) - start_offset;
3372
3373                         if (val_size > max_size)
3374                                 val_size = max_size;
3375
3376                         if (val_size > io_req_size)
3377                                 val_size = io_req_size;
3378
3379                         if ((uio->uio_offset + val_size) > last_ioread_offset)
3380                                 last_ioread_offset = uio->uio_offset + val_size;
3381
3382                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
3383
3384                                 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
3385                                         /*
3386                                          * if there's still I/O left to do for this request, and...
3387                                          * we're not in hard throttle mode, and...
3388                                          * we're close to using up the previous prefetch, then issue a
3389                                          * new pre-fetch I/O... the I/O latency will overlap
3390                                          * with the copying of the data
3391                                          */
3392                                         if (size_of_prefetch > max_rd_size)
3393                                                 size_of_prefetch = max_rd_size;
3394
3395                                         size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3396
3397                                         last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3398
3399                                         if (last_ioread_offset > last_request_offset)
3400                                                 last_ioread_offset = last_request_offset;
3401                                 }
3402
3403                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
3404                                 /*
3405                                  * this transfer will finish this request, so...
3406                                  * let's try to read ahead if we're in
3407                                  * a sequential access pattern and we haven't
3408                                  * explicitly disabled it
3409                                  */
3410                                 if (rd_ahead_enabled)
3411                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3412
3413                                 if (rap != NULL) {
3414                                         if (extent.e_addr < rap->cl_lastr)
3415                                                 rap->cl_maxra = 0;
3416                                         rap->cl_lastr = extent.e_addr;
3417                                 }
3418                         }
3419                         lck_mtx_lock(cl_mtxp);
3420
3421                         while (iostate.io_issued != iostate.io_completed) {
3422                                 iostate.io_wanted = 1;
3423                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL);
3424                         }
3425                         lck_mtx_unlock(cl_mtxp);
3426
3427                         if (iostate.io_error)
3428                                 error = iostate.io_error;
3429                         else {
3430                                 u_int32_t io_requested;
3431
3432                                 io_requested = val_size;
3433
3434                                 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
3435
3436                                 io_req_size -= (val_size - io_requested);
3437                         }
3438                 }
3439                 if (start_pg < last_pg) {
3440                         /*
3441                          * compute the range of pages that we actually issued an I/O for
3442                          * and either commit them as valid if the I/O succeeded
3443                          * or abort them if the I/O failed or we're not supposed to
3444                          * keep them in the cache
3445                          */
3446                         io_size = (last_pg - start_pg) * PAGE_SIZE;
3447
3448                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3449
3450                         if (error || (flags & IO_NOCACHE))
3451                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3452                                                     UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3453                         else
3454                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3455                                                      UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE);
3456
3457                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3458                 }
3459                 if ((last_pg - start_pg) < pages_in_upl) {
3460                         /*
3461                          * the set of pages that we issued an I/O for did not encompass
3462                          * the entire upl... so just release these without modifying
3463                          * their state
3464                          */
3465                         if (error)
3466                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3467                         else {
3468
3469                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3470                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3471
3472                                 /*
3473                                  * handle any valid pages at the beginning of
3474                                  * the upl... release these appropriately
3475                                  */
3476                                 cluster_read_upl_release(upl, 0, start_pg, flags);
3477
3478                                 /*
3479                                  * handle any valid pages immediately after the
3480                                  * pages we issued I/O for... ... release these appropriately
3481                                  */
3482                                 cluster_read_upl_release(upl, last_pg, uio_last, flags);
3483
3484                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, -1, -1, 0, 0);
3485                         }
3486                 }
3487                 if (retval == 0)
3488                         retval = error;
3489
3490                 if (io_req_size) {
3491                         if (cluster_hard_throttle_on(vp)) {
3492                                 rd_ahead_enabled = 0;
3493                                 prefetch_enabled = 0;
3494
3495                                 max_rd_size = HARD_THROTTLE_MAXSIZE;
3496                         } else {
3497                                 if (max_rd_size == HARD_THROTTLE_MAXSIZE) {
3498                                         /*
3499                                          * coming out of throttled state
3500                                          */
3501                                         if (rap != NULL)
3502                                                 rd_ahead_enabled = 1;
3503                                         prefetch_enabled = 1;
3504
3505                                         max_rd_size = MAX_PREFETCH;
3506                                         last_ioread_offset = 0;
3507                                 }
3508                         }
3509                 }
3510         }
3511         if (rap != NULL) {
3512                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3513                              (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
3514
3515                 lck_mtx_unlock(&rap->cl_lockr);
3516         } else {
3517                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3518                              (int)uio->uio_offset, io_req_size, 0, retval, 0);
3519         }
3520
3521         return (retval);
3522 }
3523
3524
/*
 * cluster_read_direct
 *
 * Services a read by issuing asynchronous cluster_io() requests against
 * UPLs created over the caller's own buffer (vm_map_create_upl() on
 * current_map() at the current iovec base).  Before each direct I/O is
 * set up, whatever is already in the cache is copied out with
 * cluster_copy_ubc_data_internal().
 *
 * Parameters:
 *   vp            vnode being read; the device block size and the memory
 *                 alignment mask are taken from vp->v_mount
 *   uio           supplies the file offset and the destination vector(s);
 *                 advanced with uio_update() as each I/O is issued
 *   filesize      the request is clipped so that it does not extend
 *                 past this offset
 *   read_type     in/out: handed to cluster_io_type() when advancing to
 *                 the next vector (presumably updated there), and set to
 *                 IO_UNKNOWN when the tail is given to cluster_read_copy()
 *   read_length   in/out: number of bytes to process for the current
 *                 vector; also handed to cluster_io_type()
 *   flags         IO_PASSIVE selects CL_PASSIVE for cluster_io(); the
 *                 flags are also passed on to cluster_read_copy()
 *   callback,
 *   callback_arg  passed through unchanged to cluster_io() and
 *                 cluster_read_copy()
 *
 * Returns retval: 0, or the value reported by
 * cluster_copy_ubc_data_internal(), cluster_io(), cluster_io_type() or
 * cluster_read_copy().  A non-zero iostate.io_error recorded for the
 * async reads replaces retval before the copy-path fallback is tried.
 *
 * All paths leave through 'wait_for_dreads', which does not return until
 * iostate.io_issued == iostate.io_completed.
 */
3525 static int
3526 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
3527                     int flags, int (*callback)(buf_t, void *), void *callback_arg)
3528 {
3529         upl_t            upl;
3530         upl_page_info_t  *pl;
3531         off_t            max_io_size;
3532         vm_offset_t      upl_offset;
3533         vm_size_t        upl_size;
3534         vm_size_t        upl_needed_size;
3535         unsigned int     pages_in_pl;
3536         int              upl_flags;
3537         int              bflag;
3538         kern_return_t    kret;
3539         unsigned int     i;
3540         int              force_data_sync;
3541         int              retval = 0;
3542         int              no_zero_fill = 0;
3543         int              abort_flag = 0;
3544         int              io_flag = 0;
3545         int              misaligned = 0;
3546         struct clios     iostate;
3547         user_addr_t      iov_base;
3548         u_int32_t        io_req_size;
3549         u_int32_t        offset_in_file;
3550         u_int32_t        offset_in_iovbase;
3551         u_int32_t        io_size;
3552         u_int32_t        io_min;
3553         u_int32_t        xsize;
3554         u_int32_t        devblocksize;
3555         u_int32_t        mem_alignment_mask;
3556         u_int32_t        max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3557         u_int32_t        max_rd_ahead = MAX_PREFETCH;
3558
             /*
              * bflag is OR'd into the flags of every cluster_io request
              * issued below
              */
3559         if (flags & IO_PASSIVE)
3560             bflag = CL_PASSIVE;
3561         else
3562             bflag = 0;
3563
3564         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3565                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
3566
             /*
              * iostate is handed to cluster_io and is what the wait
              * loops below examine (under cl_mtxp) to find out how much
              * of the issued I/O has completed
              */
3567         iostate.io_completed = 0;
3568         iostate.io_issued = 0;
3569         iostate.io_error = 0;
3570         iostate.io_wanted = 0;
3571
3572         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3573         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3574
3575         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
3576                      (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
3577
3578         if (devblocksize == 1) {
3579                /*
3580                 * the AFP client advertises a devblocksize of 1
3581                 * however, its BLOCKMAP routine maps to physical
3582                 * blocks that are PAGE_SIZE in size...
3583                 * therefore we can't ask for I/Os that aren't page aligned
3584                 * or aren't multiples of PAGE_SIZE in size
3585                 * by setting devblocksize to PAGE_SIZE, we re-instate
3586                 * the old behavior we had before the mem_alignment_mask
3587                 * changes went in...
3588                 */
3589                devblocksize = PAGE_SIZE;
3590         }
             /*
              * we come back to 'next_dread' for each additional vector
              * that cluster_io_type reports as IO_DIRECT (see the bottom
              * of the function)... the amount to process is taken from
              * *read_length and clipped so it doesn't run past filesize
              *
              * NOTE(review): 'misaligned' is only ever set to 1 below and
              * is not reset here, so once one vector has been found
              * misaligned, any later vector reached via 'next_dread' is
              * treated as misaligned too -- confirm this is intended
              */
3591 next_dread:
3592         io_req_size = *read_length;
3593         iov_base = uio_curriovbase(uio);
3594
3595         max_io_size = filesize - uio->uio_offset;
3596
3597         if ((off_t)io_req_size > max_io_size)
3598                 io_req_size = max_io_size;
3599
             /*
              * both masks assume power-of-2 sizes... offset_in_file is
              * computed from the low 32 bits of the file offset
              */
3600         offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
3601         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
3602
3603         if (offset_in_file || offset_in_iovbase) {
3604                 /*
3605                  * one of the 2 important offsets is misaligned
3606                  * so fire an I/O through the cache for this entire vector
3607                  */
3608                 misaligned = 1;
3609         }
3610         if (iov_base & (devblocksize - 1)) {
3611                 /*
3612                  * the offset in memory must be on a device block boundary
3613                  * so that we can guarantee that we can generate an
3614                  * I/O that ends on a page boundary in cluster_io
3615                  */
3616                 misaligned = 1;
3617         }
3618         /*
3619          * When we get to this point, we know...
3620          *  -- the offset into the file is on a devblocksize boundary
3621          */
             /*
              * NOTE(review): the statement above only holds when
              * 'misaligned' is 0... a misaligned request is not turned
              * away here, it enters the loop, copies what it can from the
              * cache and then breaks out before any direct I/O is set up
              */
3622
3623         while (io_req_size && retval == 0) {
3624                 u_int32_t io_start;
3625
                     /*
                      * the throttle state is re-evaluated on every pass...
                      * max_rd_size caps the size of a single direct read and
                      * max_rd_ahead caps (io_issued - io_completed) before
                      * another read may be issued
                      */
3626                 if (cluster_hard_throttle_on(vp)) {
3627                         max_rd_size  = HARD_THROTTLE_MAXSIZE;
3628                         max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3629                 } else {
3630                         max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
3631                         max_rd_ahead = MAX_PREFETCH;
3632                 }
3633                 io_start = io_size = io_req_size;
3634
3635                 /*
3636                  * First look for pages already in the cache
3637                  * and move them to user space.
3638                  *
3639                  * cluster_copy_ubc_data returns the resid
3640                  * in io_size
3641                  */
3642                 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
3643
3644                 /*
3645                  * calculate the number of bytes actually copied
3646                  * starting size - residual
3647                  */
3648                 xsize = io_start - io_size;
3649
3650                 io_req_size -= xsize;
3651
3652                 /*
3653                  * check to see if we are finished with this request...
3654                  */
3655                 if (io_req_size == 0 || misaligned) {
3656                         /*
3657                          * see if there's another uio vector to
3658                          * process that's of type IO_DIRECT
3659                          *
3660                          * break out of while loop to get there
3661                          */
                             /*
                              * in the misaligned case io_req_size may still be
                              * non-zero here, in which case the remainder is
                              * picked up by cluster_read_copy after
                              * 'wait_for_dreads'
                              */
3662                         break;
3663                 }
3664                 /*
3665                  * assume the request ends on a device block boundary
3666                  */
3667                 io_min = devblocksize;
3668
3669                 /*
3670                  * we can handle I/O's in multiples of the device block size
3671                  * however, if io_size isn't a multiple of devblocksize we
3672                  * want to clip it back to the nearest page boundary since
3673                  * we are going to have to go through cluster_read_copy to
3674                  * deal with the 'overhang'... by clipping it to a PAGE_SIZE
3675                  * multiple, we avoid asking the drive for the same physical
3676                  * blocks twice.. once for the partial page at the end of the
3677                  * request and a 2nd time for the page we read into the cache
3678                  * (which overlaps the end of the direct read) in order to
3679                  * get at the overhang bytes
3680                  */
3681                 if (io_size & (devblocksize - 1)) {
3682                         /*
3683                          * request does NOT end on a device block boundary
3684                          * so clip it back to a PAGE_SIZE boundary
3685                          */
3686                         io_size &= ~PAGE_MASK;
3687                         io_min = PAGE_SIZE;
3688                 }
3689                 if (retval || io_size < io_min) {
3690                         /*
3691                          * either an error or we only have the tail left to
3692                          * complete via the copy path...
3693                          * we may have already spun some portion of this request
3694                          * off as async requests... we need to wait for the I/O
3695                          * to complete before returning
3696                          */
3697                         goto wait_for_dreads;
3698                 }
                     /*
                      * probe no more than max_rd_size bytes of the file...
                      * ubc_range_op hands its result back in io_size
                      * NOTE(review): with UPL_ROP_ABSENT this is presumably
                      * the length of the leading run of pages that are absent
                      * from the cache -- confirm against ubc_range_op
                      */
3699                 if ((xsize = io_size) > max_rd_size)
3700                         xsize = max_rd_size;
3701
3702                 io_size = 0;
3703
3704                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
3705
3706                 if (io_size == 0) {
3707                         /*
3708                          * a page must have just come into the cache
3709                          * since the first page in this range is no
3710                          * longer absent, go back and re-evaluate
3711                          */
3712                         continue;
3713                 }
3714                 iov_base = uio_curriovbase(uio);
3715
                     /*
                      * upl_offset is the offset of the user buffer within its
                      * first page... upl_needed_size is the span from the start
                      * of that page through the end of the I/O, rounded up to
                      * a whole number of pages
                      */
3716                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3717                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3718
3719                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3720                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3721
                     /*
                      * when the buffer is page aligned and the I/O covers whole
                      * pages, the UPL is requested with UPL_NOZEROFILL and an
                      * abort also dumps the pages... otherwise an abort only
                      * frees the UPL, and CL_PRESERVE is added to the I/O flags
                      * further down
                      */
3722                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3723                         no_zero_fill = 1;
3724                         abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3725                 } else {
3726                         no_zero_fill = 0;
3727                         abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3728                 }
                     /*
                      * make up to 3 attempts to create a UPL over the user
                      * buffer in which every page is valid... attempts after
                      * the first add UPL_FORCE_DATA_SYNC, and a UPL that turns
                      * out to contain a non-valid page is aborted before the
                      * next attempt
                      */
3729                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3730                         pages_in_pl = 0;
3731                         upl_size = upl_needed_size;
3732                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3733
3734                         if (no_zero_fill)
3735                                 upl_flags |= UPL_NOZEROFILL;
3736                         if (force_data_sync)
3737                                 upl_flags |= UPL_FORCE_DATA_SYNC;
3738
3739                         kret = vm_map_create_upl(current_map(),
3740                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3741                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3742
3743                         if (kret != KERN_SUCCESS) {
3744                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3745                                              (int)upl_offset, upl_size, io_size, kret, 0);
3746                                 /*
3747                                  * failed to get pagelist
3748                                  *
3749                                  * we may have already spun some portion of this request
3750                                  * off as async requests... we need to wait for the I/O
3751                                  * to complete before returning
3752                                  */
3753                                 goto wait_for_dreads;
3754                         }
                             /*
                              * the page count is derived from the size that
                              * vm_map_create_upl handed back in upl_size
                              */
3755                         pages_in_pl = upl_size / PAGE_SIZE;
3756                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3757
3758                         for (i = 0; i < pages_in_pl; i++) {
3759                                 if (!upl_valid_page(pl, i))
3760                                         break;
3761                         }
3762                         if (i == pages_in_pl)
3763                                 break;
3764
3765                         ubc_upl_abort(upl, abort_flag);
3766                 }
3767                 if (force_data_sync >= 3) {
                             /*
                              * all 3 attempts ended with a non-valid page...
                              * each of those UPLs was already aborted inside
                              * the loop, so there is nothing to release here
                              */
3768                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3769                                      (int)upl_offset, upl_size, io_size, kret, 0);
3770
3771                         goto wait_for_dreads;
3772                 }
3773                 /*
3774                  * Consider the possibility that upl_size wasn't satisfied.
3775                  */
                     /*
                      * a short UPL is only used when the buffer starts on a
                      * page boundary, in which case the I/O is shrunk to
                      * upl_size... otherwise io_size is zeroed and the UPL is
                      * aborted just below
                      */
3776                 if (upl_size < upl_needed_size) {
3777                         if (upl_size && upl_offset == 0)
3778                                 io_size = upl_size;
3779                         else
3780                                 io_size = 0;
3781                 }
3782                 if (io_size == 0) {
3783                         ubc_upl_abort(upl, abort_flag);
3784                         goto wait_for_dreads;
3785                 }
3786                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3787                              (int)upl_offset, upl_size, io_size, kret, 0);
3788
3789                 /*
3790                  * request asynchronously so that we can overlap
3791                  * the preparation of the next I/O
3792                  * if there are already too many outstanding reads
3793                  * wait until some have completed before issuing the next read
3794                  */
3795                 lck_mtx_lock(cl_mtxp);
3796
3797                 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3798                         iostate.io_wanted = 1;
3799                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL);
3800                 }
3801                 lck_mtx_unlock(cl_mtxp);
3802
                     /*
                      * NOTE(review): io_error is examined after cl_mtxp has
                      * been dropped -- confirm that is acceptable for how the
                      * completion side updates iostate
                      */
3803                 if (iostate.io_error) {
3804                         /*
3805                          * one of the earlier reads we issued ran into a hard error
3806                          * don't issue any more reads, cleanup the UPL
3807                          * that was just created but not used, then
3808                          * go wait for any other reads to complete before
3809                          * returning the error to the caller
3810                          */
3811                         ubc_upl_abort(upl, abort_flag);
3812
3813                         goto wait_for_dreads;
3814                 }
3815                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3816                              (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3817
                     /*
                      * the two flag sets differ only in CL_PRESERVE, which is
                      * added when no_zero_fill is not set
                      */
3818                 if (no_zero_fill)
3819                         io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | bflag;
3820                 else
3821                         io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | CL_PRESERVE | bflag;
3822
3823                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3824
3825                 /*
3826                  * update the uio structure
3827                  */
                     /*
                      * NOTE(review): the uio and io_req_size are advanced by
                      * io_size whether or not cluster_io returned an error...
                      * a non-zero retval then ends the loop via its condition
                      */
3828                 uio_update(uio, (user_size_t)io_size);
3829
3830                 io_req_size -= io_size;
3831
3832                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3833                              (int)upl, (int)uio->uio_offset, io_req_size, retval, 0);
3834
3835         } /* end while */
3836
             /*
              * this vector was consumed completely without error and we
              * haven't reached filesize... hand read_type / read_length to
              * cluster_io_type (presumably to classify what comes next) and
              * go around again if the result is IO_DIRECT
              */
3837         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
3838
3839                 retval = cluster_io_type(uio, read_type, read_length, 0);
3840
3841                 if (retval == 0 && *read_type == IO_DIRECT) {
3842
3843                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
3844                                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
3845
3846                         goto next_dread;
3847                 }
3848         }
3849
3850 wait_for_dreads:
3851         if (iostate.io_issued) {
3852                 /*
3853                  * make sure all async reads that are part of this stream
3854                  * have completed before we return
3855                  */
3856                 lck_mtx_lock(cl_mtxp);
3857
3858                 while (iostate.io_issued != iostate.io_completed) {
3859                         iostate.io_wanted = 1;
3860                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL);
3861                 }
3862                 lck_mtx_unlock(cl_mtxp);
3863         }
3864
             /*
              * an error recorded in iostate for any of the async reads
              * replaces whatever retval currently holds
              */
3865         if (iostate.io_error)
3866                 retval = iostate.io_error;
3867
3868         if (io_req_size && retval == 0) {
3869                 /*
3870                  * we couldn't handle the tail of this request in DIRECT mode
3871                  * so fire it through the copy path
3872                  */
3873                 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
3874
3875                 *read_type = IO_UNKNOWN;
3876         }
3877         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3878                      (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
3879
3880         return (retval);
3881 }
3882
3883
/*
 * cluster_read_contig
 *
 * Read file data directly into a physically contiguous target buffer.
 *
 * For each vector (at most MAX_VECTS per call) a UPL covering the
 * target is obtained from the current map, then the transfer is
 * carried out in up to three pieces:
 *   - a head that does not start on a device block boundary, or that
 *     is shorter than one device block, via cluster_align_phys_io()
 *   - the block-aligned middle, issued through cluster_io() as
 *     asynchronous reads of at most MAX_IO_CONTIG_SIZE bytes each
 *   - a trailing partial block, via cluster_align_phys_io(), once
 *     every asynchronous read has completed
 *
 * vp, uio        file and request to read; the uio is advanced as the
 *                aligned reads are issued
 * filesize       the transfer is clipped so it does not go past this offset
 * read_type      in/out: I/O type of the current vector; refreshed by
 *                cluster_io_type() when chaining to the next vector, or
 *                set to IO_UNKNOWN when chaining is not attempted
 * read_length    in/out: byte count for the current vector
 * callback,
 * callback_arg   passed through to the lower level I/O routines
 * flags          only IO_PASSIVE is examined here
 *
 * Returns 0 on success, otherwise an errno value.  EINVAL is returned
 * when the UPL cannot be obtained, comes back smaller than needed, or
 * the target address violates the mount's alignment mask.
 */
3884 static int
3885 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
3886                     int (*callback)(buf_t, void *), void *callback_arg, int flags)
3887 {
3888         upl_page_info_t *pl;
3889         upl_t            upl[MAX_VECTS];
3890         vm_offset_t      upl_offset;
3891         addr64_t         dst_paddr = 0;
3892         user_addr_t      iov_base;
3893         off_t            max_size;
3894         vm_size_t        upl_size;
3895         vm_size_t        upl_needed_size;
3896         mach_msg_type_number_t  pages_in_pl;
3897         int              upl_flags;
3898         kern_return_t    kret;
3899         struct clios     iostate;
3900         int              error= 0;
3901         int              cur_upl = 0;
3902         int              num_upl = 0;
3903         int              n;
3904         u_int32_t        xsize;
3905         u_int32_t        io_size;
3906         u_int32_t        devblocksize;
3907         u_int32_t        mem_alignment_mask;
3908         u_int32_t        tail_size = 0;
3909         int              bflag;
3910
3911         if (flags & IO_PASSIVE)
3912             bflag = CL_PASSIVE;
3913         else
3914             bflag = 0;
3915
3916         /*
3917          * When we enter this routine, we know
3918          *  -- the read_length will not exceed the current iov_len
3919          *  -- the target address is physically contiguous for read_length
3920          */
3921         cluster_syncup(vp, filesize, callback, callback_arg);
3922
3923         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3924         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3925
             /*
              * one I/O state is shared by every vector handled in this call,
              * so the wait at 'wait_for_creads' covers all of the reads issued
              */
3926         iostate.io_completed = 0;
3927         iostate.io_issued = 0;
3928         iostate.io_error = 0;
3929         iostate.io_wanted = 0;
3930
3931 next_cread:
3932         io_size = *read_length;
3933
3934         max_size = filesize - uio->uio_offset;
3935
3936         if (io_size > max_size)
3937                 io_size = max_size;
3938
3939         iov_base = uio_curriovbase(uio);
3940
             /*
              * offset of the target within its first page, and the span
              * (starting at that page boundary) the UPL has to cover
              */
3941         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3942         upl_needed_size = upl_offset + io_size;
3943
3944         pages_in_pl = 0;
3945         upl_size = upl_needed_size;
3946         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3947
3948
3949         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
3950                      (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
3951
3952         kret = vm_map_get_upl(current_map(),
3953                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3954                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
3955
3956         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
3957                      (int)upl_offset, upl_size, io_size, kret, 0);
3958
3959         if (kret != KERN_SUCCESS) {
3960                 /*
3961                  * failed to get pagelist
3962                  */
3963                 error = EINVAL;
3964                 goto wait_for_creads;
3965         }
             /*
              * the UPL now has to be released; every entry counted here
              * is aborted in the loop at the bottom of this routine
              */
3966         num_upl++;
3967
3968         if (upl_size < upl_needed_size) {
3969                 /*
3970                  * The upl_size wasn't satisfied.
3971                  */
3972                 error = EINVAL;
3973                 goto wait_for_creads;
3974         }
3975         pl = ubc_upl_pageinfo(upl[cur_upl]);
3976
             /*
              * physical address of the target: first page of the UPL
              * plus the offset into that page
              * NOTE(review): the shift by 12 presumably assumes 4K pages -- confirm
              */
3977         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3978
             /*
              * deal with a start that isn't device block aligned, or with
              * a request smaller than one device block, a piece at a time
              */
3979         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3980                 u_int32_t   head_size;
3981
3982                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3983
3984                 if (head_size > io_size)
3985                         head_size = io_size;
3986
3987                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
3988
3989                 if (error)
3990                         goto wait_for_creads;
3991
3992                 upl_offset += head_size;
3993                 dst_paddr  += head_size;
3994                 io_size    -= head_size;
3995
3996                 iov_base   += head_size;
3997         }
3998         if ((u_int32_t)iov_base & mem_alignment_mask) {
3999                 /*
4000                  * request doesn't set up on a memory boundary
4001                  * the underlying DMA engine can handle...
4002                  * return an error instead of going through
4003                  * the slow copy path since the intent of this
4004                  * path is direct I/O to device memory
4005                  */
4006                 error = EINVAL;
4007                 goto wait_for_creads;
4008         }
4009
             /*
              * hold back any trailing partial device block... it is
              * read at 'wait_for_creads' after the aligned reads are done
              */
4010         tail_size = io_size & (devblocksize - 1);
4011
4012         io_size  -= tail_size;
4013
4014         while (io_size && error == 0) {
4015
4016                 if (io_size > MAX_IO_CONTIG_SIZE)
4017                         xsize = MAX_IO_CONTIG_SIZE;
4018                 else
4019                         xsize = io_size;
4020                 /*
4021                  * request asynchronously so that we can overlap
4022                  * the preparation of the next I/O... we'll do
4023                  * the commit after all the I/O has completed
4024                  * since it's all issued against the same UPL
4025                  * if there are already too many outstanding reads
4026                  * wait until some have completed before issuing the next
4027                  */
4028                 if (iostate.io_issued) {
4029                         lck_mtx_lock(cl_mtxp);
4030
4031                         while ((iostate.io_issued - iostate.io_completed) > (3 * MAX_IO_CONTIG_SIZE)) {
4032                                 iostate.io_wanted = 1;
4033                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL);
4034                         }
4035                         lck_mtx_unlock(cl_mtxp);
4036                 }
4037                 if (iostate.io_error) {
4038                         /*
4039                          * one of the earlier reads we issued ran into a hard error
4040                          * don't issue any more reads...
4041                          * go wait for any other reads to complete before
4042                          * returning the error to the caller
4043                          */
4044                         goto wait_for_creads;
4045                 }
4046                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
4047                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
4048                                    (buf_t)NULL, &iostate, callback, callback_arg);
4049                 /*
4050                  * The cluster_io read was issued successfully,
4051                  * update the uio structure
4052                  */
4053                 if (error == 0) {
4054                         uio_update(uio, (user_size_t)xsize);
4055
4056                         dst_paddr  += xsize;
4057                         upl_offset += xsize;
4058                         io_size    -= xsize;
4059                 }
4060         }
             /*
              * if this vector went out cleanly, left no tail behind, and we
              * still have a free UPL slot and file left to read, see whether
              * the next vector is also contiguous and chain it onto this stream
              */
4061         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
4062
4063                 error = cluster_io_type(uio, read_type, read_length, 0);
4064
4065                 if (error == 0 && *read_type == IO_CONTIG) {
4066                         cur_upl++;
4067                         goto next_cread;
4068                 }
4069         } else
4070                 *read_type = IO_UNKNOWN;
4071
4072 wait_for_creads:
4073         /*
4074          * make sure all async reads that are part of this stream
4075          * have completed before we proceed
4076          */
4077         lck_mtx_lock(cl_mtxp);
4078
4079         while (iostate.io_issued != iostate.io_completed) {
4080                 iostate.io_wanted = 1;
4081                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL);
4082         }
4083         lck_mtx_unlock(cl_mtxp);
4084
             /*
              * an error reported by one of the async reads
              * replaces whatever error value we had
              */
4085         if (iostate.io_error)
4086                 error = iostate.io_error;
4087
             /*
              * the held back tail is handled last,
              * and only if nothing has failed so far
              */
4088         if (error == 0 && tail_size)
4089                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
4090
4091         for (n = 0; n < num_upl; n++)
4092                 /*
4093                  * just release our hold on each physically contiguous
4094                  * region without changing any state
4095                  */
4096                 ubc_upl_abort(upl[n], 0);
4097
4098         return (error);
4099 }
4100
4101
4102 static int
4103 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
4104 {
4105         user_size_t      iov_len;
4106         user_addr_t      iov_base = 0;
4107         upl_t            upl;
4108         vm_size_t        upl_size;
4109         int              upl_flags;
4110         int              retval = 0;
4111
4112         /*
4113          * skip over any emtpy vectors
4114          */
4115         uio_update(uio, (user_size_t)0);
4116
4117         iov_len = uio_curriovlen(uio);
4118
4119         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, (int)uio, (int)iov_len, 0, 0, 0);
4120
4121         if (iov_len) {
4122                 iov_base = uio_curriovbase(uio);
4123                 /*
4124                  * make sure the size of the vector isn't too big...
4125                  * internally, we want to handle all of the I/O in
4126                  * chunk sizes that fit in a 32 bit int
4127                  */
4128                 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
4129                         upl_size = MAX_IO_REQUEST_SIZE;
4130                 else
4131                         upl_size = (u_int32_t)iov_len;
4132
4133                 upl_flags = UPL_QUERY_OBJECT_TYPE;
4134
4135                 if ((vm_map_get_upl(current_map(),
4136                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4137                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
4138                         /*
4139                          * the user app must have passed in an invalid address
4140                          */
4141                         retval = EFAULT;
4142                 }
4143                 if (upl_size == 0)
4144                         retval = EFAULT;
4145
4146                 *io_length = upl_size;
4147
4148                 if (upl_flags & UPL_PHYS_CONTIG)
4149                         *io_type = IO_CONTIG;
4150                 else if (iov_len >= min_length)
4151                         *io_type = IO_DIRECT;
4152                 else
4153                         *io_type = IO_COPY;
4154         } else {
4155                 /*
4156                  * nothing left to do for this uio
4157                  */
4158                 *io_length = 0;
4159                 *io_type   = IO_UNKNOWN;
4160         }
4161         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, (int)iov_base, *io_type, *io_length, retval, 0);
4162
4163         return (retval);
4164 }
4165
4166
4167 /*
4168  * generate advisory I/O's in the largest chunks possible
4169  * the completed pages will be released into the VM cache
4170  */
4171 int
4172 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
4173 {
4174         return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
4175 }
4176
4177 int
4178 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
4179 {
4180         upl_page_info_t *pl;
4181         upl_t            upl;
4182         vm_offset_t      upl_offset;
4183         int              upl_size;
4184         off_t            upl_f_offset;
4185         int              start_offset;
4186         int              start_pg;
4187         int              last_pg;
4188         int              pages_in_upl;
4189         off_t            max_size;
4190         int              io_size;
4191         kern_return_t    kret;
4192         int              retval = 0;
4193         int              issued_io;
4194         int              skip_range;
4195
4196         if ( !UBCINFOEXISTS(vp))
4197                 return(EINVAL);
4198
4199         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
4200                      (int)f_offset, resid, (int)filesize, 0, 0);
4201
4202         while (resid && f_offset < filesize && retval == 0) {
4203                 /*
4204                  * compute the size of the upl needed to encompass
4205                  * the requested read... limit each call to cluster_io
4206                  * to the maximum UPL size... cluster_io will clip if
4207                  * this exceeds the maximum io_size for the device,
4208                  * make sure to account for
4209                  * a starting offset that's not page aligned
4210                  */
4211                 start_offset = (int)(f_offset & PAGE_MASK_64);
4212                 upl_f_offset = f_offset - (off_t)start_offset;
4213                 max_size     = filesize - f_offset;
4214
4215                 if (resid < max_size)
4216                         io_size = resid;
4217                 else
4218                         io_size = max_size;
4219
4220                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4221                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
4222                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
4223
4224                 skip_range = 0;
4225                 /*
4226                  * return the number of contiguously present pages in the cache
4227                  * starting at upl_f_offset within the file
4228                  */
4229                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
4230
4231                 if (skip_range) {
4232                         /*
4233                          * skip over pages already present in the cache
4234                          */
4235                         io_size = skip_range - start_offset;
4236
4237                         f_offset += io_size;
4238                         resid    -= io_size;
4239
4240                         if (skip_range == upl_size)
4241                                 continue;
4242                         /*
4243                          * have to issue some real I/O
4244                          * at this point, we know it's starting on a page boundary
4245                          * because we've skipped over at least the first page in the request
4246                          */
4247                         start_offset = 0;
4248                         upl_f_offset += skip_range;
4249                         upl_size     -= skip_range;
4250                 }
4251                 pages_in_upl = upl_size / PAGE_SIZE;
4252
4253                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
4254                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
4255
4256                 kret = ubc_create_upl(vp,
4257                                       upl_f_offset,
4258                                       upl_size,
4259                                       &upl,
4260                                       &pl,
4261                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
4262                 if (kret != KERN_SUCCESS)
4263                         return(retval);
4264                 issued_io = 0;
4265
4266                 /*
4267                  * before we start marching forward, we must make sure we end on
4268                  * a present page, otherwise we will be working with a freed
4269                  * upl
4270                  */
4271                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4272                         if (upl_page_present(pl, last_pg))
4273                                 break;
4274                 }
4275                 pages_in_upl = last_pg + 1;
4276
4277
4278                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
4279                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
4280
4281
4282                 for (last_pg = 0; last_pg < pages_in_upl; ) {
4283                         /*
4284                          * scan from the beginning of the upl looking for the first
4285                          * page that is present.... this will become the first page in
4286                          * the request we're going to make to 'cluster_io'... if all
4287                          * of the pages are absent, we won't call through to 'cluster_io'
4288                          */
4289                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4290                                 if (upl_page_present(pl, start_pg))
4291                                         break;
4292                         }
4293
4294                         /*
4295                          * scan from the starting present page looking for an absent
4296                          * page before the end of the upl is reached, if we
4297                          * find one, then it will terminate the range of pages being
4298                          * presented to 'cluster_io'
4299                          */
4300                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4301                                 if (!upl_page_present(pl, last_pg))
4302                                         break;
4303                         }
4304
4305                         if (last_pg > start_pg) {
4306                                 /*
4307                                  * we found a range of pages that must be filled
4308                                  * if the last page in this range is the last page of the file
4309                                  * we may have to clip the size of it to keep from reading past
4310                                  * the end of the last physical block associated with the file
4311                                  */
4312                                 upl_offset = start_pg * PAGE_SIZE;
4313                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
4314
4315                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
4316                                         io_size = filesize - (upl_f_offset + upl_offset);
4317
4318                                 /*
4319                                  * issue an asynchronous read to cluster_io
4320                                  */
4321                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4322                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4323
4324                                 issued_io = 1;
4325                         }
4326                 }
4327                 if (issued_io == 0)
4328                         ubc_upl_abort(upl, 0);
4329
4330                 io_size = upl_size - start_offset;
4331
4332                 if (io_size > resid)
4333                         io_size = resid;
4334                 f_offset += io_size;
4335                 resid    -= io_size;
4336         }
4337
4338         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
4339                      (int)f_offset, resid, retval, 0, 0);
4340
4341         return(retval);
4342 }
4343
4344
4345 int
4346 cluster_push(vnode_t vp, int flags)
4347 {
4348         return cluster_push_ext(vp, flags, NULL, NULL);
4349 }
4350
4351
4352 int
4353 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4354 {
4355         int     retval;
4356         struct  cl_writebehind *wbp;
4357
4358         if ( !UBCINFOEXISTS(vp)) {
4359                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
4360                 return (0);
4361         }
4362         /* return if deferred write is set */
4363         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
4364                 return (0);
4365         }
4366         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
4367                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
4368                 return (0);
4369         }
4370         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
4371                 lck_mtx_unlock(&wbp->cl_lockw);
4372
4373                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
4374                 return(0);
4375         }
4376         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
4377                      (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
4378
4379         if (wbp->cl_scmap) {
4380                 sparse_cluster_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg);
4381
4382                 retval = 1;
4383         } else
4384                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg);
4385
4386         lck_mtx_unlock(&wbp->cl_lockw);
4387
4388         if (flags & IO_SYNC)
4389                 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
4390
4391         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
4392                      (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
4393
4394         return (retval);
4395 }
4396
4397
4398 __private_extern__ void
4399 cluster_release(struct ubc_info *ubc)
4400 {
4401         struct cl_writebehind *wbp;
4402         struct cl_readahead   *rap;
4403
4404         if ((wbp = ubc->cl_wbehind)) {
4405
4406                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4407
4408                 if (wbp->cl_scmap)
4409                         vfs_drt_control(&(wbp->cl_scmap), 0);
4410         } else {
4411                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
4412         }
4413
4414         rap = ubc->cl_rahead;
4415
4416         if (wbp != NULL) {
4417                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
4418                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
4419         }
4420         if ((rap = ubc->cl_rahead)) {
4421                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
4422                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
4423         }
4424         ubc->cl_rahead  = NULL;
4425         ubc->cl_wbehind = NULL;
4426
4427         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
4428 }
4429
4430
/*
 * cluster_try_push
 *
 * Push the clusters currently recorded in a write-behind context.
 *
 * wbp           write-behind context; its cl_lockw must already be held.
 *               The lock is dropped while the I/O is being issued and is
 *               held again by the time this routine returns.
 * vp, EOF       handed on to cluster_push_now() and sparse_cluster_switch()
 * push_flag     PUSH_ALL   - push every cluster rather than just the first
 *                            one of the sorted copy
 *               PUSH_DELAY - when all MAX_CLUSTERS slots are in use, push
 *                            nothing unless the clusters look like
 *                            sequential writes
 *               PUSH_SYNC  - issue the pushes with IO_SYNC
 *               IO_PASSIVE - issue the pushes with IO_PASSIVE
 * callback,
 * callback_arg  passed through to the routines that issue the I/O
 *
 * Returns the number of free slots left in wbp->cl_clusters.
 */
4431 static int
4432 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg)
4433 {
4434         int cl_index;
4435         int cl_index1;
4436         int min_index;
4437         int cl_len;
4438         int cl_pushed = 0;
4439         struct cl_wextent l_clusters[MAX_CLUSTERS];
4440
4441         /*
4442          * the write behind context exists and has
4443          * already been locked...
4444          */
4445         if (wbp->cl_number == 0)
4446                 /*
4447                  * no clusters to push
4448                  * return number of empty slots
4449                  */
4450                 return (MAX_CLUSTERS);
4451
4452         /*
4453          * make a local 'sorted' copy of the clusters
4454          * and clear wbp->cl_number so that new clusters can
4455          * be developed
4456          */
4457         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
                     /*
                      * find the non-empty cluster with the lowest b_addr
                      * that hasn't been copied yet
                      */
4458                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
4459                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
4460                                 continue;
4461                         if (min_index == -1)
4462                                 min_index = cl_index1;
4463                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
4464                                 min_index = cl_index1;
4465                 }
4466                 if (min_index == -1)
4467                         break;
4468                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
4469                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
4470                 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
4471
                     /*
                      * mark the source entry empty so that
                      * the next pass will skip over it
                      */
4472                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
4473         }
4474         wbp->cl_number = 0;
4475
4476         cl_len = cl_index;
4477
4478         if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) {
4479                 int   i;
4480
4481                 /*
4482                  * determine if we appear to be writing the file sequentially
4483                  * if not, by returning without having pushed any clusters
4484                  * we will cause this vnode to be pushed into the sparse cluster mechanism
4485                  * used for managing more random I/O patterns
4486                  *
4487                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
4488                  * that's why we're in try_push with PUSH_DELAY...
4489                  *
4490                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
4491                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
4492                  * so we can just make a simple pass through, up to, but not including the last one...
4493                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
4494                  * are sequential
4495                  *
4496                  * we let the last one be partial as long as it was adjacent to the previous one...
4497                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
4498                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
4499                  */
4500                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
4501                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_CLUSTER_SIZE)
4502                                 goto dont_try;
4503                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
4504                                 goto dont_try;
4505                 }
4506         }
4507         /*
4508          * drop the lock while we're firing off the I/Os...
4509          * this is safe since I'm working off of a private sorted copy
4510          * of the clusters, and I'm going to re-evaluate the public
4511          * state after I retake the lock
4512          *
4513          * we need to drop it to avoid a lock inversion when trying to
4514          * grab pages into the UPL... another thread in 'write' may
4515          * have these pages in its UPL and be blocked trying to
4516          * gain the write-behind lock for this vnode
4517          */
4518         lck_mtx_unlock(&wbp->cl_lockw);
4519
4520         for (cl_index = 0; cl_index < cl_len; cl_index++) {
4521                 int     flags;
4522                 struct  cl_extent cl;
4523
4524                 /*
4525                  * try to push each cluster in turn...
4526                  */
4527                 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
4528                         flags = IO_NOCACHE;
4529                 else
4530                         flags = 0;
4531
4532                 if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE))
4533                         flags |= IO_PASSIVE;
4534
4535                 if (push_flag & PUSH_SYNC)
4536                         flags |= IO_SYNC;
4537
4538                 cl.b_addr = l_clusters[cl_index].b_addr;
4539                 cl.e_addr = l_clusters[cl_index].e_addr;
4540
4541                 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
4542
                     /*
                      * empty out the local entry so that the merge
                      * code below won't put it back into the vnode
                      */
4543                 l_clusters[cl_index].b_addr = 0;
4544                 l_clusters[cl_index].e_addr = 0;
4545
4546                 cl_pushed++;
4547
                     /*
                      * without PUSH_ALL only the first cluster
                      * of the sorted copy gets pushed
                      */
4548                 if ( !(push_flag & PUSH_ALL) )
4549                         break;
4550         }
4551         lck_mtx_lock(&wbp->cl_lockw);
4552
             /*
              * the lock is held on both ways of getting here... either we
              * jumped from the PUSH_DELAY check without ever dropping it
              * (and with nothing pushed), or we just retook it above
              */
4553 dont_try:
4554         if (cl_len > cl_pushed) {
4555                /*
4556                 * we didn't push all of the clusters, so
4557                 * lets try to merge them back in to the vnode
4558                 */
4559                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
4560                         /*
4561                          * we picked up some new clusters while we were trying to
4562                          * push the old ones... this can happen because I've dropped
4563                          * the vnode lock... the sum of the
4564                          * leftovers plus the new cluster count exceeds our ability
4565                          * to represent them, so switch to the sparse cluster mechanism
4566                          *
4567                          * collect the active public clusters...
4568                          */
4569                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
4570
4571                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
4572                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
4573                                         continue;
4574                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
4575                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
4576                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
4577
4578                                 cl_index1++;
4579                         }
4580                         /*
4581                          * update the cluster count
4582                          */
4583                         wbp->cl_number = cl_index1;
4584
4585                         /*
4586                          * and collect the original clusters that were moved into the
4587                          * local storage for sorting purposes
4588                          */
4589                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
4590
4591                 } else {
4592                         /*
4593                          * we've got room to merge the leftovers back in
4594                          * just append them starting at the next 'hole'
4595                          * represented by wbp->cl_number
4596                          */
4597                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
4598                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
4599                                         continue;
4600
4601                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
4602                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
4603                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
4604
4605                                 cl_index1++;
4606                         }
4607                         /*
4608                          * update the cluster count
4609                          */
4610                         wbp->cl_number = cl_index1;
4611                 }
4612         }
4613         return (MAX_CLUSTERS - wbp->cl_number);
4614 }
4615
4616
4617
/*
 * cluster_push_now
 *
 * Write out the dirty pages in the page range described by 'cl'
 * (cl->b_addr up to, but not including, cl->e_addr, in page units).
 *
 * The range is clipped to EOF, a UPL covering it is created with
 * UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY, and every run of consecutive
 * dirty pages in that UPL is handed to cluster_io() as one write.
 *
 * vp            vnode whose pages are being pushed
 * cl            extent to push, in page numbers
 * EOF           file size in bytes; nothing at or beyond it is written
 * flags         IO_PASSIVE is translated to CL_PASSIVE, IO_SYNC suppresses
 *               CL_ASYNC, IO_NOCACHE adds UPL_WILL_BE_DUMPED to the UPL flags
 * callback,
 * callback_arg  passed through unchanged to cluster_io()
 *
 * Returns 0 when there was nothing to push, otherwise the first non-zero
 * value returned by cluster_io() (0 if every call succeeded).
 * Panics if the UPL cannot be created.
 */
4618 static int
4619 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4620 {
4621         upl_page_info_t *pl;
4622         upl_t            upl;
4623         vm_offset_t      upl_offset;
4624         int              upl_size;
4625         off_t            upl_f_offset;
4626         int              pages_in_upl;
4627         int              start_pg;
4628         int              last_pg;
4629         int              io_size;
4630         int              io_flags;
4631         int              upl_flags;
4632         int              bflag;
4633         int              size;
4634         int              error = 0;
4635         int              retval;
4636         kern_return_t    kret;
4637
4638         if (flags & IO_PASSIVE)
4639             bflag = CL_PASSIVE;
4640         else
4641             bflag = 0;
4642
4643         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4644                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4645
             /* an empty extent has nothing to push */
4646         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4647                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4648
4649                 return (0);
4650         }
4651         upl_size = pages_in_upl * PAGE_SIZE;
4652         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4653
             /*
              * the extent reaches or crosses EOF: clip 'size' (bytes to write)
              * to EOF and shrink the UPL to the pages that cover it
              */
4654         if (upl_f_offset + upl_size >= EOF) {
4655
4656                 if (upl_f_offset >= EOF) {
4657                         /*
4658                          * must have truncated the file and missed
4659                          * clearing a dangling cluster (i.e. it's completely
4660                          * beyond the new EOF
4661                          */
4662                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4663
4664                         return(0);
4665                 }
4666                 size = EOF - upl_f_offset;
4667
                     /* round the byte count up to a whole number of pages */
4668                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4669                 pages_in_upl = upl_size / PAGE_SIZE;
4670         } else
4671                 size = upl_size;
4672
4673         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4674
4675         /*
4676          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4677          *
4678          * - only pages that are currently dirty are returned... these are the ones we need to clean
4679          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4680          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4681          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4682          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
4683          *
4684          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4685          */
4686
4687         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4688                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4689         else
4690                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4691
4692         kret = ubc_create_upl(vp,
4693                                 upl_f_offset,
4694                                 upl_size,
4695                                 &upl,
4696                                 &pl,
4697                                 upl_flags);
4698         if (kret != KERN_SUCCESS)
4699                 panic("cluster_push: failed to get pagelist");
4700
4701         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4702
4703         /*
4704          * since we only asked for the dirty pages back
4705          * it's possible that we may only get a few or even none, so...
4706          * before we start marching forward, we must make sure we know
4707          * where the last present page is in the UPL, otherwise we could
4708          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4709          * employed by commit_range and abort_range.
4710          */
4711         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4712                 if (upl_page_present(pl, last_pg))
4713                         break;
4714         }
4715         pages_in_upl = last_pg + 1;
4716
             /* no page is present in the UPL: release it and report success */
4717         if (pages_in_upl == 0) {
4718                 ubc_upl_abort(upl, 0);
4719
4720                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4721                 return(0);
4722         }
4723
             /*
              * walk the UPL; each pass skips the non-dirty pages, then issues
              * one write for the next run of consecutive dirty pages
              */
4724         for (last_pg = 0; last_pg < pages_in_upl; ) {
4725                 /*
4726                  * find the next dirty page in the UPL
4727                  * this will become the first page in the
4728                  * next I/O to generate
4729                  */
4730                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4731                         if (upl_dirty_page(pl, start_pg))
4732                                 break;
4733                         if (upl_page_present(pl, start_pg))
4734                                 /*
4735                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4736                                  * just release these unchanged since we're not going
4737                                  * to steal them or change their state
4738                                  */
4739                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4740                 }
4741                 if (start_pg >= pages_in_upl)
4742                         /*
4743                          * done... no more dirty pages to push
4744                          */
4745                         break;
                     /* 'size' counts the bytes still ahead of us; drop the skipped pages from it */
4746                 if (start_pg > last_pg)
4747                         /*
4748                          * skipped over some non-dirty pages
4749                          */
4750                         size -= ((start_pg - last_pg) * PAGE_SIZE);
4751
4752                 /*
4753                  * find a range of dirty pages to write
4754                  */
4755                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4756                         if (!upl_dirty_page(pl, last_pg))
4757                                 break;
4758                 }
4759                 upl_offset = start_pg * PAGE_SIZE;
4760
                     /* cap the run at the bytes remaining ('size' was clipped to EOF above) */
4761                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4762
4763                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
4764
4765                 if ( !(flags & IO_SYNC))
4766                         io_flags |= CL_ASYNC;
4767
4768                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4769                                     io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4770
                     /* remember only the first failure, but keep pushing the remaining runs */
4771                 if (error == 0 && retval)
4772                         error = retval;
4773
4774                 size -= io_size;
4775         }
4776         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4777
4778         return(error);
4779 }
4780
4781
4782 /*
4783  * sparse_cluster_switch is called with the write behind lock held
4784  */
/*
 * Move the vnode's current write-behind clusters into the sparse
 * cluster map.
 *
 * Every page of every cluster in wbp->cl_clusters[0 .. cl_number-1] is
 * queried with ubc_page_op(); each page reported as UPL_POP_DIRTY is
 * added to the map, one page at a time, through sparse_cluster_add().
 * On return wbp->cl_number is 0.
 *
 * EOF, callback and callback_arg are only passed on to
 * sparse_cluster_add().
 */
4785 static void
4786 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
4787 {
4788         int     cl_index;
4789
4790         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4791
             /* no sparse map exists yet: start its dirty page count from zero */
4792         if (wbp->cl_scmap == NULL)
4793                 wbp->cl_scdirty = 0;
4794
4795         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4796                 int       flags;
4797                 struct cl_extent cl;
4798
                     /* 'cl' is reused as a one page extent that slides across the cluster */
4799                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4800
                             /*
                              * pages for which ubc_page_op() fails, or that are
                              * not dirty, are not carried over into the map
                              */
4801                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
4802                                 if (flags & UPL_POP_DIRTY) {
4803                                         cl.e_addr = cl.b_addr + 1;
4804
4805                                         sparse_cluster_add(wbp, vp, &cl, EOF, callback, callback_arg);
4806                                 }
4807                         }
4808                 }
4809         }
             /* the cluster array has been drained */
4810         wbp->cl_number = 0;
4811
4812         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4813 }
4814
4815
4816 /*
4817  * sparse_cluster_push is called with the write behind lock held
4818  */
/*
 * Push dirty pages recorded in the sparse cluster map (wbp->cl_scmap).
 *
 * Clusters are taken from the map with vfs_drt_get_cluster() and written
 * with cluster_push_now().  Without PUSH_ALL in push_flag at most one
 * cluster is pushed; with PUSH_ALL the loop continues until
 * vfs_drt_get_cluster() stops returning KERN_SUCCESS (presumably when the
 * map has no more clusters -- confirm against vfs_drt_get_cluster).
 *
 * The write-behind lock is dropped around every cluster_push_now() call
 * and is held again when this function returns.
 */
4819 static void
4820 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg)
4821 {
4822         struct cl_extent cl;
4823         off_t           offset;
4824         u_int           length;
4825
4826         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_flag, 0);
4827
             /* NOTE(review): the meaning of op code 1 is defined by vfs_drt_control(), outside this view */
4828         if (push_flag & PUSH_ALL)
4829                 vfs_drt_control(&(wbp->cl_scmap), 1);
4830
4831         for (;;) {
4832                 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4833                         break;
4834
                     /* convert the byte range handed back by the map into page numbers */
4835                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4836                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4837
                     /* these pages have left the map, so take them out of its dirty count */
4838                 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4839
4840                 /*
4841                  * drop the lock while we're firing off the I/Os...
4842                  * this is safe since I've already updated the state
4843                  * this lock is protecting and I'm going to re-evaluate
4844                  * the public state after I retake the lock
4845                  *
4846                  * we need to drop it to avoid a lock inversion when trying to
4847                  * grab pages into the UPL... another thread in 'write' may
4848                  * have these pages in its UPL and be blocked trying to
4849                  * gain the write-behind lock for this vnode
4850                  */
4851                 lck_mtx_unlock(&wbp->cl_lockw);
4852
                     /* only the IO_PASSIVE bit of push_flag is forwarded as I/O flags */
4853                 cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg);
4854
4855                 lck_mtx_lock(&wbp->cl_lockw);
4856
4857                 if ( !(push_flag & PUSH_ALL) )
4858                         break;
4859         }
4860         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4861 }
4862
4863
4864 /*
4865  * sparse_cluster_add is called with the write behind lock held
4866  */
/*
 * Record the pages of extent 'cl' (page numbers b_addr .. e_addr-1) as
 * dirty in the sparse cluster map and add them to wbp->cl_scdirty.
 *
 * vfs_drt_mark_pages() is retried until it returns KERN_SUCCESS.  After
 * each failure the pages it did mark (new_dirty) are counted, at most one
 * cluster is pushed with sparse_cluster_push() to make room, and the call
 * is repeated for the part of the range that is still unmarked.
 *
 * sparse_cluster_push() releases and re-takes the write-behind lock, so
 * the lock may be dropped temporarily during this call.
 */
4867 static void
4868 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
4869 {
4870         u_int   new_dirty;
4871         u_int   length;
4872         off_t   offset;
4873
4874         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4875
             /* byte offset and byte length of the extent */
4876         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4877         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4878
4879         while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4880                 /*
4881                  * no room left in the map
4882                  * only a partial update was done
4883                  * push out some pages and try again
4884                  */
4885                 wbp->cl_scdirty += new_dirty;
4886
4887                 sparse_cluster_push(wbp, vp, EOF, 0, callback, callback_arg);
4888
                     /* step past the pages that were marked on this attempt */
4889                 offset += (new_dirty * PAGE_SIZE_64);
4890                 length -= (new_dirty * PAGE_SIZE);
4891         }
             /* count the pages marked by the final, successful call */
4892         wbp->cl_scdirty += new_dirty;
4893
4894         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4895 }
4896
4897
/*
 * cluster_align_phys_io
 *
 * Transfer 'xsize' bytes between the physical address 'usr_paddr' and the
 * single cached page of 'vp' that contains uio->uio_offset.
 *
 * A one page UPL is created for that page.  If the page is not valid it
 * is first read in through cluster_io().  The data is then moved with
 * copypv(): from the cached page to usr_paddr when CL_READ is set in
 * 'flags', otherwise from usr_paddr to the cached page.  For a write, or
 * for a read of a page that is valid and dirty, the page is then written
 * out through cluster_io().  When no error occurred the uio is advanced
 * by xsize.  The UPL is always released before returning.
 *
 * 'flags' is tested for both IO_PASSIVE (forwarded as CL_PASSIVE) and
 * CL_READ.  callback and callback_arg are passed on to cluster_io().
 *
 * Returns 0 on success, EINVAL if the UPL cannot be created, otherwise
 * the error returned by cluster_io().
 */
4898 static int
4899 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4900 {
4901         upl_page_info_t  *pl;
4902         upl_t            upl;
4903         addr64_t         ubc_paddr;
4904         kern_return_t    kret;
4905         int              error = 0;
4906         int              did_read = 0;
4907         int              abort_flags;
4908         int              upl_flags;
4909         int              bflag;
4910
4911         if (flags & IO_PASSIVE)
4912             bflag = CL_PASSIVE;
4913         else
4914             bflag = 0;
4915
4916         upl_flags = UPL_SET_LITE;
4917
4918         if ( !(flags & CL_READ) ) {
4919                 /*
4920                  * "write" operation:  let the UPL subsystem know
4921                  * that we intend to modify the buffer cache pages
4922                  * we're gathering.
4923                  */
4924                 upl_flags |= UPL_WILL_MODIFY;
4925         } else {
4926                 /*
4927                  * indicate that there is no need to pull the
4928                  * mapping for this page... we're only going
4929                  * to read from it, not modify it.
4930                  */
4931                 upl_flags |= UPL_FILE_IO;
4932         }
             /* the UPL covers exactly the page that contains uio_offset */
4933         kret = ubc_create_upl(vp,
4934                               uio->uio_offset & ~PAGE_MASK_64,
4935                               PAGE_SIZE,
4936                               &upl,
4937                               &pl,
4938                               upl_flags);
4939
4940         if (kret != KERN_SUCCESS)
4941                 return(EINVAL);
4942
4943         if (!upl_valid_page(pl, 0)) {
4944                 /*
4945                  * issue a synchronous read to cluster_io
4946                  */
4947                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4948                                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4949                 if (error) {
4950                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4951
4952                           return(error);
4953                 }
4954                 did_read = 1;
4955         }
             /*
              * physical address of the byte at uio_offset inside the cached page
              * NOTE(review): the shift by 12 presumably turns the page number from
              * upl_phys_page() into a byte address assuming 4KB pages -- confirm
              */
4956         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4957
4958 /*
4959  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
4960  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4961  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
4962  *      way to do so without exporting them to kexts as well.
4963  */
4964         if (flags & CL_READ)
4965 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
4966                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
4967         else
4968 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
4969                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
4970
4971         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4972                 /*
4973                  * issue a synchronous write to cluster_io
4974                  */
4975                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4976                                    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4977         }
4978         if (error == 0)
4979                 uio_update(uio, (user_size_t)xsize);
4980
             /*
              * a page this routine had to read in is released without
              * UPL_ABORT_DUMP_PAGES; a page that was already valid is
              * released with it
              */
4981         if (did_read)
4982                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4983         else
4984                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4985
4986         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4987
4988         return (error);
4989 }
4990
4991
4992
/*
 * cluster_copy_upl_data
 *
 * Move *io_resid bytes between 'uio' and the pages of 'upl', starting
 * 'upl_offset' bytes into the UPL.  The transfer is done one page (or
 * partial page) at a time with uiomove64() on each page's physical
 * address.
 *
 * uio->uio_segflg is switched to the matching UIO_PHYS_* value for the
 * duration of the copy and restored to its original value before
 * returning.
 *
 * On return *io_resid holds the loop's remaining byte count.  The return
 * value is 0 when every uiomove64() call succeeded, otherwise the first
 * non-zero value it returned.  Note that the chunk whose uiomove64() call
 * failed has already been subtracted from *io_resid.
 */
4993 int
4994 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
4995 {
4996         int       pg_offset;
4997         int       pg_index;
4998         int       csize;
4999         int       segflg;
5000         int       retval = 0;
5001         int       xsize;
5002         upl_page_info_t *pl;
5003
5004         xsize = *io_resid;
5005
5006         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5007                      (int)uio->uio_offset, upl_offset, xsize, 0, 0);
5008
             /* remember the caller's segment flag so it can be restored on exit */
5009         segflg = uio->uio_segflg;
5010
             /* any segflg value not listed below is left unchanged */
5011         switch(segflg) {
5012
5013           case UIO_USERSPACE32:
5014           case UIO_USERISPACE32:
5015                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5016                 break;
5017
5018           case UIO_USERSPACE:
5019           case UIO_USERISPACE:
5020                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5021                 break;
5022
5023           case UIO_USERSPACE64:
5024           case UIO_USERISPACE64:
5025                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5026                 break;
5027
5028           case UIO_SYSSPACE32:
5029                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
5030                 break;
5031
5032           case UIO_SYSSPACE:
5033                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5034                 break;
5035
5036           case UIO_SYSSPACE64:
5037                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
5038                 break;
5039         }
5040         pl = ubc_upl_pageinfo(upl);
5041
             /*
              * split the starting offset into a page index and an offset inside
              * that page; the first chunk is limited to the rest of that page
              */
5042         pg_index  = upl_offset / PAGE_SIZE;
5043         pg_offset = upl_offset & PAGE_MASK;
5044         csize     = min(PAGE_SIZE - pg_offset, xsize);
5045
5046         while (xsize && retval == 0) {
5047                 addr64_t  paddr;
5048
                     /*
                      * NOTE(review): the shift by 12 presumably turns the page number
                      * from upl_phys_page() into a byte address assuming 4KB pages -- confirm
                      */
5049                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
5050
5051                 retval = uiomove64(paddr, csize, uio);
5052
                     /* every chunk after the first starts at the beginning of a page */
5053                 pg_index += 1;
5054                 pg_offset = 0;
5055                 xsize    -= csize;
5056                 csize     = min(PAGE_SIZE, xsize);
5057         }
5058         *io_resid = xsize;
5059
5060         uio->uio_segflg = segflg;
5061
5062         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5063                      (int)uio->uio_offset, xsize, retval, segflg, 0);
5064
5065         return (retval);
5066 }
5067
5068
/*
 * cluster_copy_ubc_data
 *
 * Non-static entry point for cluster_copy_ubc_data_internal(): forwards
 * vp, uio, io_resid and mark_dirty unchanged and always passes 1 for
 * take_reference.  Returns whatever the internal routine returns.
 */
5069 int
5070 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
5071 {
5072
5073         return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
5074 }
5075
5076
/*
 * cluster_copy_ubc_data_internal
 *
 * Move up to *io_resid bytes between 'uio' and the pages of 'vp' by
 * calling memory_object_control_uiomove() on the vnode's memory object
 * control, starting at uio->uio_offset.
 *
 * uio->uio_segflg is switched to the matching UIO_PHYS_* value for the
 * copy and restored afterwards.  mark_dirty and take_reference are passed
 * through unchanged.
 *
 * On return *io_resid has been reduced by the number of bytes by which
 * the uio's residual count dropped.  Returns the value from
 * memory_object_control_uiomove() (0 if *io_resid was 0).  When the vnode
 * has no memory object control, returns 0 and leaves *io_resid and the
 * uio untouched.
 */
5077 static int
5078 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
5079 {
5080         int       segflg;
5081         int       io_size;
5082         int       xsize;
5083         int       start_offset;
5084         int       retval = 0;
5085         memory_object_control_t  control;
5086
5087         io_size = *io_resid;
5088
5089         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5090                      (int)uio->uio_offset, 0, io_size, 0, 0);
5091
5092         control = ubc_getobject(vp, UBC_FLAGS_NONE);
5093
             /* without a memory object control there is nothing to copy from or to */
5094         if (control == MEMORY_OBJECT_CONTROL_NULL) {
5095                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5096                              (int)uio->uio_offset, io_size, retval, 3, 0);
5097
5098                 return(0);
5099         }
             /* remember the caller's segment flag so it can be restored on exit */
5100         segflg = uio->uio_segflg;
5101
             /* any segflg value not listed below is left unchanged */
5102         switch(segflg) {
5103
5104           case UIO_USERSPACE32:
5105           case UIO_USERISPACE32:
5106                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5107                 break;
5108
5109           case UIO_USERSPACE64:
5110           case UIO_USERISPACE64:
5111                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5112                 break;
5113
5114           case UIO_SYSSPACE32:
5115                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
5116                 break;
5117
5118           case UIO_SYSSPACE64:
5119                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
5120                 break;
5121
5122           case UIO_USERSPACE:
5123           case UIO_USERISPACE:
5124                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5125                 break;
5126
5127           case UIO_SYSSPACE:
5128                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5129                 break;
5130         }
5131
             /* io_size was already loaded from *io_resid above; this reload is redundant but harmless */
5132         if ( (io_size = *io_resid) ) {
                     /* the copy is described as a page-aligned file offset plus an offset into that page */
5133                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
5134                 xsize = uio_resid(uio);
5135
5136                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
5137                                                        start_offset, io_size, mark_dirty, take_reference);
                     /* bytes moved = residual count before the call minus residual count after it */
5138                 xsize -= uio_resid(uio);
5139                 io_size -= xsize;
5140         }
5141         uio->uio_segflg = segflg;
5142         *io_resid       = io_size;
5143
5144         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5145                      (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
5146
5147         return(retval);
5148 }
5149
5150
5151 int
5152 is_file_clean(vnode_t vp, off_t filesize)
5153 {
5154         off_t f_offset;
5155         int   flags;
5156         int   total_dirty = 0;
5157
5158         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
5159                 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
5160                         if (flags & UPL_POP_DIRTY) {
5161                                 total_dirty++;
5162                         }
5163                 }
5164         }
5165         if (total_dirty)
5166                 return(EINVAL);
5167
5168         return (0);
5169 }
5170
5171
5172
5173 /*
5174  * Dirty region tracking/clustering mechanism.
5175  *
5176  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
5177  * dirty regions within a larger space (file).  It is primarily intended to
5178  * support clustering in large files with many dirty areas.
5179  *
5180  * The implementation assumes that the dirty regions are pages.
5181  *
5182  * To represent dirty pages within the file, we store bit vectors in a
5183  * variable-size circular hash.
5184  */
5185
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES             256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 * (256 pages of 4096 bytes == 1 << 20).
 *
 * The mask is a negative int constant; when it is combined with a 64-bit
 * operand it is sign-extended, so the upper 32 bits of the operand are
 * preserved.
 */
#define DRT_ADDRESS_MASK                (~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space: dhe_control holds the aligned file offset in the bits selected
 * by DRT_ADDRESS_MASK and a page count in the bits selected by
 * DRT_HASH_COUNT_MASK.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  That value is
 * DRT_HASH_COUNT_MASK itself (see DRT_HASH_VACATE / DRT_HASH_VACANT).
 *
 * The statement-like macros are wrapped in do { } while (0) without a
 * trailing semicolon so that they behave as a single statement, e.g. in
 * an unbraced if/else.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
        } while (0)
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
                    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
        } while (0)
#define DRT_HASH_CLEAR(scm, i)                                                                                          \
        do {                                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = 0;                                                              \
        } while (0)
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/* copy both the control word and the bitvector of entry 'oi' of 'oscm' into entry 'i' of 'scm' */
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
        do {                                                                                            \
                (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
                DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
        } while (0)


/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes
 * (an 8 byte control word plus 256 / 8 bytes of bitvector).
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 *
 * The "bytes spare" figures below are the allocation size minus
 * modulus * 40; the struct vfs_drt_clustermap header comes out of them.
 */
#define DRT_HASH_SMALL_MODULUS  23
#define DRT_HASH_LARGE_MODULUS  401

#define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare */
#define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare */

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.  The single-bit mask is built from
 * an unsigned constant so that selecting bit 31 of a word is well
 * defined.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)                           \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

#define DRT_HASH_TEST_BIT(scm, i, bit)                          \
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)                             \
        bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
        bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
            &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
            (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))



/*
 * Hashtable entry.
 *
 * dhe_control packs the aligned file offset and the page count (see the
 * DRT_HASH_GET_* / DRT_HASH_SET_* macros); dhe_bitvector has one bit per page
 * of the DRT_BITVECTOR_PAGES pages covered by the entry.
 */
struct vfs_drt_hashentry {
        u_int64_t       dhe_control;
        u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */

struct vfs_drt_clustermap {
        u_int32_t               scm_magic;      /* sanity/detection */
#define DRT_SCM_MAGIC           0x12020003
        u_int32_t               scm_modulus;    /* current ring size */
        u_int32_t               scm_buckets;    /* number of occupied buckets */
        u_int32_t               scm_lastclean;  /* last entry we cleaned */
        u_int32_t               scm_iskips;     /* number of slot skips */

        struct vfs_drt_hashentry scm_hashtable[0];
};


/* bucket index for 'addr', and the index that follows it on the ring */
#define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
5315
5316 /*
5317  * Debugging codes and arguments.
5318  */
/*
 * Each value is a DBG_FSRW trace code built with FSDBG_CODE(); the comment
 * to the right of a code lists the arguments traced with it.
 * NOTE(review): the "0, ...", "1 ...", "2 ...", "3, ..." lines under
 * DRT_DEBUG_MARK appear to describe the argument sets of its different
 * trace points -- confirm against the code that emits DRT_DEBUG_MARK.
 */
5319 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
5320 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
5321 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
5322 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
5323 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
5324                                                             * dirty */
5325                                                            /* 0, setcount */
5326                                                            /* 1 (clean, no map) */
5327                                                            /* 2 (map alloc fail) */
5328                                                            /* 3, resid (partial) */
5329 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
5330 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
5331                                                             * lastclean, iskips */
5332
5333
5334 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
5335 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
5336 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
5337         u_int64_t offset, int *indexp);
5338 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
5339         u_int64_t offset,
5340         int *indexp,
5341         int recursed);
5342 static kern_return_t    vfs_drt_do_mark_pages(
5343         void            **cmapp,
5344         u_int64_t       offset,
5345         u_int           length,
5346         u_int           *setcountp,
5347         int             dirty);
5348 static void             vfs_drt_trace(
5349         struct vfs_drt_clustermap *cmap,
5350         int code,
5351         int arg1,
5352         int arg2,
5353         int arg3,
5354         int arg4);
5355
5356
5357 /*
5358  * Allocate and initialise a sparse cluster map.
5359  *
5360  * Will allocate a new map, resize or compact an existing map.
5361  *
5362  * XXX we should probably have at least one intermediate map size,
5363  * as the 1:16 ratio seems a bit drastic.
5364  */
5365 static kern_return_t
5366 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
5367 {
5368         struct vfs_drt_clustermap *cmap, *ocmap;
5369         kern_return_t   kret;
5370         u_int64_t       offset;
5371         u_int32_t       i;
5372         int             nsize, active_buckets, index, copycount;
5373
5374         ocmap = NULL;
5375         if (cmapp != NULL)
5376                 ocmap = *cmapp;
5377
5378         /*
5379          * Decide on the size of the new map.
5380          */
5381         if (ocmap == NULL) {
5382                 nsize = DRT_HASH_SMALL_MODULUS;
5383         } else {
5384                 /* count the number of active buckets in the old map */
5385                 active_buckets = 0;
5386                 for (i = 0; i < ocmap->scm_modulus; i++) {
5387                         if (!DRT_HASH_VACANT(ocmap, i) &&
5388                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
5389                                 active_buckets++;
5390                 }
5391                 /*
5392                  * If we're currently using the small allocation, check to
5393                  * see whether we should grow to the large one.
5394                  */
5395                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
5396                         /* if the ring is nearly full */
5397                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
5398                                 nsize = DRT_HASH_LARGE_MODULUS;
5399                         } else {
5400                                 nsize = DRT_HASH_SMALL_MODULUS;
5401                         }
5402                 } else {
5403                         /* already using the large modulus */
5404                         nsize = DRT_HASH_LARGE_MODULUS;
5405                         /*
5406                          * If the ring is completely full, there's
5407                          * nothing useful for us to do.  Behave as
5408                          * though we had compacted into the new
5409                          * array and return.
5410                          */
5411                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
5412                                 return(KERN_SUCCESS);
5413                 }
5414         }
5415
5416         /*
5417          * Allocate and initialise the new map.
5418          */
5419
5420         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
5421             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
5422         if (kret != KERN_SUCCESS)
5423                 return(kret);
5424         cmap->scm_magic = DRT_SCM_MAGIC;
5425         cmap->scm_modulus = nsize;
5426         cmap->scm_buckets = 0;
5427         cmap->scm_lastclean = 0;
5428         cmap->scm_iskips = 0;
5429         for (i = 0; i < cmap->scm_modulus; i++) {
5430                 DRT_HASH_CLEAR(cmap, i);
5431                 DRT_HASH_VACATE(cmap, i);
5432                 DRT_BITVECTOR_CLEAR(cmap, i);
5433         }
5434
5435         /*
5436          * If there's an old map, re-hash entries from it into the new map.
5437          */
5438         copycount = 0;
5439         if (ocmap != NULL) {
5440                 for (i = 0; i < ocmap->scm_modulus; i++) {
5441                         /* skip empty buckets */
5442                         if (DRT_HASH_VACANT(ocmap, i) ||
5443                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
5444                                 continue;
5445                         /* get new index */
5446                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
5447                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
5448                         if (kret != KERN_SUCCESS) {
5449                                 /* XXX need to bail out gracefully here */
5450                                 panic("vfs_drt: new cluster map mysteriously too small");
5451                                 index = 0;
5452                         }
5453                         /* copy */
5454                         DRT_HASH_COPY(ocmap, i, cmap, index);
5455                         copycount++;
5456                 }
5457         }
5458
5459         /* log what we've done */
5460         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
5461
5462         /*
5463          * It's important to ensure that *cmapp always points to
5464          * a valid map, so we must overwrite it before freeing
5465          * the old map.
5466          */
5467         *cmapp = cmap;
5468         if (ocmap != NULL) {
5469                 /* emit stats into trace buffer */
5470                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
5471                               ocmap->scm_modulus,
5472                               ocmap->scm_buckets,
5473                               ocmap->scm_lastclean,
5474                               ocmap->scm_iskips);
5475
5476                 vfs_drt_free_map(ocmap);
5477         }
5478         return(KERN_SUCCESS);
5479 }
5480
5481
5482 /*
5483  * Free a sparse cluster map.
5484  */
5485 static kern_return_t
5486 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
5487 {
5488         kmem_free(kernel_map, (vm_offset_t)cmap,
5489                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
5490         return(KERN_SUCCESS);
5491 }
5492
5493
5494 /*
5495  * Find the hashtable slot currently occupied by an entry for the supplied offset.
5496  */
5497 static kern_return_t
5498 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
5499 {
5500         int             index;
5501         u_int32_t       i;
5502
5503         offset = DRT_ALIGN_ADDRESS(offset);
5504         index = DRT_HASH(cmap, offset);
5505
5506         /* traverse the hashtable */
5507         for (i = 0; i < cmap->scm_modulus; i++) {
5508
5509                 /*
5510                  * If the slot is vacant, we can stop.
5511                  */
5512                 if (DRT_HASH_VACANT(cmap, index))
5513                         break;
5514
5515                 /*
5516                  * If the address matches our offset, we have success.
5517                  */
5518                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
5519                         *indexp = index;
5520                         return(KERN_SUCCESS);
5521                 }
5522
5523                 /*
5524                  * Move to the next slot, try again.
5525                  */
5526                 index = DRT_HASH_NEXT(cmap, index);
5527         }
5528         /*
5529          * It's not there.
5530          */
5531         return(KERN_FAILURE);
5532 }
5533
5534 /*
5535  * Find the hashtable slot for the supplied offset.  If we haven't allocated
5536  * one yet, allocate one and populate the address field.  Note that it will
5537  * not have a nonzero page count and thus will still technically be free, so
5538  * in the case where we are called to clean pages, the slot will remain free.
5539  */
5540 static kern_return_t
5541 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
5542 {
5543         struct vfs_drt_clustermap *cmap;
5544         kern_return_t   kret;
5545         u_int32_t       index;
5546         u_int32_t       i;
5547
5548         cmap = *cmapp;
5549
5550         /* look for an existing entry */
5551         kret = vfs_drt_search_index(cmap, offset, indexp);
5552         if (kret == KERN_SUCCESS)
5553                 return(kret);
5554
5555         /* need to allocate an entry */
5556         offset = DRT_ALIGN_ADDRESS(offset);
5557         index = DRT_HASH(cmap, offset);
5558
5559         /* scan from the index forwards looking for a vacant slot */
5560         for (i = 0; i < cmap->scm_modulus; i++) {
5561                 /* slot vacant? */
5562                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
5563                         cmap->scm_buckets++;
5564                         if (index < cmap->scm_lastclean)
5565                                 cmap->scm_lastclean = index;
5566                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
5567                         DRT_HASH_SET_COUNT(cmap, index, 0);
5568                         DRT_BITVECTOR_CLEAR(cmap, index);
5569                         *indexp = index;
5570                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
5571                         return(KERN_SUCCESS);
5572                 }
5573                 cmap->scm_iskips += i;
5574                 index = DRT_HASH_NEXT(cmap, index);
5575         }
5576
5577         /*
5578          * We haven't found a vacant slot, so the map is full.  If we're not
5579          * already recursed, try reallocating/compacting it.
5580          */
5581         if (recursed)
5582                 return(KERN_FAILURE);
5583         kret = vfs_drt_alloc_map(cmapp);
5584         if (kret == KERN_SUCCESS) {
5585                 /* now try to insert again */
5586                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
5587         }
5588         return(kret);
5589 }
5590
5591 /*
5592  * Implementation of set dirty/clean.
5593  *
5594  * In the 'clean' case, not finding a map is OK.
5595  */
5596 static kern_return_t
5597 vfs_drt_do_mark_pages(
5598         void            **private,
5599         u_int64_t       offset,
5600         u_int           length,
5601         u_int           *setcountp,
5602         int             dirty)
5603 {
5604         struct vfs_drt_clustermap *cmap, **cmapp;
5605         kern_return_t   kret;
5606         int             i, index, pgoff, pgcount, setcount, ecount;
5607
5608         cmapp = (struct vfs_drt_clustermap **)private;
5609         cmap = *cmapp;
5610
5611         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
5612
5613         if (setcountp != NULL)
5614                 *setcountp = 0;
5615
5616         /* allocate a cluster map if we don't already have one */
5617         if (cmap == NULL) {
5618                 /* no cluster map, nothing to clean */
5619                 if (!dirty) {
5620                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
5621                         return(KERN_SUCCESS);
5622                 }
5623                 kret = vfs_drt_alloc_map(cmapp);
5624                 if (kret != KERN_SUCCESS) {
5625                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
5626                         return(kret);
5627                 }
5628         }
5629         setcount = 0;
5630
5631         /*
5632          * Iterate over the length of the region.
5633          */
5634         while (length > 0) {
5635                 /*
5636                  * Get the hashtable index for this offset.
5637                  *
5638                  * XXX this will add blank entries if we are clearing a range
5639                  * that hasn't been dirtied.
5640                  */
5641                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
5642                 cmap = *cmapp;  /* may have changed! */
5643                 /* this may be a partial-success return */
5644                 if (kret != KERN_SUCCESS) {
5645                         if (setcountp != NULL)
5646                                 *setcountp = setcount;
5647                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
5648
5649                         return(kret);
5650                 }
5651
5652                 /*
5653                  * Work out how many pages we're modifying in this
5654                  * hashtable entry.
5655                  */
5656                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
5657                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
5658
5659                 /*
5660                  * Iterate over pages, dirty/clearing as we go.
5661                  */
5662                 ecount = DRT_HASH_GET_COUNT(cmap, index);
5663                 for (i = 0; i < pgcount; i++) {
5664                         if (dirty) {
5665                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5666                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
5667                                         ecount++;
5668                                         setcount++;
5669                                 }
5670                         } else {
5671                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5672                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5673                                         ecount--;
5674                                         setcount++;
5675                                 }
5676                         }
5677                 }
5678                 DRT_HASH_SET_COUNT(cmap, index, ecount);
5679
5680                 offset += pgcount * PAGE_SIZE;
5681                 length -= pgcount * PAGE_SIZE;
5682         }
5683         if (setcountp != NULL)
5684                 *setcountp = setcount;
5685
5686         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5687
5688         return(KERN_SUCCESS);
5689 }
5690
5691 /*
5692  * Mark a set of pages as dirty/clean.
5693  *
5694  * This is a public interface.
5695  *
5696  * cmapp
5697  *      Pointer to storage suitable for holding a pointer.  Note that
5698  *      this must either be NULL or a value set by this function.
5699  *
5700  * size
5701  *      Current file size in bytes.
5702  *
5703  * offset
5704  *      Offset of the first page to be marked as dirty, in bytes.  Must be
5705  *      page-aligned.
5706  *
5707  * length
5708  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
5709  *
5710  * setcountp
5711  *      Number of pages newly marked dirty by this call (optional).
5712  *
5713  * Returns KERN_SUCCESS if all the pages were successfully marked.
5714  */
5715 static kern_return_t
5716 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
5717 {
5718         /* XXX size unused, drop from interface */
5719         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5720 }
5721
#if 0
/*
 * Mark a set of pages as clean.  Currently compiled out.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
        kern_return_t   kret;

        /* no set count wanted; final argument 0 selects 'clean' */
        kret = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);

        return(kret);
}
#endif
5729
5730 /*
5731  * Get a cluster of dirty pages.
5732  *
5733  * This is a public interface.
5734  *
5735  * cmapp
5736  *      Pointer to storage managed by drt_mark_pages.  Note that this must
5737  *      be NULL or a value set by drt_mark_pages.
5738  *
5739  * offsetp
5740  *      Returns the byte offset into the file of the first page in the cluster.
5741  *
5742  * lengthp
5743  *      Returns the length in bytes of the cluster of dirty pages.
5744  *
5745  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
5746  * are no dirty pages meeting the minmum size criteria.  Private storage will
5747  * be released if there are no more dirty pages left in the map
5748  *
5749  */
5750 static kern_return_t
5751 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5752 {
5753         struct vfs_drt_clustermap *cmap;
5754         u_int64_t       offset;
5755         u_int           length;
5756         u_int32_t       j;
5757         int             index, i, fs, ls;
5758
5759         /* sanity */
5760         if ((cmapp == NULL) || (*cmapp == NULL))
5761                 return(KERN_FAILURE);
5762         cmap = *cmapp;
5763
5764         /* walk the hashtable */
5765         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5766                 index = DRT_HASH(cmap, offset);
5767
5768                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5769                         continue;
5770
5771                 /* scan the bitfield for a string of bits */
5772                 fs = -1;
5773
5774                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5775                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5776                                 fs = i;
5777                                 break;
5778                         }
5779                 }
5780                 if (fs == -1) {
5781                         /*  didn't find any bits set */
5782                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
5783                 }
5784                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5785                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
5786                                 break;
5787                 }
5788
5789                 /* compute offset and length, mark pages clean */
5790                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5791                 length = ls * PAGE_SIZE;
5792                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5793                 cmap->scm_lastclean = index;
5794
5795                 /* return successful */
5796                 *offsetp = (off_t)offset;
5797                 *lengthp = length;
5798
5799                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5800                 return(KERN_SUCCESS);
5801         }
5802         /*
5803          * We didn't find anything... hashtable is empty
5804          * emit stats into trace buffer and
5805          * then free it
5806          */
5807         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5808                       cmap->scm_modulus,
5809                       cmap->scm_buckets,
5810                       cmap->scm_lastclean,
5811                       cmap->scm_iskips);
5812
5813         vfs_drt_free_map(cmap);
5814         *cmapp = NULL;
5815
5816         return(KERN_FAILURE);
5817 }
5818
5819
5820 static kern_return_t
5821 vfs_drt_control(void **cmapp, int op_type)
5822 {
5823         struct vfs_drt_clustermap *cmap;
5824
5825         /* sanity */
5826         if ((cmapp == NULL) || (*cmapp == NULL))
5827                 return(KERN_FAILURE);
5828         cmap = *cmapp;
5829
5830         switch (op_type) {
5831         case 0:
5832                 /* emit stats into trace buffer */
5833                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5834                               cmap->scm_modulus,
5835                               cmap->scm_buckets,
5836                               cmap->scm_lastclean,
5837                               cmap->scm_iskips);
5838
5839                 vfs_drt_free_map(cmap);
5840                 *cmapp = NULL;
5841                 break;
5842
5843         case 1:
5844                 cmap->scm_lastclean = 0;
5845                 break;
5846         }
5847         return(KERN_SUCCESS);
5848 }
5849
5850
5851
5852 /*
5853  * Emit a summary of the state of the clustermap into the trace buffer
5854  * along with some caller-provided data.
5855  */
5856 #if KDEBUG
5857 static void
5858 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5859 {
5860         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5861 }
5862 #else
5863 static void
5864 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5865                           __unused int arg1, __unused int arg2, __unused int arg3,
5866                           __unused int arg4)
5867 {
5868 }
5869 #endif
5870
#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.  Panics on the first
 * non-vacant entry where the two disagree.  Currently compiled out.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
        int slot, bit;
        int counted;

        for (slot = 0; slot < cmap->scm_modulus; slot++) {
                if (!DRT_HASH_VACANT(cmap, slot)) {
                        /* recount the dirty bits of this entry */
                        counted = 0;
                        for (bit = 0; bit < DRT_BITVECTOR_PAGES; bit++) {
                                if (DRT_HASH_TEST_BIT(cmap, slot, bit))
                                        counted++;
                        }
                        if (counted != DRT_HASH_GET_COUNT(cmap, slot))
                                panic("bits_on = %d,  index = %d\n", counted, slot);
                }
        }
}
#endif