bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/buf_internal.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/vnode_internal.h>
  69 #include <sys/trace.h>
  70 #include <sys/malloc.h>
  71 #include <sys/time.h>
  72 #include <sys/kernel.h>
  73 #include <sys/resourcevar.h>
  74 #include <sys/uio_internal.h>
  75 #include <libkern/libkern.h>
  76 #include <machine/machine_routines.h>
  77
  78 #include <sys/ubc_internal.h>
  79 #include <vm/vnode_pager.h>
  80
  81 #include <mach/mach_types.h>
  82 #include <mach/memory_object_types.h>
  83 #include <mach/vm_map.h>
  84 #include <mach/upl.h>
  85
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_map.h>
  88 #include <vm/vm_pageout.h>
  89
  90 #include <sys/kdebug.h>
  91
  92 #define CL_READ         0x01
  93 #define CL_WRITE        0x02
  94 #define CL_ASYNC        0x04
  95 #define CL_COMMIT       0x08
  96 #define CL_PAGEOUT      0x10
  97 #define CL_AGE          0x20
  98 #define CL_NOZERO       0x40
  99 #define CL_PAGEIN       0x80
 100 #define CL_DEV_MEMORY   0x100
 101 #define CL_PRESERVE     0x200
 102 #define CL_THROTTLE     0x400
 103 #define CL_KEEPCACHED   0x800
 104 #define CL_DIRECT_IO    0x1000
 105 #define CL_PASSIVE      0x2000
 106
 107
 108 struct clios {
 109         u_int  io_completed;       /* amount of io that has currently completed */
 110         u_int  io_issued;          /* amount of io that was successfully issued */
 111         int    io_error;           /* error code of first error encountered */
 112         int    io_wanted;          /* someone is sleeping waiting for a change in state */
 113 };
 114
 115 static lck_grp_t        *cl_mtx_grp;
 116 static lck_attr_t       *cl_mtx_attr;
 117 static lck_grp_attr_t   *cl_mtx_grp_attr;
 118 static lck_mtx_t        *cl_mtxp;
 119
 120
 121 #define IO_UNKNOWN      0
 122 #define IO_DIRECT       1
 123 #define IO_CONTIG       2
 124 #define IO_COPY         3
 125
 126 #define PUSH_DELAY      0x01
 127 #define PUSH_ALL        0x02
 128 #define PUSH_SYNC       0x04
 129
  130
      /*
       * Forward declarations for the file-local helpers.
       * Wherever a parameter is spelled 'int (*)(buf_t, void *)' it is a
       * completion callback with the same signature as cluster_iodone(),
       * and the 'void *callback_arg' next to it is the argument forwarded
       * to that callback.
       */

      /* transaction (buf chain) bookkeeping */
  131 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
  132 static void cluster_wait_IO(buf_t cbp_head, int async);
  133 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
  134
  135 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
  136
      /* I/O issue, completion, error and throttling paths */
  137 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
  138                       int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
  139 static int cluster_iodone(buf_t bp, void *callback_arg);
  140 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
  141 static int cluster_hard_throttle_on(vnode_t vp);
  142
  143 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);
  144
  145 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags);
  146 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
  147
      /* read variants (copy / direct / contig) */
  148 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size,  off_t filesize, int flags,
  149                              int (*)(buf_t, void *), void *callback_arg);
  150 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
  151                                int flags, int (*)(buf_t, void *), void *callback_arg);
  152 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
  153                                int (*)(buf_t, void *), void *callback_arg, int flags);
  154
      /* write variants (copy / direct / contig) */
  155 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
  156                               off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
  157 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
  158                                 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
  159 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
  160                                 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
  161
  162 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
  163
      /* read-ahead (operate on a struct cl_readahead, see cluster_get_rap) */
  164 static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
  165 static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
  166
      /* write-behind pushes (operate on a struct cl_writebehind, see cluster_get_wbp) */
  167 static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);
  168
  169 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);
  170
  171 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
  172 static void     sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg);
  173 static void     sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
  174
      /* vfs_drt_* helpers; each takes the address of an opaque 'cmap' pointer */
  175 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
  176 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
  177 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
  178
      /* not static: this one has external linkage */
  179 int     is_file_clean(vnode_t, off_t);
 180
 181 /*
 182  * limit the internal I/O size so that we
 183  * can represent it in a 32 bit int
 184  */
 185 #define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 256)
 186 #define MAX_IO_CONTIG_SIZE      (MAX_UPL_SIZE * PAGE_SIZE)
 187 #define MAX_VECTS               16
 188 #define MIN_DIRECT_WRITE_SIZE   (4 * PAGE_SIZE)
 189
 190
 191 #define MAX_CLUSTER_SIZE(vp)   (cluster_max_io_size(vp->v_mount, CL_WRITE))
 192 #define MAX_PREFETCH(vp)       (cluster_max_io_size(vp->v_mount, CL_READ) * 3);
 193
 194
 195 int speculative_reads_disabled = 0;
 196
 197 /*
 198  * throttle the number of async writes that
 199  * can be outstanding on a single vnode
 200  * before we issue a synchronous write
 201  */
 202 #define HARD_THROTTLE_MAXCNT    0
 203 #define HARD_THROTTLE_MAXSIZE   (64 * 1024)
 204
 205 int hard_throttle_on_root = 0;
 206 struct timeval priority_IO_timestamp_for_root;
 207
 208
 209 void
 210 cluster_init(void) {
 211         /*
 212          * allocate lock group attribute and group
 213          */
 214         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 215         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 216
 217         /*
 218          * allocate the lock attribute
 219          */
 220         cl_mtx_attr = lck_attr_alloc_init();
 221
 222         /*
 223          * allocate and initialize mutex's used to protect updates and waits
 224          * on the cluster_io context
 225          */
 226         cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 227
 228         if (cl_mtxp == NULL)
 229                 panic("cluster_init: failed to allocate cl_mtxp");
 230 }
 231
 232
 233 uint32_t
 234 cluster_max_io_size(mount_t mp, int type)
 235 {
 236        uint32_t        max_io_size;
 237        uint32_t        segcnt;
 238        uint32_t        maxcnt;
 239
 240        switch(type) {
 241
 242        case CL_READ:
 243                segcnt = mp->mnt_segreadcnt;
 244                maxcnt = mp->mnt_maxreadcnt;
 245                break;
 246        case CL_WRITE:
 247                segcnt = mp->mnt_segwritecnt;
 248                maxcnt = mp->mnt_maxwritecnt;
 249                break;
 250        default:
 251                segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
 252                maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
 253                break;
 254        }
 255        if (segcnt > MAX_UPL_SIZE) {
 256                /*
 257                 * don't allow a size beyond the max UPL size we can create
 258                 */
 259                segcnt = MAX_UPL_SIZE;
 260        }
 261        max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
 262
 263        if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
 264                /*
 265                 * don't allow a size smaller than the old fixed limit
 266                 */
 267                max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
 268        } else {
 269                /*
 270                 * make sure the size specified is a multiple of PAGE_SIZE
 271                 */
 272                max_io_size &= ~PAGE_MASK;
 273        }
 274        return (max_io_size);
 275 }
 276
 277
 278
 279
 280 #define CLW_ALLOCATE            0x01
 281 #define CLW_RETURNLOCKED        0x02
 282 #define CLW_IONOCACHE           0x04
 283 #define CLW_IOPASSIVE   0x08
 284
 285 /*
 286  * if the read ahead context doesn't yet exist,
 287  * allocate and initialize it...
 288  * the vnode lock serializes multiple callers
 289  * during the actual assignment... first one
 290  * to grab the lock wins... the other callers
 291  * will release the now unnecessary storage
 292  *
 293  * once the context is present, try to grab (but don't block on)
 294  * the lock associated with it... if someone
 295  * else currently owns it, than the read
 296  * will run without read-ahead.  this allows
 297  * multiple readers to run in parallel and
 298  * since there's only 1 read ahead context,
 299  * there's no real loss in only allowing 1
 300  * reader to have read-ahead enabled.
 301  */
 302 static struct cl_readahead *
 303 cluster_get_rap(vnode_t vp)
 304 {
 305         struct ubc_info         *ubc;
 306         struct cl_readahead     *rap;
 307
 308         ubc = vp->v_ubcinfo;
 309
 310         if ((rap = ubc->cl_rahead) == NULL) {
 311                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 312
 313                 bzero(rap, sizeof *rap);
 314                 rap->cl_lastr = -1;
 315                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 316
 317                 vnode_lock(vp);
 318
 319                 if (ubc->cl_rahead == NULL)
 320                         ubc->cl_rahead = rap;
 321                 else {
 322                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 323                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 324                         rap = ubc->cl_rahead;
 325                 }
 326                 vnode_unlock(vp);
 327         }
 328         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 329                 return(rap);
 330
 331         return ((struct cl_readahead *)NULL);
 332 }
 333
 334
 335 /*
 336  * if the write behind context doesn't yet exist,
 337  * and CLW_ALLOCATE is specified, allocate and initialize it...
 338  * the vnode lock serializes multiple callers
 339  * during the actual assignment... first one
 340  * to grab the lock wins... the other callers
 341  * will release the now unnecessary storage
 342  *
 343  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 344  * the lock associated with the write behind context before
 345  * returning
 346  */
 347
 348 static struct cl_writebehind *
 349 cluster_get_wbp(vnode_t vp, int flags)
 350 {
 351         struct ubc_info *ubc;
 352         struct cl_writebehind *wbp;
 353
 354         ubc = vp->v_ubcinfo;
 355
 356         if ((wbp = ubc->cl_wbehind) == NULL) {
 357
 358                 if ( !(flags & CLW_ALLOCATE))
 359                         return ((struct cl_writebehind *)NULL);
 360
 361                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 362
 363                 bzero(wbp, sizeof *wbp);
 364                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 365
 366                 vnode_lock(vp);
 367
 368                 if (ubc->cl_wbehind == NULL)
 369                         ubc->cl_wbehind = wbp;
 370                 else {
 371                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 372                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 373                         wbp = ubc->cl_wbehind;
 374                 }
 375                 vnode_unlock(vp);
 376         }
 377         if (flags & CLW_RETURNLOCKED)
 378                 lck_mtx_lock(&wbp->cl_lockw);
 379
 380         return (wbp);
 381 }
 382
 383
 384 static void
 385 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
 386 {
 387         struct cl_writebehind *wbp;
 388
 389         if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
 390
 391                 if (wbp->cl_number) {
 392                         lck_mtx_lock(&wbp->cl_lockw);
 393
 394                         cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg);
 395
 396                         lck_mtx_unlock(&wbp->cl_lockw);
 397                 }
 398         }
 399 }
 400
 401
 402 static int
 403 cluster_hard_throttle_on(vnode_t vp)
 404 {
 405         static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
 406
 407         if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
 408                 struct timeval elapsed;
 409
 410                 if (hard_throttle_on_root)
 411                         return(1);
 412
 413                 microuptime(&elapsed);
 414                 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
 415
 416                 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
 417                         return(1);
 418         }
 419         struct uthread  *ut;
 420         if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) {
 421                 size_t devbsdunit;
 422                 if (vp->v_mount != NULL)
 423                         devbsdunit = vp->v_mount->mnt_devbsdunit;
 424                 else
 425                         devbsdunit = LOWPRI_MAX_NUM_DEV - 1;
 426                 if (throttle_io_will_be_throttled(-1, devbsdunit)) {
 427                         return(1);
 428                 }
 429         }
 430         return(0);
 431 }
 432
 433
 434 static int
 435 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags)
 436 {
 437         int upl_abort_code = 0;
 438         int page_in  = 0;
 439         int page_out = 0;
 440
 441         if (io_flags & B_PHYS)
 442                 /*
 443                  * direct write of any flavor, or a direct read that wasn't aligned
 444                  */
 445                 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
 446         else {
 447                 if (io_flags & B_PAGEIO) {
 448                         if (io_flags & B_READ)
 449                                 page_in  = 1;
 450                         else
 451                                 page_out = 1;
 452                 }
 453                 if (io_flags & B_CACHE)
 454                         /*
 455                          * leave pages in the cache unchanged on error
 456                          */
 457                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 458                 else if (page_out && (error != ENXIO))
 459                         /*
 460                          * transient error... leave pages unchanged
 461                          */
 462                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 463                 else if (page_in)
 464                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 465                 else
 466                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 467
 468                 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
 469         }
 470         return (upl_abort_code);
 471 }
 472
  473
      /*
       * Completion handler for one buf of a cluster I/O transaction.
       *
       * A transaction is the chain of bufs that starts at bp->b_trans_head and
       * is linked through b_trans_next.  Nothing is done (and 0 is returned)
       * until every buf in the chain has B_DONE set and one of them carries
       * B_EOT.  Once that is true this routine:
       *   - totals b_bcount and b_resid over the chain, remembers the first
       *     b_error it sees, and frees every buf except the head
       *   - turns a non-zero residual with no reported error into EIO
       *   - on success, calls the head's b_cliodone callback (if any) with
       *     callback_arg; the callback's return value becomes the error
       *   - zero-fills from b_validend to the end of that page, if b_validend
       *     is non-zero
       *   - frees the head buf
       *   - if the transaction has a struct clios attached, records the error
       *     and completed byte count under cl_mtxp and wakes any waiter
       *   - if B_COMMIT_UPL is set, commits the page-rounded UPL range on
       *     success or hands it to cluster_ioerror() on failure
       *   - if B_NEED_IODONE is set and there is a real_bp, propagates the
       *     error and residual to it and calls buf_biodone() on it
       *
       * Returns 0 while the transaction is still incomplete, otherwise the
       * final error (0 on success).
       */
  474 static int
  475 cluster_iodone(buf_t bp, void *callback_arg)
  476 {
  477         int     b_flags;
  478         int     error;
  479         int     total_size;
  480         int     total_resid;
  481         int     upl_offset;
  482         int     zero_offset;
  483         int     pg_offset = 0;
  484         int     commit_size = 0;
  485         int     upl_flags = 0;
  486         int     transaction_size = 0;
  487         upl_t   upl;
  488         buf_t   cbp;
  489         buf_t   cbp_head;
  490         buf_t   cbp_next;
  491         buf_t   real_bp;
  492         struct  clios *iostate;
  493         boolean_t       transaction_complete = FALSE;
  494
  495         cbp_head = (buf_t)(bp->b_trans_head);
  496
  497         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
  498                      (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
  499
  500         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
  501                 /*
  502                  * all I/O requests that are part of this transaction
  503                  * have to complete before we can process it
  504                  */
  505                 if ( !(cbp->b_flags & B_DONE)) {
  506
  507                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
  508                                      (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
  509
  510                         return 0;
  511                 }
  512                 if (cbp->b_flags & B_EOT)
  513                         transaction_complete = TRUE;
  514         }
              /*
               * every buf seen is done, but none of them is marked B_EOT
               * (see cluster_EOT)... treat the transaction as still open
               */
  515         if (transaction_complete == FALSE) {
  516                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
  517                              (int)cbp_head, 0, 0, 0, 0);
  518
  519                 return 0;
  520         }
  521         error       = 0;
  522         total_size  = 0;
  523         total_resid = 0;
  524
              /*
               * snapshot the per-transaction state from the head buf
               * before any of the bufs are freed
               */
  525         cbp        = cbp_head;
  526         upl_offset = cbp->b_uploffset;
  527         upl        = cbp->b_upl;
  528         b_flags    = cbp->b_flags;
  529         real_bp    = cbp->b_real_bp;
  530         zero_offset= cbp->b_validend;
  531         iostate    = (struct clios *)cbp->b_iostate;
  532
  533         if (real_bp)
  534                 real_bp->b_dev = cbp->b_dev;
  535
  536         while (cbp) {
                      /* only the first error in the chain is kept */
  537                 if ((cbp->b_flags & B_ERROR) && error == 0)
  538                         error = cbp->b_error;
  539
  540                 total_resid += cbp->b_resid;
  541                 total_size  += cbp->b_bcount;
  542
  543                 cbp_next = cbp->b_trans_next;
  544
  545                 if (cbp_next == NULL)
  546                         /*
  547                          * compute the overall size of the transaction
  548                          * in case we created one that has 'holes' in it
  549                          * 'total_size' represents the amount of I/O we
  550                          * did, not the span of the transaction w/r to the UPL
  551                          */
  552                         transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
  553
                      /* the head is still needed below; it is freed after the callback */
  554                 if (cbp != cbp_head)
  555                         free_io_buf(cbp);
  556
  557                 cbp = cbp_next;
  558         }
              /* a short transfer with no reported error is reported as EIO */
  559         if (error == 0 && total_resid)
  560                 error = EIO;
  561
  562         if (error == 0) {
  563                 int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
  564
  565                 if (cliodone_func != NULL) {
                              /* the callback sees the head buf with b_bcount set to the whole span */
  566                         cbp_head->b_bcount = transaction_size;
  567
  568                         error = (*cliodone_func)(cbp_head, callback_arg);
  569                 }
  570         }
              /* zero from zero_offset through the end of the page it falls in */
  571         if (zero_offset)
  572                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
  573
  574         free_io_buf(cbp_head);
  575
  576         if (iostate) {
  577                 int need_wakeup = 0;
  578
  579                 /*
  580                  * someone has issued multiple I/Os asynchronously
  581                  * and is waiting for them to complete (streaming)
  582                  */
  583                 lck_mtx_lock_spin(cl_mtxp);
  584
  585                 if (error && iostate->io_error == 0)
  586                         iostate->io_error = error;
  587
  588                 iostate->io_completed += total_size;
  589
  590                 if (iostate->io_wanted) {
  591                         /*
  592                          * someone is waiting for the state of
  593                          * this io stream to change
  594                          */
  595                         iostate->io_wanted = 0;
  596                         need_wakeup = 1;
  597                 }
  598                 lck_mtx_unlock(cl_mtxp);
  599
                      /* the wakeup is issued after cl_mtxp has been dropped */
  600                 if (need_wakeup)
  601                         wakeup((caddr_t)&iostate->io_wanted);
  602         }
  603
  604         if (b_flags & B_COMMIT_UPL) {
  605
                      /*
                       * widen the range to whole pages: back up to the start of
                       * the first page and round the length up to a page multiple
                       */
  606                 pg_offset   = upl_offset & PAGE_MASK;
  607                 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
  608
  609                 if (error)
  610                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags);
  611                 else {
  612                         upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
  613
  614                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
  615                                 upl_flags |= UPL_COMMIT_SET_DIRTY;
  616
  617                         if (b_flags & B_AGE)
  618                                 upl_flags |= UPL_COMMIT_INACTIVATE;
  619
  620                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
  621                 }
  622         }
              /* finish the caller supplied buf on behalf of this transaction */
  623         if ((b_flags & B_NEED_IODONE) && real_bp) {
  624                 if (error) {
  625                         real_bp->b_flags |= B_ERROR;
  626                         real_bp->b_error = error;
  627                 }
  628                 real_bp->b_resid = total_resid;
  629
  630                 buf_biodone(real_bp);
  631         }
  632         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
  633                      (int)upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
  634
  635         return (error);
  636 }
 637
 638
 639 void
 640 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
 641 {
 642
 643         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 644                      upl_offset, size, (int)bp, 0, 0);
 645
 646         if (bp == NULL || bp->b_datap == 0) {
 647                 upl_page_info_t *pl;
 648                 addr64_t        zero_addr;
 649
 650                 pl = ubc_upl_pageinfo(upl);
 651
 652                 if (upl_device_page(pl) == TRUE) {
 653                         zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;
 654
 655                         bzero_phys_nc(zero_addr, size);
 656                 } else {
 657                         while (size) {
 658                                 int     page_offset;
 659                                 int     page_index;
 660                                 int     zero_cnt;
 661
 662                                 page_index  = upl_offset / PAGE_SIZE;
 663                                 page_offset = upl_offset & PAGE_MASK;
 664
 665                                 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
 666                                 zero_cnt  = min(PAGE_SIZE - page_offset, size);
 667
 668                                 bzero_phys(zero_addr, zero_cnt);
 669
 670                                 size       -= zero_cnt;
 671                                 upl_offset += zero_cnt;
 672                         }
 673                 }
 674         } else
 675                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 676
 677         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 678                      upl_offset, size, 0, 0, 0);
 679 }
 680
 681
 682 static void
 683 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
 684 {
 685         cbp_head->b_validend = zero_offset;
 686         cbp_tail->b_flags |= B_EOT;
 687 }
 688
 689 static void
 690 cluster_wait_IO(buf_t cbp_head, int async)
 691 {
 692         buf_t   cbp;
 693
 694         if (async) {
 695                 /*
 696                  * async callback completion will not normally
 697                  * generate a wakeup upon I/O completion...
 698                  * by setting BL_WANTED, we will force a wakeup
 699                  * to occur as any outstanding I/Os complete...
 700                  * I/Os already completed will have BL_CALLDONE already
 701                  * set and we won't block in buf_biowait_callback..
 702                  * note that we're actually waiting for the bp to have
 703                  * completed the callback function... only then
 704                  * can we safely take back ownership of the bp
 705                  * need the main buf mutex in order to safely
 706                  * update b_lflags
 707                  */
 708                 buf_list_lock();
 709
 710                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 711                       cbp->b_lflags |= BL_WANTED;
 712
 713                 buf_list_unlock();
 714         }
 715         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 716                 if (async)
 717                         buf_biowait_callback(cbp);
 718                 else
 719                         buf_biowait(cbp);
 720         }
 721 }
 722
 723 static void
 724 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
 725 {
 726         buf_t   cbp;
 727         int     error;
 728
 729         /*
 730          * cluster_complete_transaction will
 731          * only be called if we've issued a complete chain in synchronous mode
 732          * or, we've already done a cluster_wait_IO on an incomplete chain
 733          */
 734         if (needwait) {
 735                 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
 736                         buf_biowait(cbp);
 737         }
 738         error = cluster_iodone(*cbp_head, callback_arg);
 739
 740         if ( !(flags & CL_ASYNC) && error && *retval == 0) {
 741                 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
 742                         *retval = error;
 743         }
 744         *cbp_head = (buf_t)NULL;
 745 }
 746
 747
 748 static int
 749 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 750            int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
 751 {
 752         buf_t   cbp;
 753         u_int   size;
 754         u_int   io_size;
 755         int     io_flags;
 756         int     bmap_flags;
 757         int     error = 0;
 758         int     retval = 0;
 759         buf_t   cbp_head = NULL;
 760         buf_t   cbp_tail = NULL;
 761         int     trans_count = 0;
 762         int     max_trans_count;
 763         u_int   pg_count;
 764         int     pg_offset;
 765         u_int   max_iosize;
 766         u_int   max_vectors;
 767         int     priv;
 768         int     zero_offset = 0;
 769         int     async_throttle = 0;
 770         mount_t mp;
 771         vm_offset_t upl_end_offset;
 772         boolean_t   need_EOT = FALSE;
 773
 774         /*
 775          * we currently don't support buffers larger than a page
 776          */
 777         if (real_bp && non_rounded_size > PAGE_SIZE)
 778                 panic("%s(): Called with real buffer of size %d bytes which "
 779                                 "is greater than the maximum allowed size of "
 780                                 "%d bytes (the system PAGE_SIZE).\n",
 781                                 __FUNCTION__, non_rounded_size, PAGE_SIZE);
 782
 783         mp = vp->v_mount;
 784
 785         /*
 786          * we don't want to do any funny rounding of the size for IO requests
 787          * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
 788          * belong to us... we can't extend (nor do we need to) the I/O to fill
 789          * out a page
 790          */
 791         if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
 792                 /*
 793                  * round the requested size up so that this I/O ends on a
 794                  * page boundary in case this is a 'write'... if the filesystem
 795                  * has blocks allocated to back the page beyond the EOF, we want to
 796                  * make sure to write out the zero's that are sitting beyond the EOF
 797                  * so that in case the filesystem doesn't explicitly zero this area
 798                  * if a hole is created via a lseek/write beyond the current EOF,
 799                  * it will return zeros when it's read back from the disk.  If the
 800                  * physical allocation doesn't extend for the whole page, we'll
 801                  * only write/read from the disk up to the end of this allocation
 802                  * via the extent info returned from the VNOP_BLOCKMAP call.
 803                  */
 804                 pg_offset = upl_offset & PAGE_MASK;
 805
 806                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
 807         } else {
 808                 /*
 809                  * anyone advertising a blocksize of 1 byte probably
 810                  * can't deal with us rounding up the request size
 811                  * AFP is one such filesystem/device
 812                  */
 813                 size = non_rounded_size;
 814         }
 815         upl_end_offset = upl_offset + size;
 816
 817         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
 818
 819         /*
 820          * Set the maximum transaction size to the maximum desired number of
 821          * buffers.
 822          */
 823         max_trans_count = 8;
 824         if (flags & CL_DEV_MEMORY)
 825                 max_trans_count = 16;
 826
 827         if (flags & CL_READ) {
 828                 io_flags = B_READ;
 829                 bmap_flags = VNODE_READ;
 830
 831                 max_iosize  = mp->mnt_maxreadcnt;
 832                 max_vectors = mp->mnt_segreadcnt;
 833         } else {
 834                 io_flags = B_WRITE;
 835                 bmap_flags = VNODE_WRITE;
 836
 837                 max_iosize  = mp->mnt_maxwritecnt;
 838                 max_vectors = mp->mnt_segwritecnt;
 839         }
 840         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
 841
 842         /*
 843          * make sure the maximum iosize is a
 844          * multiple of the page size
 845          */
 846         max_iosize  &= ~PAGE_MASK;
 847
 848         /*
 849          * Ensure the maximum iosize is sensible.
 850          */
 851         if (!max_iosize)
 852                 max_iosize = PAGE_SIZE;
 853
 854         if (flags & CL_THROTTLE) {
 855                 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
 856                         if (max_iosize > HARD_THROTTLE_MAXSIZE)
 857                                 max_iosize = HARD_THROTTLE_MAXSIZE;
 858                         async_throttle = HARD_THROTTLE_MAXCNT;
 859                 } else {
 860                         if ( (flags & CL_DEV_MEMORY) )
 861                                 async_throttle = VNODE_ASYNC_THROTTLE;
 862                         else {
 863                                 u_int max_cluster;
 864                                 u_int max_cluster_size;
 865                                 u_int max_prefetch;
 866
 867                                 max_cluster_size = MAX_CLUSTER_SIZE(vp);
 868                                 max_prefetch = MAX_PREFETCH(vp);
 869
 870                                 if (max_iosize > max_cluster_size)
 871                                         max_cluster = max_cluster_size;
 872                                 else
 873                                         max_cluster = max_iosize;
 874
 875                                 if (size < max_cluster)
 876                                         max_cluster = size;
 877
 878                                 async_throttle = min(VNODE_ASYNC_THROTTLE, (max_prefetch / max_cluster) - 1);
 879                         }
 880                 }
 881         }
 882         if (flags & CL_AGE)
 883                 io_flags |= B_AGE;
 884         if (flags & (CL_PAGEIN | CL_PAGEOUT))
 885                 io_flags |= B_PAGEIO;
 886         if (flags & CL_COMMIT)
 887                 io_flags |= B_COMMIT_UPL;
 888         if (flags & CL_PRESERVE)
 889                 io_flags |= B_PHYS;
 890         if (flags & CL_KEEPCACHED)
 891                 io_flags |= B_CACHE;
 892         if (flags & CL_PASSIVE)
 893                 io_flags |= B_PASSIVE;
 894         if (vp->v_flag & VSYSTEM)
 895                 io_flags |= B_META;
 896
 897         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
 898                 /*
 899                  * then we are going to end up
 900                  * with a page that we can't complete (the file size wasn't a multiple
 901                  * of PAGE_SIZE and we're trying to read to the end of the file
 902                  * so we'll go ahead and zero out the portion of the page we can't
 903                  * read in from the file
 904                  */
 905                 zero_offset = upl_offset + non_rounded_size;
 906         }
 907         while (size) {
 908                 daddr64_t blkno;
 909                 daddr64_t lblkno;
 910                 u_int   io_size_wanted;
 911
 912                 if (size > max_iosize)
 913                         io_size = max_iosize;
 914                 else
 915                         io_size = size;
 916
 917                 io_size_wanted = io_size;
 918
 919                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL)))
 920                         break;
 921
 922                 if (io_size > io_size_wanted)
 923                         io_size = io_size_wanted;
 924
 925                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
 926                         real_bp->b_blkno = blkno;
 927
 928                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
 929                              (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
 930
 931                 if (io_size == 0) {
 932                         /*
 933                          * vnop_blockmap didn't return an error... however, it did
 934                          * return an extent size of 0 which means we can't
 935                          * make forward progress on this I/O... a hole in the
 936                          * file would be returned as a blkno of -1 with a non-zero io_size
 937                          * a real extent is returned with a blkno != -1 and a non-zero io_size
 938                          */
 939                         error = EINVAL;
 940                         break;
 941                 }
 942                 if ( !(flags & CL_READ) && blkno == -1) {
 943                         off_t   e_offset;
 944                         int     pageout_flags;
 945
 946                         /*
 947                          * we're writing into a 'hole'
 948                          */
 949                         if (flags & CL_PAGEOUT) {
 950                                 /*
 951                                  * if we got here via cluster_pageout
 952                                  * then just error the request and return
 953                                  * the 'hole' should already have been covered
 954                                  */
 955                                 error = EINVAL;
 956                                 break;
 957                         }
 958                         /*
 959                          * we can get here if the cluster code happens to
 960                          * pick up a page that was dirtied via mmap vs
 961                          * a 'write' and the page targets a 'hole'...
 962                          * i.e. the writes to the cluster were sparse
 963                          * and the file was being written for the first time
 964                          *
 965                          * we can also get here if the filesystem supports
 966                          * 'holes' that are less than PAGE_SIZE.... because
 967                          * we can't know if the range in the page that covers
 968                          * the 'hole' has been dirtied via an mmap or not,
 969                          * we have to assume the worst and try to push the
 970                          * entire page to storage.
 971                          *
 972                          * Try paging out the page individually before
 973                          * giving up entirely and dumping it (the pageout
 974                          * path will ensure that the zero extent accounting
 975                          * has been taken care of before we get back into cluster_io)
 976                          *
 977                          * go direct to vnode_pageout so that we don't have to
 978                          * unbusy the page from the UPL... we used to do this
 979                          * so that we could call ubc_sync_range, but that results
 980                          * in a potential deadlock if someone else races us to acquire
 981                          * that page and wins and in addition needs one of the pages
 982                          * we're continuing to hold in the UPL
 983                          */
 984                         pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
 985
 986                         if ( !(flags & CL_ASYNC))
 987                                 pageout_flags |= UPL_IOSYNC;
 988                         if ( !(flags & CL_COMMIT))
 989                                 pageout_flags |= UPL_NOCOMMIT;
 990
 991                         if (cbp_head) {
 992                                 buf_t last_cbp;
 993
 994                                 /*
 995                                  * first we have to wait for the current outstanding I/Os
 996                                  * to complete... EOT hasn't been set yet on this transaction
 997                                  * so the pages won't be released just because all of the current
 998                                  * I/O linked to this transaction has completed...
 999                                  */
1000                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1001
1002                                 /*
1003                                  * we've got a transaction that
1004                                  * includes the page we're about to push out through vnode_pageout...
1005                                  * find the last bp in the list which will be the one that
1006                                  * includes the head of this page and round its iosize down
1007                                  * to a page boundary...
1008                                  */
1009                                 for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
1010                                         last_cbp = cbp;
1011
1012                                 cbp->b_bcount &= ~PAGE_MASK;
1013
1014                                 if (cbp->b_bcount == 0) {
1015                                         /*
1016                                          * this buf no longer has any I/O associated with it
1017                                          */
1018                                         free_io_buf(cbp);
1019
1020                                         if (cbp == cbp_head) {
1021                                                 /*
1022                                                  * the buf we just freed was the only buf in
1023                                                  * this transaction... so there's no I/O to do
1024                                                  */
1025                                                 cbp_head = NULL;
1026                                         } else {
1027                                                 /*
1028                                                  * remove the buf we just freed from
1029                                                  * the transaction list
1030                                                  */
1031                                                 last_cbp->b_trans_next = NULL;
1032                                                 cbp_tail = last_cbp;
1033                                         }
1034                                 }
1035                                 if (cbp_head) {
1036                                         /*
1037                                          * there was more to the current transaction
1038                                          * than just the page we are pushing out via vnode_pageout...
1039                                          * mark it as finished and complete it... we've already
1040                                          * waited for the I/Os to complete above in the call to cluster_wait_IO
1041                                          */
1042                                         cluster_EOT(cbp_head, cbp_tail, 0);
1043
1044                                         cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1045
1046                                         trans_count = 0;
1047                                 }
1048                         }
1049                         if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1050                                 error = EINVAL;
1051                                 break;
1052                         }
1053                         e_offset = round_page_64(f_offset + 1);
1054                         io_size = e_offset - f_offset;
1055
1056                         f_offset   += io_size;
1057                         upl_offset += io_size;
1058
1059                         if (size >= io_size)
1060                                 size -= io_size;
1061                         else
1062                                 size = 0;
1063                         /*
1064                          * keep track of how much of the original request
1065                          * that we've actually completed... non_rounded_size
1066                          * may go negative due to us rounding the request
1067                          * to a page size multiple (i.e.  size > non_rounded_size)
1068                          */
1069                         non_rounded_size -= io_size;
1070
1071                         if (non_rounded_size <= 0) {
1072                                 /*
1073                                  * we've transferred all of the data in the original
1074                                  * request, but we were unable to complete the tail
1075                                  * of the last page because the file didn't have
1076                                  * an allocation to back that portion... this is ok.
1077                                  */
1078                                 size = 0;
1079                         }
1080                         continue;
1081                 }
1082                 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
1083                 /*
1084                  * we have now figured out how much I/O we can do - this is in 'io_size'
1085                  * pg_offset is the starting point in the first page for the I/O
1086                  * pg_count is the number of full and partial pages that 'io_size' encompasses
1087                  */
1088                 pg_offset = upl_offset & PAGE_MASK;
1089
1090                 if (flags & CL_DEV_MEMORY) {
1091                         /*
1092                          * treat physical requests as one 'giant' page
1093                          */
1094                         pg_count = 1;
1095                 } else
1096                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1097
1098                 if ((flags & CL_READ) && blkno == -1) {
1099                         vm_offset_t  commit_offset;
1100                         int bytes_to_zero;
1101                         int complete_transaction_now = 0;
1102
1103                         /*
1104                          * if we're reading and blkno == -1, then we've got a
1105                          * 'hole' in the file that we need to deal with by zeroing
1106                          * out the affected area in the upl
1107                          */
1108                         if (io_size >= (u_int)non_rounded_size) {
1109                                 /*
1110                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1111                                  * then 'zero_offset' will be non-zero
1112                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1113                                  * (indicated by the io_size finishing off the I/O request for this UPL)
1114                                  * then we're not going to issue an I/O for the
1115                                  * last page in this upl... we need to zero both the hole and the tail
1116                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
1117                                  */
1118                                 bytes_to_zero = non_rounded_size;
1119                                 if (!(flags & CL_NOZERO))
1120                                         bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1121
1122                                 zero_offset = 0;
1123                         } else
1124                                 bytes_to_zero = io_size;
1125
1126                         pg_count = 0;
1127
1128                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1129
1130                         if (cbp_head) {
1131                                 int     pg_resid;
1132
1133                                 /*
1134                                  * if there is a current I/O chain pending
1135                                  * then the first page of the group we just zero'd
1136                                  * will be handled by the I/O completion if the zero
1137                                  * fill started in the middle of the page
1138                                  */
1139                                 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1140
1141                                 pg_resid = commit_offset - upl_offset;
1142
1143                                 if (bytes_to_zero >= pg_resid) {
1144                                         /*
1145                                          * the last page of the current I/O
1146                                          * has been completed...
1147                                          * compute the number of fully zero'd
1148                                          * pages that are beyond it
1149                                          * plus the last page if its partial
1150                                          * and we have no more I/O to issue...
1151                                          * otherwise a partial page is left
1152                                          * to begin the next I/O
1153                                          */
1154                                         if ((int)io_size >= non_rounded_size)
1155                                                 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1156                                         else
1157                                                 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1158
1159                                         complete_transaction_now = 1;
1160                                 }
1161                         } else {
1162                                 /*
1163                                  * no pending I/O to deal with
1164                                  * so, commit all of the fully zero'd pages
1165                                  * plus the last page if its partial
1166                                  * and we have no more I/O to issue...
1167                                  * otherwise a partial page is left
1168                                  * to begin the next I/O
1169                                  */
1170                                 if ((int)io_size >= non_rounded_size)
1171                                         pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1172                                 else
1173                                         pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1174
1175                                 commit_offset = upl_offset & ~PAGE_MASK;
1176                         }
1177                         if ( (flags & CL_COMMIT) && pg_count) {
1178                                 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1179                                                      UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1180                         }
1181                         upl_offset += io_size;
1182                         f_offset   += io_size;
1183                         size       -= io_size;
1184
1185                         /*
1186                          * keep track of how much of the original request
1187                          * that we've actually completed... non_rounded_size
1188                          * may go negative due to us rounding the request
1189                          * to a page size multiple (i.e.  size > non_rounded_size)
1190                          */
1191                         non_rounded_size -= io_size;
1192
1193                         if (non_rounded_size <= 0) {
1194                                 /*
1195                                  * we've transferred all of the data in the original
1196                                  * request, but we were unable to complete the tail
1197                                  * of the last page because the file didn't have
1198                                  * an allocation to back that portion... this is ok.
1199                                  */
1200                                 size = 0;
1201                         }
1202                         if (cbp_head && (complete_transaction_now || size == 0))  {
1203                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1204
1205                                 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1206
1207                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1208
1209                                 trans_count = 0;
1210                         }
1211                         continue;
1212                 }
1213                 if (pg_count > max_vectors) {
1214                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1215                                 io_size = PAGE_SIZE - pg_offset;
1216                                 pg_count = 1;
1217                         } else {
1218                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1219                                 pg_count = max_vectors;
1220                         }
1221                 }
1222                 /*
1223                  * If the transaction is going to reach the maximum number of
1224                  * desired elements, truncate the i/o to the nearest page so
1225                  * that the actual i/o is initiated after this buffer is
1226                  * created and added to the i/o chain.
1227                  *
1228                  * I/O directed to physically contiguous memory
1229                  * doesn't have a requirement to make sure we 'fill' a page
1230                  */
1231                 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1232                                 ((upl_offset + io_size) & PAGE_MASK)) {
1233                         vm_offset_t aligned_ofs;
1234
1235                         aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1236                         /*
1237                          * If the io_size does not actually finish off even a
1238                          * single page we have to keep adding buffers to the
1239                          * transaction despite having reached the desired limit.
1240                          *
1241                          * Eventually we get here with the page being finished
1242                          * off (and exceeded) and then we truncate the size of
1243                          * this i/o request so that it is page aligned so that
1244                          * we can finally issue the i/o on the transaction.
1245                          */
1246                         if (aligned_ofs > upl_offset) {
1247                                 io_size = aligned_ofs - upl_offset;
1248                                 pg_count--;
1249                         }
1250                 }
1251
1252                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
1253                         /*
1254                          * if we're not targeting a virtual device i.e. a disk image
1255                          * it's safe to dip into the reserve pool since real devices
1256                          * can complete this I/O request without requiring additional
1257                          * bufs from the alloc_io_buf pool
1258                          */
1259                         priv = 1;
1260                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
1261                         /*
1262                          * Throttle the speculative IO
1263                          */
1264                         priv = 0;
1265                 else
1266                         priv = 1;
1267
1268                 cbp = alloc_io_buf(vp, priv);
1269
1270                 if (flags & CL_PAGEOUT) {
1271                         u_int i;
1272
1273                         for (i = 0; i < pg_count; i++) {
1274                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
1275                                         panic("BUSY bp found in cluster_io");
1276                         }
1277                 }
1278                 if (flags & CL_ASYNC) {
1279                         if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
1280                                 panic("buf_setcallback failed\n");
1281                 }
1282                 cbp->b_cliodone = (void *)callback;
1283                 cbp->b_flags |= io_flags;
1284
1285                 cbp->b_lblkno = lblkno;
1286                 cbp->b_blkno  = blkno;
1287                 cbp->b_bcount = io_size;
1288
1289                 if (buf_setupl(cbp, upl, upl_offset))
1290                         panic("buf_setupl failed\n");
1291
1292                 cbp->b_trans_next = (buf_t)NULL;
1293
1294                 if ((cbp->b_iostate = (void *)iostate))
1295                         /*
1296                          * caller wants to track the state of this
1297                          * io... bump the amount issued against this stream
1298                          */
1299                         iostate->io_issued += io_size;
1300
1301                 if (flags & CL_READ) {
1302                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1303                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1304                 }
1305                 else {
1306                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1307                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1308                 }
1309
1310                 if (cbp_head) {
1311                         cbp_tail->b_trans_next = cbp;
1312                         cbp_tail = cbp;
1313                 } else {
1314                         cbp_head = cbp;
1315                         cbp_tail = cbp;
1316
1317                         if ( (cbp_head->b_real_bp = real_bp) ) {
1318                                 cbp_head->b_flags |= B_NEED_IODONE;
1319                                 real_bp = (buf_t)NULL;
1320                         }
1321                 }
1322                 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1323
1324                 trans_count++;
1325
1326                 upl_offset += io_size;
1327                 f_offset   += io_size;
1328                 size       -= io_size;
1329                 /*
1330                  * keep track of how much of the original request
1331                  * that we've actually completed... non_rounded_size
1332                  * may go negative due to us rounding the request
1333                  * to a page size multiple (i.e.  size > non_rounded_size)
1334                  */
1335                 non_rounded_size -= io_size;
1336
1337                 if (non_rounded_size <= 0) {
1338                         /*
1339                          * we've transferred all of the data in the original
1340                          * request, but we were unable to complete the tail
1341                          * of the last page because the file didn't have
1342                          * an allocation to back that portion... this is ok.
1343                          */
1344                         size = 0;
1345                 }
1346                 if (size == 0) {
1347                         /*
1348                          * we have no more I/O to issue, so go
1349                          * finish the final transaction
1350                          */
1351                         need_EOT = TRUE;
1352                 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1353                             ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
1354                         /*
1355                          * I/O directed to physically contiguous memory...
1356                          * which doesn't have a requirement to make sure we 'fill' a page
1357                          * or...
1358                          * the current I/O we've prepared fully
1359                          * completes the last page in this request
1360                          * and ...
1361                          * it's either an ASYNC request or
1362                          * we've already accumulated more than max_trans_count I/O's into
1363                          * this transaction so mark it as complete so that
1364                          * it can finish asynchronously or via the cluster_complete_transaction
1365                          * below if the request is synchronous
1366                          */
1367                         need_EOT = TRUE;
1368                 }
1369                 if (need_EOT == TRUE)
1370                         cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1371
1372                 if (flags & CL_THROTTLE)
1373                         (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1374
1375                 if ( !(io_flags & B_READ))
1376                         vnode_startwrite(vp);
1377
1378                 (void) VNOP_STRATEGY(cbp);
1379
1380                 if (need_EOT == TRUE) {
1381                         if ( !(flags & CL_ASYNC))
1382                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1383
1384                         need_EOT = FALSE;
1385                         trans_count = 0;
1386                         cbp_head = NULL;
1387                 }
1388         }
1389         if (error) {
1390                 int abort_size;
1391
1392                 io_size = 0;
1393
1394                 if (cbp_head) {
1395                          /*
1396                           * first wait until all of the outstanding I/O
1397                           * for this partial transaction has completed
1398                           */
1399                         cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1400
1401                         /*
1402                          * Rewind the upl offset to the beginning of the
1403                          * transaction.
1404                          */
1405                         upl_offset = cbp_head->b_uploffset;
1406
1407                         for (cbp = cbp_head; cbp;) {
1408                                 buf_t   cbp_next;
1409
1410                                 size       += cbp->b_bcount;
1411                                 io_size    += cbp->b_bcount;
1412
1413                                 cbp_next = cbp->b_trans_next;
1414                                 free_io_buf(cbp);
1415                                 cbp = cbp_next;
1416                         }
1417                 }
1418                 if (iostate) {
1419                         int need_wakeup = 0;
1420
1421                         /*
1422                          * update the error condition for this stream
1423                          * since we never really issued the io
1424                          * just go ahead and adjust it back
1425                          */
1426                         lck_mtx_lock_spin(cl_mtxp);
1427
1428                         if (iostate->io_error == 0)
1429                                 iostate->io_error = error;
1430                         iostate->io_issued -= io_size;
1431
1432                         if (iostate->io_wanted) {
1433                                 /*
1434                                  * someone is waiting for the state of
1435                                  * this io stream to change
1436                                  */
1437                                 iostate->io_wanted = 0;
1438                                 need_wakeup = 1;
1439                         }
1440                         lck_mtx_unlock(cl_mtxp);
1441
1442                         if (need_wakeup)
1443                                 wakeup((caddr_t)&iostate->io_wanted);
1444                 }
1445                 if (flags & CL_COMMIT) {
1446                         int     upl_flags;
1447
1448                         pg_offset  = upl_offset & PAGE_MASK;
1449                         abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1450
1451                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags);
1452
1453                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1454                                      (int)upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1455                 }
1456                 if (retval == 0)
1457                         retval = error;
1458         } else if (cbp_head)
1459                         panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1460
1461         if (real_bp) {
1462                 /*
1463                  * can get here if we either encountered an error
1464                  * or we completely zero-filled the request and
1465                  * no I/O was issued
1466                  */
1467                 if (error) {
1468                         real_bp->b_flags |= B_ERROR;
1469                         real_bp->b_error = error;
1470                 }
1471                 buf_biodone(real_bp);
1472         }
1473         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1474
1475         return (retval);
1476 }
1477
1478
/*
 * cluster_read_prefetch
 *
 * Issue an advisory read for the byte range starting at f_offset, after
 * clamping 'size' so that the range does not extend past 'filesize'.
 *
 * callback, callback_arg and bflag are not interpreted here; they are
 * handed unchanged to advisory_read_ext(), whose return value is ignored.
 *
 * Returns the number of pages spanned by the (clamped) request, or 0 when
 * f_offset is at or beyond filesize, in which case nothing is issued.
 */
1479 static int
1480 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1481 {
1482         int           pages_in_prefetch;
1483
1484         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1485                      (int)f_offset, size, (int)filesize, 0, 0);
1486
             /* nothing to prefetch when starting at or past EOF */
1487         if (f_offset >= filesize) {
1488                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1489                              (int)f_offset, 0, 0, 0, 0);
1490                 return(0);
1491         }
             /* trim the request so it ends at EOF */
1492         if ((off_t)size > (filesize - f_offset))
1493                 size = filesize - f_offset;
             /* byte count rounded up to whole pages; this is the return value */
1494         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1495
1496         advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1497
1498         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1499                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1500
1501         return (pages_in_prefetch);
1502 }
1503
1504
1505
/*
 * cluster_read_ahead
 *
 * Decide whether the read described by 'extent' should trigger a
 * read-ahead and, if so, issue it through cluster_read_prefetch() and
 * update the read-ahead state in 'rap'.
 *
 * extent->b_addr / extent->e_addr and rap->cl_lastr / rap->cl_maxra are
 * treated as page indexes: the prefetch start address is multiplied by
 * PAGE_SIZE_64 to obtain a file offset.  rap->cl_ralen is the current
 * read-ahead length in pages.
 *
 * callback, callback_arg and bflag are passed through to
 * cluster_read_prefetch() untouched.
 *
 * Every exit emits a DBG_FUNC_END trace; the fourth trace argument (0-4)
 * distinguishes the exit paths.
 */
1506 static void
1507 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1508                    int bflag)
1509 {
1510         daddr64_t       r_addr;
1511         off_t           f_offset;
1512         int             size_of_prefetch;
1513         u_int           max_prefetch;
1514
1515
1516         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1517                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1518
             /*
              * a single-page read of the same page as the last read:
              * leave the read-ahead state untouched
              */
1519         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1520                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1521                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1522                 return;
1523         }
             /*
              * the read neither starts on the last page read nor on the
              * page right after it (or cl_lastr is -1, presumably meaning
              * no prior read has been recorded -- TODO confirm)...
              * reset the read-ahead window and do not prefetch
              */
1524         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1525                 rap->cl_ralen = 0;
1526                 rap->cl_maxra = 0;
1527
1528                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1529                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1530
1531                 return;
1532         }
1533         max_prefetch = MAX_PREFETCH(vp);
1534
             /*
              * the previous read-ahead already reaches more than a quarter
              * of the maximum prefetch size (in pages) beyond the end of
              * this read... no need to issue another one yet
              */
1535         if (extent->e_addr < rap->cl_maxra) {
1536                 if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {
1537
1538                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1539                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1540                         return;
1541                 }
1542         }
             /*
              * start the prefetch on the first page past both the current
              * read and anything previously read ahead
              */
1543         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1544         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1545
1546         size_of_prefetch = 0;
1547
             /*
              * probe the one page at f_offset with UPL_ROP_PRESENT; a
              * non-zero result presumably means the page is already
              * resident (TODO confirm against ubc_range_op), in which
              * case no prefetch is issued
              */
1548         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1549
1550         if (size_of_prefetch) {
1551                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1552                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1553                 return;
1554         }
1555         if (f_offset < filesize) {
1556                 daddr64_t read_size;
1557
                     /*
                      * grow the window: start at 1 page, otherwise double
                      * it, capped at max_prefetch expressed in pages
                      */
1558                 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
1559
1560                 read_size = (extent->e_addr + 1) - extent->b_addr;
1561
                     /*
                      * make the window at least as large as the read that
                      * triggered it, still subject to the same cap
                      */
1562                 if (read_size > rap->cl_ralen) {
1563                         if (read_size > max_prefetch / PAGE_SIZE)
1564                                 rap->cl_ralen = max_prefetch / PAGE_SIZE;
1565                         else
1566                                 rap->cl_ralen = read_size;
1567                 }
1568                 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
1569
                     /* record the last page covered by this prefetch */
1570                 if (size_of_prefetch)
1571                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1572         }
1573         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1574                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1575 }
1576
1577
/*
 * cluster_pageout
 *
 * Convenience form of cluster_pageout_ext() for callers that have no
 * completion callback: forwards every argument unchanged and passes NULL
 * for both callback and callback_arg.  Returns whatever
 * cluster_pageout_ext() returns.
 */
1578 int
1579 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1580                 int size, off_t filesize, int flags)
1581 {
1582         return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1583
1584 }
1585
1586
/*
 * cluster_pageout_ext
 *
 * Write 'size' bytes of the pages described by 'upl' (starting at
 * upl_offset) to file offset f_offset of vp, via cluster_io().
 *
 * 'flags' is translated into cluster_io flags:
 *   UPL_IOSYNC not set    -> CL_ASYNC
 *   UPL_NOCOMMIT not set  -> CL_COMMIT
 *   UPL_KEEPCACHED set    -> CL_KEEPCACHED
 *   IO_PASSIVE set        -> CL_PASSIVE
 * CL_PAGEOUT is always set; CL_THROTTLE is set unless the mount is
 * flagged MNTK_VIRTUALDEV.
 *
 * Returns:
 *   EINVAL  size <= 0 (the upl is left untouched), or f_offset is
 *           negative, at/after filesize or not page aligned, or size is
 *           not a multiple of the page size
 *   EROFS   the mount is read-only
 *   otherwise the value returned by cluster_io()
 *
 * On the EROFS and the second EINVAL path, and for any part of the
 * request lying beyond the page that contains EOF, the affected upl range
 * is aborted with UPL_ABORT_FREE_ON_EMPTY, but only when CL_COMMIT is in
 * effect (i.e. the caller did not pass UPL_NOCOMMIT).
 */
1587 int
1588 cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1589                 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1590 {
1591         int           io_size;
1592         int           rounded_size;
1593         off_t         max_size;
1594         int           local_flags;
1595
1596         if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1597                 /*
1598                  * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1599                  * then we don't want to enforce this throttle... if we do, we can
1600                  * potentially deadlock since we're stalling the pageout thread at a time
1601                  * when the disk image might need additional memory (which won't be available
1602                  * if the pageout thread can't run)... instead we'll just depend on the throttle
1603                  * that the pageout thread now has in place to deal with external files
1604                  */
1605                 local_flags = CL_PAGEOUT;
1606         else
1607                 local_flags = CL_PAGEOUT | CL_THROTTLE;
1608
1609         if ((flags & UPL_IOSYNC) == 0)
1610                 local_flags |= CL_ASYNC;
1611         if ((flags & UPL_NOCOMMIT) == 0)
1612                 local_flags |= CL_COMMIT;
1613         if ((flags & UPL_KEEPCACHED))
1614                 local_flags |= CL_KEEPCACHED;
1615         if (flags & IO_PASSIVE)
1616                 local_flags |= CL_PASSIVE;
1617
1618
1619         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1620                      (int)f_offset, size, (int)filesize, local_flags, 0);
1621
1622         /*
1623          * If they didn't specify any I/O, then we are done...
1624          * we can't issue an abort because we don't know how
1625          * big the upl really is
1626          */
1627         if (size <= 0)
1628                 return (EINVAL);
1629
1630         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1631                 if (local_flags & CL_COMMIT)
1632                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1633                 return (EROFS);
1634         }
1635         /*
1636          * can't page-out to a negative offset
1637          * or if we're starting beyond the EOF
1638          * or if the file offset isn't page aligned
1639          * or the size requested isn't a multiple of PAGE_SIZE
1640          */
1641         if (f_offset < 0 || f_offset >= filesize ||
1642            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1643                 if (local_flags & CL_COMMIT)
1644                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1645                 return (EINVAL);
1646         }
             /* bytes available between f_offset and EOF */
1647         max_size = filesize - f_offset;
1648
             /* never write past EOF: io_size = min(size, max_size) */
1649         if (size < max_size)
1650                 io_size = size;
1651         else
1652                 io_size = max_size;
1653
             /* io_size rounded up to a page multiple */
1654         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1655
             /*
              * pages of the request that lie entirely beyond the page
              * containing EOF are not handed to cluster_io... abort that
              * part of the upl here when we own the commit
              */
1656         if (size > rounded_size) {
1657                 if (local_flags & CL_COMMIT)
1658                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1659                                         UPL_ABORT_FREE_ON_EMPTY);
1660         }
1661         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1662                            local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
1663 }
1664
1665
/*
 * cluster_pagein
 *
 * Convenience form of cluster_pagein_ext() for callers that have no
 * completion callback: forwards every argument unchanged and passes NULL
 * for both callback and callback_arg.  Returns whatever
 * cluster_pagein_ext() returns.
 */
1666 int
1667 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1668                int size, off_t filesize, int flags)
1669 {
1670         return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1671 }
1672
1673
/*
 * cluster_pagein_ext
 *
 * Read 'size' bytes from file offset f_offset of vp into the pages
 * described by 'upl' (starting at upl_offset), via cluster_io() with
 * CL_READ | CL_PAGEIN.
 *
 * 'flags' is translated into cluster_io flags:
 *   UPL_IOSYNC not set    -> CL_ASYNC
 *   UPL_NOCOMMIT not set  -> CL_COMMIT
 *   IO_PASSIVE set        -> CL_PASSIVE
 *
 * Panics if upl is NULL or size is negative.
 *
 * Returns EINVAL when f_offset is negative, at/after filesize or not page
 * aligned, or when size or upl_offset is not page aligned; otherwise the
 * value returned by cluster_io().
 *
 * On the EINVAL path, and for any part of the request lying beyond the
 * page that contains EOF, the affected upl range is aborted with
 * UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR, but only when CL_COMMIT is
 * in effect (i.e. the caller did not pass UPL_NOCOMMIT).
 */
1674 int
1675 cluster_pagein_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1676                int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1677 {
1678         u_int         io_size;
1679         int           rounded_size;
1680         off_t         max_size;
1681         int           retval;
1682         int           local_flags = 0;
1683
             /*
              * NOTE(review): this check also fires for size < 0, but the
              * panic text only mentions a NULL upl... consider a message
              * that covers both conditions
              */
1684         if (upl == NULL || size < 0)
1685                 panic("cluster_pagein: NULL upl passed in");
1686
1687         if ((flags & UPL_IOSYNC) == 0)
1688                 local_flags |= CL_ASYNC;
1689         if ((flags & UPL_NOCOMMIT) == 0)
1690                 local_flags |= CL_COMMIT;
1691         if (flags & IO_PASSIVE)
1692                 local_flags |= CL_PASSIVE;
1693
1694
1695         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1696                      (int)f_offset, size, (int)filesize, local_flags, 0);
1697
1698         /*
1699          * can't page-in from a negative offset
1700          * or if we're starting beyond the EOF
1701          * or if the file offset isn't page aligned
1702          * or the size requested isn't a multiple of PAGE_SIZE
1703          * or the offset into the upl isn't page aligned
1704          */
1705         if (f_offset < 0 || f_offset >= filesize ||
1706            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1707                 if (local_flags & CL_COMMIT)
1708                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1709                 return (EINVAL);
1710         }
             /* bytes available between f_offset and EOF */
1711         max_size = filesize - f_offset;
1712
             /* never read past EOF: io_size = min(size, max_size) */
1713         if (size < max_size)
1714                 io_size = size;
1715         else
1716                 io_size = max_size;
1717
             /* io_size rounded up to a page multiple */
1718         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1719
             /*
              * pages of the request that lie entirely beyond the page
              * containing EOF are not handed to cluster_io... abort that
              * part of the upl here when we own the commit
              */
1720         if (size > rounded_size && (local_flags & CL_COMMIT))
1721                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1722                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1723
1724         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1725                             local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1726
1727         return (retval);
1728 }
1728
1729
/*
 * cluster_bp
 *
 * Convenience form of cluster_bp_ext() for callers that have no
 * completion callback: passes NULL for both callback and callback_arg and
 * returns whatever cluster_bp_ext() returns.
 */
1730 int
1731 cluster_bp(buf_t bp)
1732 {
1733        return cluster_bp_ext(bp, NULL, NULL);
1734 }
1735
1736
/*
 * cluster_bp_ext
 *
 * Issue the I/O described by buffer 'bp' through cluster_io().
 *
 * The request is always issued with CL_ASYNC; CL_READ is added when the
 * buffer has B_READ set and CL_PASSIVE when it has B_PASSIVE set.  The
 * file offset is derived from the buffer's logical block number with
 * ubc_blktooff().  The buffer's own upl (bp->b_upl) is used, starting at
 * upl offset 0, for bp->b_bcount bytes, and bp itself is passed as
 * cluster_io's buf_t argument.
 *
 * callback and callback_arg are passed through to cluster_io().
 *
 * Returns the value returned by cluster_io().
 */
1737 int
1738 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
1739 {
1740         off_t  f_offset;
1741         int    flags;
1742
1743         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1744                      (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1745
             /* direction comes from the buffer; the I/O is always async */
1746         if (bp->b_flags & B_READ)
1747                 flags = CL_ASYNC | CL_READ;
1748         else
1749                 flags = CL_ASYNC;
1750         if (bp->b_flags & B_PASSIVE)
1751                 flags |= CL_PASSIVE;
1752
             /* logical block number -> byte offset in the file */
1753         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1754
1755         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
1756 }
1757
1758
1759
/*
 * cluster_write
 *
 * Convenience form of cluster_write_ext() for callers that have no
 * completion callback: forwards every argument unchanged and passes NULL
 * for both callback and callback_arg.  Returns whatever
 * cluster_write_ext() returns.
 */
1760 int
1761 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1762 {
1763         return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
1764 }
1765
1766
/*
 * cluster_write_ext
 *
 * Top-level write dispatcher.  Chooses, per pass of the loop below,
 * between the cached copy path (cluster_write_copy), the contiguous path
 * (cluster_write_contig) and the direct path (cluster_write_direct).
 *
 *   vp                    vnode being written
 *   uio                   source of the data; NULL means "zero-fill only",
 *                         handled by a single cluster_write_copy() call
 *   oldEOF, newEOF        passed to the helper routines; the loop also
 *                         stops once uio->uio_offset reaches newEOF
 *   headOff, tailOff      passed to cluster_write_copy() for the
 *                         IO_HEADZEROFILL / IO_TAILZEROFILL handling
 *   xflags                IO_* flags; IO_NOCACHE is added locally when the
 *                         vnode has VNOCACHE_DATA set
 *   callback/callback_arg passed through to the helper routines
 *
 * The write type starts as IO_COPY.  It is only handed to
 * cluster_io_type() (which receives pointers to write_type and
 * write_length) when IO_NOCACHE is in effect and the uio targets user
 * space; a resulting IO_DIRECT is forced back to IO_COPY if either
 * zero-fill flag is set.
 *
 * Returns 0, or the first non-zero value returned by a helper; the loop
 * stops as soon as retval is non-zero.
 */
1767 int
1768 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
1769                   int xflags, int (*callback)(buf_t, void *), void *callback_arg)
1770 {
1771         user_ssize_t    cur_resid;
1772         int             retval = 0;
1773         int             flags;
1774         int             zflags;
1775         int             bflag;
1776         int             write_type = IO_COPY;
1777         u_int32_t       write_length;
1778
1779         flags = xflags;
1780
             /* bflag is only consumed by the cluster_write_contig() call below */
1781         if (flags & IO_PASSIVE)
1782             bflag = CL_PASSIVE;
1783         else
1784             bflag = 0;
1785
1786         if (vp->v_flag & VNOCACHE_DATA)
1787                 flags |= IO_NOCACHE;
1788
1789         if (uio == NULL) {
1790                 /*
1791                  * no user data...
1792                  * this call is being made to zero-fill some range in the file
1793                  */
1794                 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
1795
1796                 return(retval);
1797         }
1798         /*
1799          * do a write through the cache if one of the following is true....
1800          *   NOCACHE is not true or
1801          *   the uio request doesn't target USERSPACE
1802          * otherwise, find out if we want the direct or contig variant for
1803          * the first vector in the uio request
1804          */
1805         if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
1806                 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
1807
1808         if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
1809                 /*
1810                  * must go through the cached variant in this case
1811                  */
1812                 write_type = IO_COPY;
1813
             /*
              * keep going while the uio still has data, we are below newEOF
              * and no helper has reported an error... the contig, direct and
              * cluster_io_type calls are handed &write_type / &write_length,
              * so the type acted on may differ from one pass to the next
              */
1814         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
1815
1816                 switch (write_type) {
1817
1818                 case IO_COPY:
1819                         /*
1820                          * make sure the uio_resid isn't too big...
1821                          * internally, we want to handle all of the I/O in
1822                          * chunk sizes that fit in a 32 bit int
1823                          */
1824                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
1825                                 /*
1826                                  * we're going to have to call cluster_write_copy
1827                                  * more than once...
1828                                  *
1829                                  * only want the last call to cluster_write_copy to
1830                                  * have the IO_TAILZEROFILL flag set and only the
1831                                  * first call should have IO_HEADZEROFILL
1832                                  */
1833                                 zflags = flags & ~IO_TAILZEROFILL;
1834                                 flags &= ~IO_HEADZEROFILL;
1835
1836                                 write_length = MAX_IO_REQUEST_SIZE;
1837                         } else {
1838                                 /*
1839                                  * last call to cluster_write_copy
1840                                  */
1841                                 zflags = flags;
1842
1843                                 write_length = (u_int32_t)cur_resid;
1844                         }
1845                         retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
1846                         break;
1847
1848                 case IO_CONTIG:
                             /*
                              * the zero-fill requests are serviced by separate
                              * synchronous cluster_write_copy calls made with no uio
                              */
1849                         zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
1850
1851                         if (flags & IO_HEADZEROFILL) {
1852                                 /*
1853                                  * only do this once per request
1854                                  */
1855                                 flags &= ~IO_HEADZEROFILL;
1856
1857                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
1858                                                             headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
1859                                 if (retval)
1860                                         break;
1861                         }
1862                         retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
1863
1864                         if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
1865                                 /*
1866                                  * we're done with the data from the user specified buffer(s)
1867                                  * and we've been requested to zero fill at the tail
1868                                  * treat this as an IO_HEADZEROFILL which doesn't require a uio
1869                                  * by rearranging the args and passing in IO_HEADZEROFILL
1870                                  */
1871                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
1872                                                             (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
1873                         }
1874                         break;
1875
1876                 case IO_DIRECT:
1877                         /*
1878                          * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
1879                          */
1880                         retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
1881                         break;
1882
1883                 case IO_UNKNOWN:
                             /* ask cluster_io_type to classify the request again */
1884                         retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
1885                         break;
1886                 }
1887         }
1888         return (retval);
1889 }
1890
1891
1892 static int
1893 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
1894                      int flags, int (*callback)(buf_t, void *), void *callback_arg)
1895 {
1896         upl_t            upl;
1897         upl_page_info_t  *pl;
1898         vm_offset_t      upl_offset;
1899         u_int32_t        io_req_size;
1900         u_int32_t        offset_in_file;
1901         u_int32_t        offset_in_iovbase;
1902         u_int32_t        io_size;
1903         int              io_flag;
1904         int              bflag;
1905         vm_size_t        upl_size;
1906         vm_size_t        upl_needed_size;
1907         mach_msg_type_number_t  pages_in_pl;
1908         int              upl_flags;
1909         kern_return_t    kret;
1910         mach_msg_type_number_t  i;
1911         int              force_data_sync;
1912         int              retval = 0;
1913         int              first_IO = 1;
1914         struct clios     iostate;
1915         user_addr_t      iov_base;
1916         u_int32_t        mem_alignment_mask;
1917         u_int32_t        devblocksize;
1918         u_int32_t        max_upl_size;
1919
1920
1921         max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
1922
1923         if (flags & IO_PASSIVE)
1924                 bflag = CL_PASSIVE;
1925         else
1926                 bflag = 0;
1927
1928         /*
1929          * When we enter this routine, we know
1930          *  -- the resid will not exceed iov_len
1931          */
1932         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1933                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
1934
1935         iostate.io_completed = 0;
1936         iostate.io_issued = 0;
1937         iostate.io_error = 0;
1938         iostate.io_wanted = 0;
1939
1940         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
1941         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
1942
1943         if (devblocksize == 1) {
1944                /*
1945                 * the AFP client advertises a devblocksize of 1
1946                 * however, its BLOCKMAP routine maps to physical
1947                 * blocks that are PAGE_SIZE in size...
1948                 * therefore we can't ask for I/Os that aren't page aligned
1949                 * or aren't multiples of PAGE_SIZE in size
1950                 * by setting devblocksize to PAGE_SIZE, we re-instate
1951                 * the old behavior we had before the mem_alignment_mask
1952                 * changes went in...
1953                 */
1954                devblocksize = PAGE_SIZE;
1955         }
1956
1957 next_dwrite:
1958         io_req_size = *write_length;
1959         iov_base = uio_curriovbase(uio);
1960
1961         offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
1962         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
1963
1964         if (offset_in_file || offset_in_iovbase) {
1965                 /*
1966                  * one of the 2 important offsets is misaligned
1967                  * so fire an I/O through the cache for this entire vector
1968                  */
1969                 goto wait_for_dwrites;
1970         }
1971         if (iov_base & (devblocksize - 1)) {
1972                 /*
1973                  * the offset in memory must be on a device block boundary
1974                  * so that we can guarantee that we can generate an
1975                  * I/O that ends on a page boundary in cluster_io
1976                  */
1977                 goto wait_for_dwrites;
1978         }
1979
1980         while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
1981
1982                 if (first_IO) {
1983                         cluster_syncup(vp, newEOF, callback, callback_arg);
1984                         first_IO = 0;
1985                 }
1986                 io_size  = io_req_size & ~PAGE_MASK;
1987                 iov_base = uio_curriovbase(uio);
1988
1989                 if (io_size > max_upl_size)
1990                         io_size = max_upl_size;
1991
1992                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
1993                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1994
1995                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1996                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1997
1998                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1999                         pages_in_pl = 0;
2000                         upl_size = upl_needed_size;
2001                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2002                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2003
2004                         kret = vm_map_get_upl(current_map(),
2005                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2006                                               &upl_size,
2007                                               &upl,
2008                                               NULL,
2009                                               &pages_in_pl,
2010                                               &upl_flags,
2011                                               force_data_sync);
2012
2013                         if (kret != KERN_SUCCESS) {
2014                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2015                                              0, 0, 0, kret, 0);
2016                                 /*
2017                                  * failed to get pagelist
2018                                  *
2019                                  * we may have already spun some portion of this request
2020                                  * off as async requests... we need to wait for the I/O
2021                                  * to complete before returning
2022                                  */
2023                                 goto wait_for_dwrites;
2024                         }
2025                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2026                         pages_in_pl = upl_size / PAGE_SIZE;
2027
2028                         for (i = 0; i < pages_in_pl; i++) {
2029                                 if (!upl_valid_page(pl, i))
2030                                         break;
2031                         }
2032                         if (i == pages_in_pl)
2033                                 break;
2034
2035                         /*
2036                          * didn't get all the pages back that we
2037                          * needed... release this upl and try again
2038                          */
2039                         ubc_upl_abort(upl, 0);
2040                 }
2041                 if (force_data_sync >= 3) {
2042                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2043                                      i, pages_in_pl, upl_size, kret, 0);
2044                         /*
2045                          * for some reason, we couldn't acquire a hold on all
2046                          * the pages needed in the user's address space
2047                          *
2048                          * we may have already spun some portion of this request
2049                          * off as async requests... we need to wait for the I/O
2050                          * to complete before returning
2051                          */
2052                         goto wait_for_dwrites;
2053                 }
2054
2055                 /*
2056                  * Consider the possibility that upl_size wasn't satisfied.
2057                  */
2058                 if (upl_size < upl_needed_size) {
2059                         if (upl_size && upl_offset == 0)
2060                                 io_size = upl_size;
2061                         else
2062                                 io_size = 0;
2063                 }
2064                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2065                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2066
2067                 if (io_size == 0) {
2068                         ubc_upl_abort(upl, 0);
2069                         /*
2070                          * we may have already spun some portion of this request
2071                          * off as async requests... we need to wait for the I/O
2072                          * to complete before returning
2073                          */
2074                         goto wait_for_dwrites;
2075                 }
2076
2077                 /*
2078                  * Now look for pages already in the cache
2079                  * and throw them away.
2080                  * uio->uio_offset is page aligned within the file
2081                  * io_size is a multiple of PAGE_SIZE
2082                  */
2083                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
2084
2085                 /*
2086                  * we want push out these writes asynchronously so that we can overlap
2087                  * the preparation of the next I/O
2088                  * if there are already too many outstanding writes
2089                  * wait until some complete before issuing the next
2090                  */
2091                 lck_mtx_lock(cl_mtxp);
2092
2093                 while ((iostate.io_issued - iostate.io_completed) > (2 * max_upl_size)) {
2094
2095                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
2096                                         iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0);
2097
2098                         iostate.io_wanted = 1;
2099                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL);
2100
2101                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
2102                                         iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0);
2103                 }
2104                 lck_mtx_unlock(cl_mtxp);
2105
2106                 if (iostate.io_error) {
2107                         /*
2108                          * one of the earlier writes we issued ran into a hard error
2109                          * don't issue any more writes, cleanup the UPL
2110                          * that was just created but not used, then
2111                          * go wait for all writes that are part of this stream
2112                          * to complete before returning the error to the caller
2113                          */
2114                         ubc_upl_abort(upl, 0);
2115
2116                         goto wait_for_dwrites;
2117                 }
2118                 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO | bflag;
2119
2120                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2121                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2122
2123                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2124                                    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2125
2126                 /*
2127                  * update the uio structure to
2128                  * reflect the I/O that we just issued
2129                  */
2130                 uio_update(uio, (user_size_t)io_size);
2131
2132                 io_req_size -= io_size;
2133
2134                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2135                              (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2136
2137         } /* end while */
2138
2139         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2140
2141                 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2142
2143                 if (retval == 0 && *write_type == IO_DIRECT) {
2144
2145                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2146                                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2147
2148                         goto next_dwrite;
2149                 }
2150         }
2151
2152 wait_for_dwrites:
2153         if (iostate.io_issued) {
2154                 /*
2155                  * make sure all async writes issued as part of this stream
2156                  * have completed before we return
2157                  */
2158                 lck_mtx_lock(cl_mtxp);
2159
2160                 while (iostate.io_issued != iostate.io_completed) {
2161                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
2162                                         iostate.io_issued, iostate.io_completed, 0, 0, 0);
2163
2164                         iostate.io_wanted = 1;
2165                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL);
2166
2167                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
2168                                         iostate.io_issued, iostate.io_completed, 0, 0, 0);
2169                 }
2170                 lck_mtx_unlock(cl_mtxp);
2171         }
2172         if (iostate.io_error)
2173                 retval = iostate.io_error;
2174
2175         if (io_req_size && retval == 0) {
2176                 /*
2177                  * we couldn't handle the tail of this request in DIRECT mode
2178                  * so fire it through the copy path
2179                  *
2180                  * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2181                  * so we can just pass 0 in for the headOff and tailOff
2182                  */
2183                 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2184
2185                 *write_type = IO_UNKNOWN;
2186         }
2187         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2188                      (int)uio->uio_offset, io_req_size, retval, 4, 0);
2189
2190         return (retval);
2191 }
2192
2193
/*
 * cluster_write_contig
 *
 * Write path used when the current source buffer of 'uio' is physically
 * contiguous (see the entry conditions noted at the top of the body).
 * For each such vector the routine
 *   1) takes a UPL on the caller's buffer with vm_map_get_upl(),
 *   2) hands any leading piece that is not device-block aligned (or is
 *      shorter than one device block) to cluster_align_phys_io(),
 *   3) issues the block-aligned middle to cluster_io() with CL_ASYNC in
 *      pieces of at most MAX_IO_CONTIG_SIZE bytes,
 *   4) asks cluster_io_type() whether the next vector can be handled the
 *      same way and, if so, loops back (at most MAX_VECTS UPLs are held).
 * Before returning it waits for every issued write to complete, pushes a
 * trailing sub-block remainder through cluster_align_phys_io(), and
 * releases every UPL it acquired.
 *
 * Parameters:
 *   vp            vnode being written; its mount supplies mnt_devblocksize
 *                 and mnt_alignmentmask
 *   uio           source buffer / file offset; advanced with uio_update()
 *                 for each piece successfully handed to cluster_io()
 *   newEOF        passed to cluster_syncup() on entry
 *   write_type    in/out: refreshed by cluster_io_type() when another
 *                 vector is considered; set to IO_UNKNOWN when the main
 *                 loop finishes but chaining is not attempted (error,
 *                 pending tail, or MAX_VECTS reached).  Left untouched on
 *                 the early 'goto wait_for_cwrites' error paths.
 *   write_length  in/out: byte count of the current request; refreshed
 *                 together with write_type
 *   callback,
 *   callback_arg  passed through unchanged to the lower-level helpers
 *   bflag         extra flag bits OR'ed into the flags given to cluster_io()
 *
 * Returns:
 *   0 on success.  EINVAL if the UPL cannot be created, covers less than
 *   was asked for, or the buffer fails the mount's memory alignment mask.
 *   Otherwise the error reported by cluster_align_phys_io(), cluster_io()
 *   or cluster_io_type(); a non-zero iostate.io_error recorded by the
 *   async writes replaces any of those.
 */
2194 static int
2195 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2196                      int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2197 {
2198         upl_page_info_t *pl;
2199         addr64_t         src_paddr = 0;		/* physical address of the next source byte */
2200         upl_t            upl[MAX_VECTS];		/* one UPL per vector processed */
2201         vm_offset_t      upl_offset;
2202         u_int32_t        tail_size = 0;		/* sub-block remainder written after the drain */
2203         u_int32_t        io_size;
2204         u_int32_t        xsize;			/* size of the piece currently being issued */
2205         vm_size_t        upl_size;
2206         vm_size_t        upl_needed_size;
2207         mach_msg_type_number_t  pages_in_pl;	/* filled in by vm_map_get_upl(); not read afterwards here */
2208         int              upl_flags;
2209         kern_return_t    kret;
2210         struct clios     iostate;
2211         int              error  = 0;
2212         int              cur_upl = 0;		/* index into upl[] for the current vector */
2213         int              num_upl = 0;		/* number of UPLs that must be released on exit */
2214         int              n;
2215         user_addr_t      iov_base;
2216         u_int32_t        devblocksize;
2217         u_int32_t        mem_alignment_mask;
2218
2219         /*
2220          * When we enter this routine, we know
2221          *  -- the io_req_size will not exceed iov_len
2222          *  -- the target address is physically contiguous
2223          */
             /*
              * NOTE(review): presumably cluster_syncup() pushes out data already
              * cached for this vnode before we bypass the cache -- confirm
              * against its definition.
              */
2224         cluster_syncup(vp, newEOF, callback, callback_arg);
2225
2226         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2227         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2228
             /*
              * iostate is handed to every cluster_io() call below; the wait
              * loops compare io_issued against io_completed and sleep on
              * io_wanted, and io_error is checked for failures of earlier writes.
              */
2229         iostate.io_completed = 0;
2230         iostate.io_issued = 0;
2231         iostate.io_error = 0;
2232         iostate.io_wanted = 0;
2233
             /*
              * one pass from here per vector; each pass uses slot 'cur_upl' of upl[]
              */
2234 next_cwrite:
2235         io_size = *write_length;
2236
2237         iov_base = uio_curriovbase(uio);
2238
             /*
              * the UPL is requested from the page-aligned base of the buffer, so
              * it has to cover the offset into the first page plus the request
              */
2239         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2240         upl_needed_size = upl_offset + io_size;
2241
2242         pages_in_pl = 0;
2243         upl_size = upl_needed_size;
2244         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2245                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2246
2247         kret = vm_map_get_upl(current_map(),
2248                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2249                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
2250
2251         if (kret != KERN_SUCCESS) {
2252                 /*
2253                  * failed to get pagelist
2254                  */
2255                 error = EINVAL;
2256                 goto wait_for_cwrites;
2257         }
             /*
              * count the UPL as soon as it exists so that the release loop at
              * the bottom covers it on every exit path from here on
              */
2258         num_upl++;
2259
2260         /*
2261          * Consider the possibility that upl_size wasn't satisfied.
2262          */
2263         if (upl_size < upl_needed_size) {
2264                 /*
2265                  * This is a failure in the physical memory case.
2266                  */
2267                 error = EINVAL;
2268                 goto wait_for_cwrites;
2269         }
2270         pl = ubc_upl_pageinfo(upl[cur_upl]);
2271
             /*
              * physical address of the first source byte: the value returned by
              * upl_phys_page() for entry 0, shifted up, plus the offset into
              * that page.
              * NOTE(review): the literal shift of 12 assumes 4KB pages -- confirm
              * it matches PAGE_SIZE on every supported configuration.
              */
2272         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
2273
             /*
              * head: as long as the file offset is not device-block aligned, or
              * less than a full device block remains, hand the piece to
              * cluster_align_phys_io() instead of issuing it directly.
              * NOTE(review): the loop condition re-reads uio->uio_offset, so it
              * presumably relies on cluster_align_phys_io() advancing the uio --
              * verify against that routine.
              */
2274         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2275                 u_int32_t   head_size;
2276
                     /* bytes up to the next device-block boundary, clipped to what is left */
2277                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2278
2279                 if (head_size > io_size)
2280                         head_size = io_size;
2281
2282                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2283
2284                 if (error)
2285                         goto wait_for_cwrites;
2286
2287                 upl_offset += head_size;
2288                 src_paddr  += head_size;
2289                 io_size    -= head_size;
2290
                     /* local copy only: kept in step for the alignment test that follows */
2291                 iov_base   += head_size;
2292         }
2293         if ((u_int32_t)iov_base & mem_alignment_mask) {
2294                 /*
2295                  * request doesn't set up on a memory boundary
2296                  * the underlying DMA engine can handle...
2297                  * return an error instead of going through
2298                  * the slow copy path since the intent of this
2299                  * path is direct I/O from device memory
2300                  */
2301                 error = EINVAL;
2302                 goto wait_for_cwrites;
2303         }
2304
             /*
              * split off the trailing partial device block; it is written at
              * wait_for_cwrites, after all of the async writes have drained
              */
2305         tail_size = io_size & (devblocksize - 1);
2306         io_size  -= tail_size;
2307
2308         while (io_size && error == 0) {
2309
                     /* issue at most MAX_IO_CONTIG_SIZE bytes per cluster_io() call */
2310                 if (io_size > MAX_IO_CONTIG_SIZE)
2311                         xsize = MAX_IO_CONTIG_SIZE;
2312                 else
2313                         xsize = io_size;
2314                 /*
2315                  * request asynchronously so that we can overlap
2316                  * the preparation of the next I/O... we'll do
2317                  * the commit after all the I/O has completed
2318                  * since its all issued against the same UPL
2319                  * if there are already too many outstanding writes
2320                  * wait until some have completed before issuing the next
2321                  */
2322                 if (iostate.io_issued) {
2323                         lck_mtx_lock(cl_mtxp);
2324
                             /*
                              * throttle: sleep while more than 2 * MAX_IO_CONTIG_SIZE
                              * is outstanding (issued minus completed).
                              * NOTE(review): the wakeup on &iostate.io_wanted is
                              * presumably done by the I/O completion side -- it is
                              * not visible in this routine.
                              */
2325                         while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) {
2326
2327                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
2328                                                 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0);
2329
2330                                 iostate.io_wanted = 1;
2331                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL);
2332
2333                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
2334                                                 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0);
2335                         }
2336                         lck_mtx_unlock(cl_mtxp);
2337                 }
2338                 if (iostate.io_error) {
2339                         /*
2340                          * one of the earlier writes we issued ran into a hard error
2341                          * don't issue any more writes...
2342                          * go wait for all writes that are part of this stream
2343                          * to complete before returning the error to the caller
2344                          */
2345                         goto wait_for_cwrites;
2346                 }
2347                 /*
2348                  * issue an asynchronous write to cluster_io
2349                  */
2350                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2351                                    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2352
2353                 if (error == 0) {
2354                         /*
2355                          * cluster_io accepted the async write without error,
2356                          * update the uio structure and our running offsets
2357                          */
2358                         uio_update(uio, (user_size_t)xsize);
2359
2360                         upl_offset += xsize;
2361                         src_paddr  += xsize;
2362                         io_size    -= xsize;
2363                 }
                     /* on error nothing is advanced; the loop condition ends the loop */
2364         }
             /*
              * try to chain to the next vector only if everything so far went
              * cleanly, there is no tail still to be written for this one, and
              * upl[] has a free slot; otherwise tell the caller the I/O type
              * has to be re-evaluated
              */
2365         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
2366
2367                 error = cluster_io_type(uio, write_type, write_length, 0);
2368
2369                 if (error == 0 && *write_type == IO_CONTIG) {
2370                         cur_upl++;
2371                         goto next_cwrite;
2372                 }
2373         } else
2374                 *write_type = IO_UNKNOWN;
2375
             /*
              * common exit: reached by falling through and from every error path
              */
2376 wait_for_cwrites:
2377         /*
2378          * make sure all async writes that are part of this stream
2379          * have completed before we proceed
2380          */
2381         lck_mtx_lock(cl_mtxp);
2382
2383         while (iostate.io_issued != iostate.io_completed) {
2384                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
2385                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
2386
2387                 iostate.io_wanted = 1;
2388                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL);
2389
2390                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
2391                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
2392         }
2393         lck_mtx_unlock(cl_mtxp);
2394
             /* an error recorded by any async write replaces whatever 'error' held */
2395         if (iostate.io_error)
2396                 error = iostate.io_error;
2397
             /*
              * write the deferred partial device block; src_paddr has already
              * been advanced past the head and every piece issued above
              */
2398         if (error == 0 && tail_size)
2399                 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2400
2401         for (n = 0; n < num_upl; n++)
2402                 /*
2403                  * just release our hold on each physically contiguous
2404                  * region without changing any state
2405                  */
2406                 ubc_upl_abort(upl[n], 0);
2407
2408         return (error);
2409 }
2410
2411
2412 static int
2413 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
2414                    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2415 {
2416         upl_page_info_t *pl;
2417         upl_t            upl;
2418         vm_offset_t      upl_offset = 0;
2419         vm_size_t        upl_size;
2420         off_t            upl_f_offset;
2421         int              pages_in_upl;
2422         int              start_offset;
2423         int              xfer_resid;
2424         int              io_size;
2425         int              io_offset;
2426         int              bytes_to_zero;
2427         int              bytes_to_move;
2428         kern_return_t    kret;
2429         int              retval = 0;
2430         int              io_resid;
2431         long long        total_size;
2432         long long        zero_cnt;
2433         off_t            zero_off;
2434         long long        zero_cnt1;
2435         off_t            zero_off1;
2436         struct cl_extent cl;
2437         struct cl_writebehind *wbp;
2438         int              bflag;
2439         u_int            max_cluster_pgcount;
2440         u_int            max_io_size;
2441
2442         if (flags & IO_PASSIVE)
2443                 bflag = CL_PASSIVE;
2444         else
2445                 bflag = 0;
2446
2447         if (uio) {
2448                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2449                              (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
2450
2451                 io_resid = io_req_size;
2452         } else {
2453                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2454                              0, 0, (int)oldEOF, (int)newEOF, 0);
2455
2456                 io_resid = 0;
2457         }
2458         zero_cnt  = 0;
2459         zero_cnt1 = 0;
2460         zero_off  = 0;
2461         zero_off1 = 0;
2462
2463         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
2464         max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2465
2466         if (flags & IO_HEADZEROFILL) {
2467                 /*
2468                  * some filesystems (HFS is one) don't support unallocated holes within a file...
2469                  * so we zero fill the intervening space between the old EOF and the offset
2470                  * where the next chunk of real data begins.... ftruncate will also use this
2471                  * routine to zero fill to the new EOF when growing a file... in this case, the
2472                  * uio structure will not be provided
2473                  */
2474                 if (uio) {
2475                         if (headOff < uio->uio_offset) {
2476                                 zero_cnt = uio->uio_offset - headOff;
2477                                 zero_off = headOff;
2478                         }
2479                 } else if (headOff < newEOF) {
2480                         zero_cnt = newEOF - headOff;
2481                         zero_off = headOff;
2482                 }
2483         }
2484         if (flags & IO_TAILZEROFILL) {
2485                 if (uio) {
2486                         zero_off1 = uio->uio_offset + io_req_size;
2487
2488                         if (zero_off1 < tailOff)
2489                                 zero_cnt1 = tailOff - zero_off1;
2490                 }
2491         }
2492         if (zero_cnt == 0 && uio == (struct uio *) 0) {
2493                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2494                              retval, 0, 0, 0, 0);
2495                 return (0);
2496         }
2497
2498         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
2499                 /*
2500                  * for this iteration of the loop, figure out where our starting point is
2501                  */
2502                 if (zero_cnt) {
2503                         start_offset = (int)(zero_off & PAGE_MASK_64);
2504                         upl_f_offset = zero_off - start_offset;
2505                 } else if (io_resid) {
2506                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2507                         upl_f_offset = uio->uio_offset - start_offset;
2508                 } else {
2509                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
2510                         upl_f_offset = zero_off1 - start_offset;
2511                 }
2512                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
2513                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
2514
2515                 if (total_size > max_io_size)
2516                         total_size = max_io_size;
2517
2518                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2519
2520                 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
2521                         /*
2522                          * assumption... total_size <= io_resid
2523                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
2524                          */
2525                         if ((start_offset + total_size) > max_io_size)
2526                                 total_size -= start_offset;
2527                         xfer_resid = total_size;
2528
2529                         retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
2530
2531                         if (retval)
2532                                 break;
2533
2534                         io_resid    -= (total_size - xfer_resid);
2535                         total_size   = xfer_resid;
2536                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2537                         upl_f_offset = uio->uio_offset - start_offset;
2538
2539                         if (total_size == 0) {
2540                                 if (start_offset) {
2541                                         /*
2542                                          * the write did not finish on a page boundary
2543                                          * which will leave upl_f_offset pointing to the
2544                                          * beginning of the last page written instead of
2545                                          * the page beyond it... bump it in this case
2546                                          * so that the cluster code records the last page
2547                                          * written as dirty
2548                                          */
2549                                         upl_f_offset += PAGE_SIZE_64;
2550                                 }
2551                                 upl_size = 0;
2552
2553                                 goto check_cluster;
2554                         }
2555                 }
2556                 /*
2557                  * compute the size of the upl needed to encompass
2558                  * the requested write... limit each call to cluster_io
2559                  * to the maximum UPL size... cluster_io will clip if
2560                  * this exceeds the maximum io_size for the device,
2561                  * make sure to account for
2562                  * a starting offset that's not page aligned
2563                  */
2564                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2565
2566                 if (upl_size > max_io_size)
2567                         upl_size = max_io_size;
2568
2569                 pages_in_upl = upl_size / PAGE_SIZE;
2570                 io_size      = upl_size - start_offset;
2571
2572                 if ((long long)io_size > total_size)
2573                         io_size = total_size;
2574
2575                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2576
2577
2578                 /*
2579                  * Gather the pages from the buffer cache.
2580                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2581                  * that we intend to modify these pages.
2582                  */
2583                 kret = ubc_create_upl(vp,
2584                                       upl_f_offset,
2585                                       upl_size,
2586                                       &upl,
2587                                       &pl,
2588                                       UPL_SET_LITE | UPL_WILL_MODIFY);
2589                 if (kret != KERN_SUCCESS)
2590                         panic("cluster_write_copy: failed to get pagelist");
2591
2592                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2593                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2594
2595                 if (start_offset && !upl_valid_page(pl, 0)) {
2596                         int   read_size;
2597
2598                         /*
2599                          * we're starting in the middle of the first page of the upl
2600                          * and the page isn't currently valid, so we're going to have
2601                          * to read it in first... this is a synchronous operation
2602                          */
2603                         read_size = PAGE_SIZE;
2604
2605                         if ((upl_f_offset + read_size) > newEOF)
2606                                 read_size = newEOF - upl_f_offset;
2607
2608                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2609                                             CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2610                         if (retval) {
2611                                 /*
2612                                  * we had an error during the read which causes us to abort
2613                                  * the current cluster_write request... before we do, we need
2614                                  * to release the rest of the pages in the upl without modifying
2615                                  * there state and mark the failed page in error
2616                                  */
2617                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
2618
2619                                 if (upl_size > PAGE_SIZE)
2620                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2621
2622                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2623                                              (int)upl, 0, 0, retval, 0);
2624                                 break;
2625                         }
2626                 }
2627                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2628                         /*
2629                          * the last offset we're writing to in this upl does not end on a page
2630                          * boundary... if it's not beyond the old EOF, then we'll also need to
2631                          * pre-read this page in if it isn't already valid
2632                          */
2633                         upl_offset = upl_size - PAGE_SIZE;
2634
2635                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2636                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2637                                 int   read_size;
2638
2639                                 read_size = PAGE_SIZE;
2640
2641                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2642                                         read_size = newEOF - (upl_f_offset + upl_offset);
2643
2644                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2645                                                     CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2646                                 if (retval) {
2647                                         /*
2648                                          * we had an error during the read which causes us to abort
2649                                          * the current cluster_write request... before we do, we
2650                                          * need to release the rest of the pages in the upl without
2651                                          * modifying there state and mark the failed page in error
2652                                          */
2653                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
2654
2655                                         if (upl_size > PAGE_SIZE)
2656                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2657
2658                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2659                                                      (int)upl, 0, 0, retval, 0);
2660                                         break;
2661                                 }
2662                         }
2663                 }
2664                 xfer_resid = io_size;
2665                 io_offset = start_offset;
2666
2667                 while (zero_cnt && xfer_resid) {
2668
2669                         if (zero_cnt < (long long)xfer_resid)
2670                                 bytes_to_zero = zero_cnt;
2671                         else
2672                                 bytes_to_zero = xfer_resid;
2673
2674                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2675                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2676                         } else {
2677                                 int zero_pg_index;
2678
2679                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2680                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2681
2682                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2683                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2684
2685                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2686                                            !upl_dirty_page(pl, zero_pg_index)) {
2687                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2688                                 }
2689                         }
2690                         xfer_resid -= bytes_to_zero;
2691                         zero_cnt   -= bytes_to_zero;
2692                         zero_off   += bytes_to_zero;
2693                         io_offset  += bytes_to_zero;
2694                 }
2695                 if (xfer_resid && io_resid) {
2696                         u_int32_t  io_requested;
2697
2698                         bytes_to_move = min(io_resid, xfer_resid);
2699                         io_requested = bytes_to_move;
2700
2701                         retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
2702
2703                         if (retval) {
2704
2705                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2706
2707                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2708                                              (int)upl, 0, 0, retval, 0);
2709                         } else {
2710                                 io_resid   -= bytes_to_move;
2711                                 xfer_resid -= bytes_to_move;
2712                                 io_offset  += bytes_to_move;
2713                         }
2714                 }
2715                 while (xfer_resid && zero_cnt1 && retval == 0) {
2716
2717                         if (zero_cnt1 < (long long)xfer_resid)
2718                                 bytes_to_zero = zero_cnt1;
2719                         else
2720                                 bytes_to_zero = xfer_resid;
2721
2722                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2723                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2724                         } else {
2725                                 int zero_pg_index;
2726
2727                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2728                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2729
2730                                 if ( !upl_valid_page(pl, zero_pg_index)) {
2731                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2732                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2733                                            !upl_dirty_page(pl, zero_pg_index)) {
2734                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2735                                 }
2736                         }
2737                         xfer_resid -= bytes_to_zero;
2738                         zero_cnt1  -= bytes_to_zero;
2739                         zero_off1  += bytes_to_zero;
2740                         io_offset  += bytes_to_zero;
2741                 }
2742
2743                 if (retval == 0) {
2744                         int cl_index;
2745                         int ret_cluster_try_push;
2746
2747                         io_size += start_offset;
2748
2749                         if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
2750                                 /*
2751                                  * if we're extending the file with this write
2752                                  * we'll zero fill the rest of the page so that
2753                                  * if the file gets extended again in such a way as to leave a
2754                                  * hole starting at this EOF, we'll have zero's in the correct spot
2755                                  */
2756                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2757                         }
2758                         /*
2759                          * release the upl now if we hold one since...
2760                          * 1) pages in it may be present in the sparse cluster map
2761                          *    and may span 2 separate buckets there... if they do and
2762                          *    we happen to have to flush a bucket to make room and it intersects
2763                          *    this upl, a deadlock may result on page BUSY
2764                          * 2) we're delaying the I/O... from this point forward we're just updating
2765                          *    the cluster state... no need to hold the pages, so commit them
2766                          * 3) IO_SYNC is set...
2767                          *    because we had to ask for a UPL that provides currently non-present pages, the
2768                          *    UPL has been automatically set to clear the dirty flags (both software and hardware)
2769                          *    upon committing it... this is not the behavior we want since it's possible for
2770                          *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2771                          *    we'll pick these pages back up later with the correct behavior specified.
2772                          * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
2773                          *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
2774                          *    we hold since the flushing context is holding the cluster lock.
2775                          */
2776                         ubc_upl_commit_range(upl, 0, upl_size,
2777                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2778 check_cluster:
2779                         /*
2780                          * calculate the last logical block number
2781                          * that this delayed I/O encompassed
2782                          */
2783                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2784
2785                         if (flags & IO_SYNC)
2786                                 /*
2787                                  * if the IO_SYNC flag is set then we need to
2788                                  * bypass any clusters and immediately issue
2789                                  * the I/O
2790                                  */
2791                                 goto issue_io;
2792
2793                         /*
2794                          * take the lock to protect our accesses
2795                          * of the writebehind and sparse cluster state
2796                          */
2797                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2798
2799                         if (wbp->cl_scmap) {
2800
2801                                 if ( !(flags & IO_NOCACHE)) {
2802                                         /*
2803                                          * we've fallen into the sparse
2804                                          * cluster method of delaying dirty pages
2805                                          */
2806                                         sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg);
2807
2808                                         lck_mtx_unlock(&wbp->cl_lockw);
2809
2810                                         continue;
2811                                 }
2812                                 /*
2813                                  * must have done cached writes that fell into
2814                                  * the sparse cluster mechanism... we've switched
2815                                  * to uncached writes on the file, so go ahead
2816                                  * and push whatever's in the sparse map
2817                                  * and switch back to normal clustering
2818                                  */
2819                                 wbp->cl_number = 0;
2820
2821                                 sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg);
2822                                 /*
2823                                  * no clusters of either type present at this point
2824                                  * so just go directly to start_new_cluster since
2825                                  * we know we need to delay this I/O since we've
2826                                  * already released the pages back into the cache
2827                                  * to avoid the deadlock with sparse_cluster_push
2828                                  */
2829                                 goto start_new_cluster;
2830                         }
2831                         if (wbp->cl_number == 0)
2832                                 /*
2833                                  * no clusters currently present
2834                                  */
2835                                 goto start_new_cluster;
2836
2837                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2838                                 /*
2839                                  * check each cluster that we currently hold
2840                                  * try to merge some or all of this write into
2841                                  * one or more of the existing clusters... if
2842                                  * any portion of the write remains, start a
2843                                  * new cluster
2844                                  */
2845                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2846                                         /*
2847                                          * the current write starts at or after the current cluster
2848                                          */
2849                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
2850                                                 /*
2851                                                  * we have a write that fits entirely
2852                                                  * within the existing cluster limits
2853                                                  */
2854                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2855                                                         /*
2856                                                          * update our idea of where the cluster ends
2857                                                          */
2858                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2859                                                 break;
2860                                         }
2861                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
2862                                                 /*
2863                                                  * we have a write that starts in the middle of the current cluster
2864                                                  * but extends beyond the cluster's limit... we know this because
2865                                                  * of the previous checks
2866                                                  * we'll extend the current cluster to the max
2867                                                  * and update the b_addr for the current write to reflect that
2868                                                  * the head of it was absorbed into this cluster...
2869                                                  * note that we'll always have a leftover tail in this case since
2870                                                  * full absorbtion would have occurred in the clause above
2871                                                  */
2872                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
2873
2874                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2875                                         }
2876                                         /*
2877                                          * we come here for the case where the current write starts
2878                                          * beyond the limit of the existing cluster or we have a leftover
2879                                          * tail after a partial absorbtion
2880                                          *
2881                                          * in either case, we'll check the remaining clusters before
2882                                          * starting a new one
2883                                          */
2884                                 } else {
2885                                         /*
2886                                          * the current write starts in front of the cluster we're currently considering
2887                                          */
2888                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
2889                                                 /*
2890                                                  * we can just merge the new request into
2891                                                  * this cluster and leave it in the cache
2892                                                  * since the resulting cluster is still
2893                                                  * less than the maximum allowable size
2894                                                  */
2895                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2896
2897                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2898                                                         /*
2899                                                          * the current write completely
2900                                                          * envelops the existing cluster and since
2901                                                          * each write is limited to at most max_cluster_pgcount pages
2902                                                          * we can just use the start and last blocknos of the write
2903                                                          * to generate the cluster limits
2904                                                          */
2905                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2906                                                 }
2907                                                 break;
2908                                         }
2909
2910                                         /*
2911                                          * if we were to combine this write with the current cluster
2912                                          * we would exceed the cluster size limit.... so,
2913                                          * let's see if there's any overlap of the new I/O with
2914                                          * the cluster we're currently considering... in fact, we'll
2915                                          * stretch the cluster out to it's full limit and see if we
2916                                          * get an intersection with the current write
2917                                          *
2918                                          */
2919                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
2920                                                 /*
2921                                                  * the current write extends into the proposed cluster
2922                                                  * clip the length of the current write after first combining it's
2923                                                  * tail with the newly shaped cluster
2924                                                  */
2925                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
2926
2927                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2928                                         }
2929                                         /*
2930                                          * if we get here, there was no way to merge
2931                                          * any portion of this write with this cluster
2932                                          * or we could only merge part of it which
2933                                          * will leave a tail...
2934                                          * we'll check the remaining clusters before starting a new one
2935                                          */
2936                                 }
2937                         }
2938                         if (cl_index < wbp->cl_number)
2939                                 /*
2940                                  * we found an existing cluster(s) that we
2941                                  * could entirely merge this I/O into
2942                                  */
2943                                 goto delay_io;
2944
2945                         if (wbp->cl_number < MAX_CLUSTERS)
2946                                 /*
2947                                  * we didn't find an existing cluster to
2948                                  * merge into, but there's room to start
2949                                  * a new one
2950                                  */
2951                                 goto start_new_cluster;
2952
2953                         /*
2954                          * no existing cluster to merge with and no
2955                          * room to start a new one... we'll try
2956                          * pushing one of the existing ones... if none of
2957                          * them are able to be pushed, we'll switch
2958                          * to the sparse cluster mechanism
2959                          * cluster_try_push updates cl_number to the
2960                          * number of remaining clusters... and
2961                          * returns the number of currently unused clusters
2962                          */
2963                         ret_cluster_try_push = 0;
2964
2965                         /*
2966                          * if writes are not deferred, call cluster push immediately
2967                          */
2968                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2969
2970                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg);
2971                         }
2972
2973                         /*
2974                          * execute following regardless of writes being deferred or not
2975                          */
2976                         if (ret_cluster_try_push == 0) {
2977                                 /*
2978                                  * no more room in the normal cluster mechanism
2979                                  * so let's switch to the more expansive but expensive
2980                                  * sparse mechanism....
2981                                  */
2982                                 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
2983                                 sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg);
2984
2985                                 lck_mtx_unlock(&wbp->cl_lockw);
2986
2987                                 continue;
2988                         }
2989                         /*
2990                          * we pushed one cluster successfully, so we must be sequentially writing this file
2991                          * otherwise, we would have failed and fallen into the sparse cluster support
2992                          * so let's take the opportunity to push out additional clusters...
2993                          * this will give us better I/O locality if we're in a copy loop
2994                          * (i.e.  we won't jump back and forth between the read and write points
2995                          */
2996                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2997                                 while (wbp->cl_number)
2998                                         cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg);
2999                         }
3000
3001 start_new_cluster:
3002                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
3003                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
3004
3005                         wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3006
3007                         if (flags & IO_NOCACHE)
3008                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3009
3010                         if (bflag & CL_PASSIVE)
3011                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3012
3013                         wbp->cl_number++;
3014 delay_io:
3015                         lck_mtx_unlock(&wbp->cl_lockw);
3016
3017                         continue;
3018 issue_io:
3019                         /*
3020                          * we don't hold the lock at this point
3021                          *
3022                          * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3023                          * so that we correctly deal with a change in state of the hardware modify bit...
3024                          * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3025                          * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3026                          * responsible for generating the correct sized I/O(s)
3027                          */
3028                         retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
3029                 }
3030         }
3031         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3032
3033         return (retval);
3034 }
3035
3036 /* cluster_read: cluster_read_ext with no callback... NULL goes in for both callback
3037  * and callback_arg, and whatever cluster_read_ext returns is handed back unchanged */
3038 int
3039 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3040 {
3041         return (cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL));
3042 }
3043
3044 /*
3045  * cluster_read_ext: top level of the cluster read path
3046  *
3047  * VNOCACHE_DATA and VRAOFF on the vnode (or speculative_reads_disabled) are folded
3048  * into the caller's flags as IO_NOCACHE / IO_RAOFF... the uio is then fed to
3049  * cluster_read_copy, cluster_read_direct or cluster_read_contig according to the
3050  * current io type until it has no residual left, uio_offset reaches filesize or
3051  * a non-zero status comes back... that status (0 if none) is the return value
3052  * callback and callback_arg are passed along to those routines as given
3053  */
3054 int
3055 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3056 {
3057         user_ssize_t    resid;
3058         u_int32_t       chunk;
3059         u_int32_t       len_hint = 0;
3060         int             io_type  = IO_COPY;
3061         int             flags    = xflags;
3062         int             error    = 0;
3063
3064         if (vp->v_flag & VNOCACHE_DATA)
3065                 flags |= IO_NOCACHE;
3066         if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
3067                 flags |= IO_RAOFF;
3068
3069         /*
3070          * only an uncached read aimed at user space is a candidate for the direct
3071          * or contig variants... cluster_io_type decides based on the first vector,
3072          * every other request stays on the IO_COPY path through the cache
3073          */
3074         if ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
3075                 error = cluster_io_type(uio, &io_type, &len_hint, 0);
3076
3077         for (;;) {
3078                 resid = uio_resid(uio);
3079
3080                 if (resid == 0 || uio->uio_offset >= filesize || error != 0)
3081                         break;
3082                 /* NOTE(review): no default case... an io type outside these four would loop without progress - confirm none can occur */
3083                 switch (io_type) {
3084                 case IO_COPY:
3085                         /* hand over at most MAX_IO_REQUEST_SIZE per pass so the size fits in a 32 bit int */
3086                         chunk = (u_int32_t)resid;
3087                         if (resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
3088                                 chunk = MAX_IO_REQUEST_SIZE;
3089                         error = cluster_read_copy(vp, uio, chunk, filesize, flags, callback, callback_arg);
3090                         break;
3091                 case IO_DIRECT:
3092                         error = cluster_read_direct(vp, uio, filesize, &io_type, &len_hint, flags, callback, callback_arg);
3093                         break;
3094                 case IO_CONTIG:
3095                         error = cluster_read_contig(vp, uio, filesize, &io_type, &len_hint, callback, callback_arg, flags);
3096                         break;
3097                 case IO_UNKNOWN:
3098                         /* type still undecided... have cluster_io_type look at the uio again */
3099                         error = cluster_io_type(uio, &io_type, &len_hint, 0);
3100                         break;
3101                 }
3102         }
3103         return (error);
3104 }
3105
3106 /* abort pages start_pg up to (not including) last_pg of upl... UPL_ABORT_REFERENCE
3107  * is added unless IO_NOCACHE is set in flags... an empty range is left untouched */
3108 static void
3109 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags)
3110 {
3111         int pg_count = last_pg - start_pg;
3112         int how      = UPL_ABORT_FREE_ON_EMPTY;
3113
3114         if (pg_count == 0)
3115                 return;
3116
3117         if ((flags & IO_NOCACHE) == 0)
3118                 how |= UPL_ABORT_REFERENCE;
3119         ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, pg_count * PAGE_SIZE, how);
3120 }
3121
3122
3123 static int
3124 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3125 {
3126         upl_page_info_t *pl;
3127         upl_t            upl;
3128         vm_offset_t      upl_offset;
3129         u_int32_t        upl_size;
3130         off_t            upl_f_offset;
3131         int              start_offset;
3132         int              start_pg;
3133         int              last_pg;
3134         int              uio_last = 0;
3135         int              pages_in_upl;
3136         off_t            max_size;
3137         off_t            last_ioread_offset;
3138         off_t            last_request_offset;
3139         kern_return_t    kret;
3140         int              error  = 0;
3141         int              retval = 0;
3142         u_int32_t        size_of_prefetch;
3143         u_int32_t        xsize;
3144         u_int32_t        io_size;
3145         u_int32_t        max_rd_size;
3146         u_int32_t        max_io_size;
3147         u_int32_t        max_prefetch;
3148         u_int            rd_ahead_enabled = 1;
3149         u_int            prefetch_enabled = 1;
3150         struct cl_readahead *   rap;
3151         struct clios            iostate;
3152         struct cl_extent        extent;
3153         int              bflag;
3154         int              take_reference = 1;
3155         struct uthread  *ut;
3156         int              policy = IOPOL_DEFAULT;
3157
3158         policy = current_proc()->p_iopol_disk;
3159
3160         ut = get_bsdthread_info(current_thread());
3161
3162         if (ut->uu_iopol_disk != IOPOL_DEFAULT)
3163                 policy = ut->uu_iopol_disk;
3164
3165         if (policy == IOPOL_THROTTLE)
3166                 take_reference = 0;
3167
3168         if (flags & IO_PASSIVE)
3169                 bflag = CL_PASSIVE;
3170         else
3171                 bflag = 0;
3172
3173         max_prefetch = MAX_PREFETCH(vp);
3174         max_rd_size = max_prefetch;
3175         max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3176
3177         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3178                      (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3179
3180         last_request_offset = uio->uio_offset + io_req_size;
3181
3182         if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3183                 rd_ahead_enabled = 0;
3184                 rap = NULL;
3185         } else {
3186                 if (cluster_hard_throttle_on(vp)) {
3187                         rd_ahead_enabled = 0;
3188                         prefetch_enabled = 0;
3189
3190                         max_rd_size = HARD_THROTTLE_MAXSIZE;
3191                 }
3192                 if ((rap = cluster_get_rap(vp)) == NULL)
3193                         rd_ahead_enabled = 0;
3194         }
3195         if (last_request_offset > filesize)
3196                 last_request_offset = filesize;
3197         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
3198         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
3199
3200         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
3201                 /*
3202                  * determine if we already have a read-ahead in the pipe courtesy of the
3203                  * last read systemcall that was issued...
3204                  * if so, pick up its extent to determine where we should start
3205                  * with respect to any read-ahead that might be necessary to
3206                  * garner all the data needed to complete this read systemcall
3207                  */
3208                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
3209
3210                 if (last_ioread_offset < uio->uio_offset)
3211                         last_ioread_offset = (off_t)0;
3212                 else if (last_ioread_offset > last_request_offset)
3213                         last_ioread_offset = last_request_offset;
3214         } else
3215                 last_ioread_offset = (off_t)0;
3216
3217         while (io_req_size && uio->uio_offset < filesize && retval == 0) {
3218                 /*
3219                  * compute the size of the upl needed to encompass
3220                  * the requested read... limit each call to cluster_io
3221                  * to the maximum UPL size... cluster_io will clip if
3222                  * this exceeds the maximum io_size for the device,
3223                  * make sure to account for
3224                  * a starting offset that's not page aligned
3225                  */
3226                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3227                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
3228                 max_size     = filesize - uio->uio_offset;
3229
3230                 if ((off_t)(io_req_size) < max_size)
3231                         io_size = io_req_size;
3232                 else
3233                         io_size = max_size;
3234
3235                 if (!(flags & IO_NOCACHE)) {
3236
3237                         while (io_size) {
3238                                 u_int32_t io_resid;
3239                                 u_int32_t io_requested;
3240
3241                                 /*
3242                                  * if we keep finding the pages we need already in the cache, then
3243                                  * don't bother to call cluster_read_prefetch since it costs CPU cycles
3244                                  * to determine that we have all the pages we need... once we miss in
3245                                  * the cache and have issued an I/O, then we'll assume that we're likely
3246                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
3247                                  */
3248                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
3249                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
3250                                                 /*
3251                                                  * we've already issued I/O for this request and
3252                                                  * there's still work to do and
3253                                                  * our prefetch stream is running dry, so issue a
3254                                                  * pre-fetch I/O... the I/O latency will overlap
3255                                                  * with the copying of the data
3256                                                  */
3257                                                 if (size_of_prefetch > max_rd_size)
3258                                                         size_of_prefetch = max_rd_size;
3259
3260                                                 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3261
3262                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3263
3264                                                 if (last_ioread_offset > last_request_offset)
3265                                                         last_ioread_offset = last_request_offset;
3266                                         }
3267                                 }
3268                                 /*
3269                                  * limit the size of the copy we're about to do so that
3270                                  * we can notice that our I/O pipe is running dry and
3271                                  * get the next I/O issued before it does go dry
3272                                  */
3273                                 if (last_ioread_offset && io_size > (max_io_size / 4))
3274                                         io_resid = (max_io_size / 4);
3275                                 else
3276                                         io_resid = io_size;
3277
3278                                 io_requested = io_resid;
3279
3280                                 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
3281
3282                                 xsize = io_requested - io_resid;
3283
3284                                 io_size -= xsize;
3285                                 io_req_size -= xsize;
3286
3287                                 if (retval || io_resid)
3288                                         /*
3289                                          * if we run into a real error or
3290                                          * a page that is not in the cache
3291                                          * we need to leave streaming mode
3292                                          */
3293                                         break;
3294
3295                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
3296                                         /*
3297                                          * we've already finished the I/O for this read request
3298                                          * let's see if we should do a read-ahead
3299                                          */
3300                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3301                                 }
3302                         }
3303                         if (retval)
3304                                 break;
3305                         if (io_size == 0) {
3306                                 if (rap != NULL) {
3307                                         if (extent.e_addr < rap->cl_lastr)
3308                                                 rap->cl_maxra = 0;
3309                                         rap->cl_lastr = extent.e_addr;
3310                                 }
3311                                 break;
3312                         }
3313                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3314                         upl_f_offset = uio->uio_offset - (off_t)start_offset;
3315                         max_size     = filesize - uio->uio_offset;
3316                 }
3317                 if (io_size > max_rd_size)
3318                         io_size = max_rd_size;
3319
3320                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3321
3322                 if (flags & IO_NOCACHE) {
3323                         if (upl_size > max_io_size)
3324                                 upl_size = max_io_size;
3325                 } else {
3326                         if (upl_size > max_io_size / 4)
3327                                 upl_size = max_io_size / 4;
3328                 }
3329                 pages_in_upl = upl_size / PAGE_SIZE;
3330
3331                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
3332                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3333
3334                 kret = ubc_create_upl(vp,
3335                                       upl_f_offset,
3336                                       upl_size,
3337                                       &upl,
3338                                       &pl,
3339                                       UPL_FILE_IO | UPL_SET_LITE);
3340                 if (kret != KERN_SUCCESS)
3341                         panic("cluster_read_copy: failed to get pagelist");
3342
3343                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
3344                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3345
3346                 /*
3347                  * scan from the beginning of the upl looking for the first
3348                  * non-valid page.... this will become the first page in
3349                  * the request we're going to make to 'cluster_io'... if all
3350                  * of the pages are valid, we won't call through to 'cluster_io'
3351                  */
3352                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3353                         if (!upl_valid_page(pl, start_pg))
3354                                 break;
3355                 }
3356
3357                 /*
3358                  * scan from the starting invalid page looking for a valid
3359                  * page before the end of the upl is reached, if we
3360                  * find one, then it will be the last page of the request to
3361                  * 'cluster_io'
3362                  */
3363                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3364                         if (upl_valid_page(pl, last_pg))
3365                                 break;
3366                 }
3367                 iostate.io_completed = 0;
3368                 iostate.io_issued = 0;
3369                 iostate.io_error = 0;
3370                 iostate.io_wanted = 0;
3371
3372                 if (start_pg < last_pg) {
3373                         /*
3374                          * we found a range of 'invalid' pages that must be filled
3375                          * if the last page in this range is the last page of the file
3376                          * we may have to clip the size of it to keep from reading past
3377                          * the end of the last physical block associated with the file
3378                          */
3379                         upl_offset = start_pg * PAGE_SIZE;
3380                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
3381
3382                         if ((upl_f_offset + upl_offset + io_size) > filesize)
3383                                 io_size = filesize - (upl_f_offset + upl_offset);
3384
3385                         /*
3386                          * issue an asynchronous read to cluster_io
3387                          */
3388
3389                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
3390                                            io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
3391                 }
3392                 if (error == 0) {
3393                         /*
3394                          * if the read completed successfully, or there was no I/O request
3395                          * issued, then copy the data into user land via 'cluster_copy_upl_data'
3396                          * we'll first add on any 'valid'
3397                          * pages that were present in the upl when we acquired it.
3398                          */
3399                         u_int  val_size;
3400
3401                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
3402                                 if (!upl_valid_page(pl, uio_last))
3403                                         break;
3404                         }
3405                         if (uio_last < pages_in_upl) {
3406                                 /*
3407                                  * there were some invalid pages beyond the valid pages
3408                                  * that we didn't issue an I/O for, just release them
3409                                  * unchanged now, so that any prefetch/readahead can
3410                                  * include them
3411                                  */
3412                                 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3413                                                     (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3414                         }
3415
3416                         /*
3417                          * compute size to transfer this round,  if io_req_size is
3418                          * still non-zero after this attempt, we'll loop around and
3419                          * set up for another I/O.
3420                          */
3421                         val_size = (uio_last * PAGE_SIZE) - start_offset;
3422
3423                         if (val_size > max_size)
3424                                 val_size = max_size;
3425
3426                         if (val_size > io_req_size)
3427                                 val_size = io_req_size;
3428
3429                         if ((uio->uio_offset + val_size) > last_ioread_offset)
3430                                 last_ioread_offset = uio->uio_offset + val_size;
3431
3432                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
3433
3434                                 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
3435                                         /*
3436                                          * if there's still I/O left to do for this request, and...
3437                                          * we're not in hard throttle mode, and...
3438                                          * we're close to using up the previous prefetch, then issue a
3439                                          * new pre-fetch I/O... the I/O latency will overlap
3440                                          * with the copying of the data
3441                                          */
3442                                         if (size_of_prefetch > max_rd_size)
3443                                                 size_of_prefetch = max_rd_size;
3444
3445                                         size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3446
3447                                         last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3448
3449                                         if (last_ioread_offset > last_request_offset)
3450                                                 last_ioread_offset = last_request_offset;
3451                                 }
3452
3453                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
3454                                 /*
3455                                  * this transfer will finish this request, so...
3456                                  * let's try to read ahead if we're in
3457                                  * a sequential access pattern and we haven't
3458                                  * explicitly disabled it
3459                                  */
3460                                 if (rd_ahead_enabled)
3461                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3462
3463                                 if (rap != NULL) {
3464                                         if (extent.e_addr < rap->cl_lastr)
3465                                                 rap->cl_maxra = 0;
3466                                         rap->cl_lastr = extent.e_addr;
3467                                 }
3468                         }
3469                         lck_mtx_lock(cl_mtxp);
3470
3471                         while (iostate.io_issued != iostate.io_completed) {
3472                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
3473                                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
3474
3475                                 iostate.io_wanted = 1;
3476                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL);
3477
3478                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
3479                                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
3480                         }
3481                         lck_mtx_unlock(cl_mtxp);
3482
3483                         if (iostate.io_error)
3484                                 error = iostate.io_error;
3485                         else {
3486                                 u_int32_t io_requested;
3487
3488                                 io_requested = val_size;
3489
3490                                 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
3491
3492                                 io_req_size -= (val_size - io_requested);
3493                         }
3494                 }
3495                 if (start_pg < last_pg) {
3496                         /*
3497                          * compute the range of pages that we actually issued an I/O for
3498                          * and either commit them as valid if the I/O succeeded
3499                          * or abort them if the I/O failed or we're not supposed to
3500                          * keep them in the cache
3501                          */
3502                         io_size = (last_pg - start_pg) * PAGE_SIZE;
3503
3504                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3505
3506                         if (error || (flags & IO_NOCACHE))
3507                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3508                                                     UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3509                         else
3510                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3511                                                      UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE);
3512
3513                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3514                 }
3515                 if ((last_pg - start_pg) < pages_in_upl) {
3516                         /*
3517                          * the set of pages that we issued an I/O for did not encompass
3518                          * the entire upl... so just release these without modifying
3519                          * their state
3520                          */
3521                         if (error)
3522                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3523                         else {
3524
3525                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3526                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3527
3528                                 /*
3529                                  * handle any valid pages at the beginning of
3530                                  * the upl... release these appropriately
3531                                  */
3532                                 cluster_read_upl_release(upl, 0, start_pg, flags);
3533
3534                                 /*
3535                                  * handle any valid pages immediately after the
3536                                  * pages we issued I/O for... release these appropriately
3537                                  */
3538                                 cluster_read_upl_release(upl, last_pg, uio_last, flags);
3539
3540                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, -1, -1, 0, 0);
3541                         }
3542                 }
3543                 if (retval == 0)
3544                         retval = error;
3545
3546                 if (io_req_size) {
3547                         if (cluster_hard_throttle_on(vp)) {
3548                                 rd_ahead_enabled = 0;
3549                                 prefetch_enabled = 0;
3550
3551                                 max_rd_size = HARD_THROTTLE_MAXSIZE;
3552                         } else {
3553                                 if (max_rd_size == HARD_THROTTLE_MAXSIZE) {
3554                                         /*
3555                                          * coming out of throttled state
3556                                          */
3557                                         if (rap != NULL)
3558                                                 rd_ahead_enabled = 1;
3559                                         prefetch_enabled = 1;
3560
3561                                         max_rd_size = max_prefetch;
3562                                         last_ioread_offset = 0;
3563                                 }
3564                         }
3565                 }
3566         }
3567         if (rap != NULL) {
3568                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3569                              (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
3570
3571                 lck_mtx_unlock(&rap->cl_lockr);
3572         } else {
3573                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3574                              (int)uio->uio_offset, io_req_size, 0, retval, 0);
3575         }
3576
3577         return (retval);
3578 }
3579
3580
3581 static int
3582 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
3583                     int flags, int (*callback)(buf_t, void *), void *callback_arg)
3584 {
3585         upl_t            upl;
3586         upl_page_info_t  *pl;
3587         off_t            max_io_size;
3588         vm_offset_t      upl_offset;
3589         vm_size_t        upl_size;
3590         vm_size_t        upl_needed_size;
3591         unsigned int     pages_in_pl;
3592         int              upl_flags;
3593         int              bflag;
3594         kern_return_t    kret;
3595         unsigned int     i;
3596         int              force_data_sync;
3597         int              retval = 0;
3598         int              no_zero_fill = 0;
3599         int              abort_flag = 0;
3600         int              io_flag = 0;
3601         int              misaligned = 0;
3602         struct clios     iostate;
3603         user_addr_t      iov_base;
3604         u_int32_t        io_req_size;
3605         u_int32_t        offset_in_file;
3606         u_int32_t        offset_in_iovbase;
3607         u_int32_t        io_size;
3608         u_int32_t        io_min;
3609         u_int32_t        xsize;
3610         u_int32_t        devblocksize;
3611         u_int32_t        mem_alignment_mask;
3612         u_int32_t        max_upl_size;
3613         u_int32_t        max_rd_size;
3614         u_int32_t        max_rd_ahead;
3615
3616
3617         max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
3618
3619         max_rd_size = max_upl_size;
3620         max_rd_ahead = max_rd_size * 2;
3621
3622
3623         if (flags & IO_PASSIVE)
3624                 bflag = CL_PASSIVE;
3625         else
3626                 bflag = 0;
3627
3628         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3629                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
3630
3631         iostate.io_completed = 0;
3632         iostate.io_issued = 0;
3633         iostate.io_error = 0;
3634         iostate.io_wanted = 0;
3635
3636         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3637         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3638
3639         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
3640                      (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
3641
3642         if (devblocksize == 1) {
3643                /*
3644                 * the AFP client advertises a devblocksize of 1
3645                 * however, its BLOCKMAP routine maps to physical
3646                 * blocks that are PAGE_SIZE in size...
3647                 * therefore we can't ask for I/Os that aren't page aligned
3648                 * or aren't multiples of PAGE_SIZE in size
3649                 * by setting devblocksize to PAGE_SIZE, we re-instate
3650                 * the old behavior we had before the mem_alignment_mask
3651                 * changes went in...
3652                 */
3653                devblocksize = PAGE_SIZE;
3654         }
3655 next_dread:
3656         io_req_size = *read_length;
3657         iov_base = uio_curriovbase(uio);
3658
3659         max_io_size = filesize - uio->uio_offset;
3660
3661         if ((off_t)io_req_size > max_io_size)
3662                 io_req_size = max_io_size;
3663
3664         offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
3665         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
3666
3667         if (offset_in_file || offset_in_iovbase) {
3668                 /*
3669                  * one of the 2 important offsets is misaligned
3670                  * so fire an I/O through the cache for this entire vector
3671                  */
3672                 misaligned = 1;
3673         }
3674         if (iov_base & (devblocksize - 1)) {
3675                 /*
3676                  * the offset in memory must be on a device block boundary
3677                  * so that we can guarantee that we can generate an
3678                  * I/O that ends on a page boundary in cluster_io
3679                  */
3680                 misaligned = 1;
3681         }
3682         /*
3683          * When we get to this point, we know...
3684          *  -- unless 'misaligned' was set above, the offset into the file is on a devblocksize boundary
3685          */
3686
3687         while (io_req_size && retval == 0) {
3688                 u_int32_t io_start;
3689
3690                 if (cluster_hard_throttle_on(vp)) {
3691                         max_rd_size  = HARD_THROTTLE_MAXSIZE;
3692                         max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3693                 } else {
3694                         max_rd_size  = max_upl_size;
3695                         max_rd_ahead = max_rd_size * 2;
3696                 }
3697                 io_start = io_size = io_req_size;
3698
3699                 /*
3700                  * First look for pages already in the cache
3701                  * and move them to user space.
3702                  *
3703                  * cluster_copy_ubc_data_internal returns the resid
3704                  * in io_size
3705                  */
3706                 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
3707
3708                 /*
3709                  * calculate the number of bytes actually copied
3710                  * starting size - residual
3711                  */
3712                 xsize = io_start - io_size;
3713
3714                 io_req_size -= xsize;
3715
3716                 /*
3717                  * check to see if we are finished with this request...
3718                  */
3719                 if (io_req_size == 0 || misaligned) {
3720                         /*
3721                          * see if there's another uio vector to
3722                          * process that's of type IO_DIRECT
3723                          *
3724                          * break out of while loop to get there
3725                          */
3726                         break;
3727                 }
3728                 /*
3729                  * assume the request ends on a device block boundary
3730                  */
3731                 io_min = devblocksize;
3732
3733                 /*
3734                  * we can handle I/O's in multiples of the device block size
3735                  * however, if io_size isn't a multiple of devblocksize we
3736                  * want to clip it back to the nearest page boundary since
3737                  * we are going to have to go through cluster_read_copy to
3738                  * deal with the 'overhang'... by clipping it to a PAGE_SIZE
3739                  * multiple, we avoid asking the drive for the same physical
3740                  * blocks twice.. once for the partial page at the end of the
3741                  * request and a 2nd time for the page we read into the cache
3742                  * (which overlaps the end of the direct read) in order to
3743                  * get at the overhang bytes
3744                  */
3745                 if (io_size & (devblocksize - 1)) {
3746                         /*
3747                          * request does NOT end on a device block boundary
3748                          * so clip it back to a PAGE_SIZE boundary
3749                          */
3750                         io_size &= ~PAGE_MASK;
3751                         io_min = PAGE_SIZE;
3752                 }
3753                 if (retval || io_size < io_min) {
3754                         /*
3755                          * either an error or we only have the tail left to
3756                          * complete via the copy path...
3757                          * we may have already spun some portion of this request
3758                          * off as async requests... we need to wait for the I/O
3759                          * to complete before returning
3760                          */
3761                         goto wait_for_dreads;
3762                 }
3763                 if ((xsize = io_size) > max_rd_size)
3764                         xsize = max_rd_size;
3765
3766                 io_size = 0;
3767
3768                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
3769
3770                 if (io_size == 0) {
3771                         /*
3772                          * a page must have just come into the cache
3773                          * since the first page in this range is no
3774                          * longer absent, go back and re-evaluate
3775                          */
3776                         continue;
3777                 }
3778                 iov_base = uio_curriovbase(uio);
3779
3780                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3781                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3782
3783                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3784                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3785
3786                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3787                         no_zero_fill = 1;
3788                         abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3789                 } else {
3790                         no_zero_fill = 0;
3791                         abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3792                 }
3793                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3794                         pages_in_pl = 0;
3795                         upl_size = upl_needed_size;
3796                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3797
3798                         if (no_zero_fill)
3799                                 upl_flags |= UPL_NOZEROFILL;
3800                         if (force_data_sync)
3801                                 upl_flags |= UPL_FORCE_DATA_SYNC;
3802
3803                         kret = vm_map_create_upl(current_map(),
3804                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3805                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3806
3807                         if (kret != KERN_SUCCESS) {
3808                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3809                                              (int)upl_offset, upl_size, io_size, kret, 0);
3810                                 /*
3811                                  * failed to get pagelist
3812                                  *
3813                                  * we may have already spun some portion of this request
3814                                  * off as async requests... we need to wait for the I/O
3815                                  * to complete before returning
3816                                  */
3817                                 goto wait_for_dreads;
3818                         }
3819                         pages_in_pl = upl_size / PAGE_SIZE;
3820                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3821
3822                         for (i = 0; i < pages_in_pl; i++) {
3823                                 if (!upl_valid_page(pl, i))
3824                                         break;
3825                         }
3826                         if (i == pages_in_pl)
3827                                 break;
3828
3829                         ubc_upl_abort(upl, abort_flag);
3830                 }
3831                 if (force_data_sync >= 3) {
3832                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3833                                      (int)upl_offset, upl_size, io_size, kret, 0);
3834
3835                         goto wait_for_dreads;
3836                 }
3837                 /*
3838                  * Consider the possibility that upl_size wasn't satisfied.
3839                  */
3840                 if (upl_size < upl_needed_size) {
3841                         if (upl_size && upl_offset == 0)
3842                                 io_size = upl_size;
3843                         else
3844                                 io_size = 0;
3845                 }
3846                 if (io_size == 0) {
3847                         ubc_upl_abort(upl, abort_flag);
3848                         goto wait_for_dreads;
3849                 }
3850                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3851                              (int)upl_offset, upl_size, io_size, kret, 0);
3852
3853                 /*
3854                  * request asynchronously so that we can overlap
3855                  * the preparation of the next I/O
3856                  * if there are already too many outstanding reads
3857                  * wait until some have completed before issuing the next read
3858                  */
3859                 lck_mtx_lock(cl_mtxp);
3860
3861                 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3862                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
3863                                         iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0);
3864
3865                         iostate.io_wanted = 1;
3866                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL);
3867
3868                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
3869                                         iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0);
3870                 }
3871                 lck_mtx_unlock(cl_mtxp);
3872
3873                 if (iostate.io_error) {
3874                         /*
3875                          * one of the earlier reads we issued ran into a hard error
3876                          * don't issue any more reads, cleanup the UPL
3877                          * that was just created but not used, then
3878                          * go wait for any other reads to complete before
3879                          * returning the error to the caller
3880                          */
3881                         ubc_upl_abort(upl, abort_flag);
3882
3883                         goto wait_for_dreads;
3884                 }
3885                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3886                              (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3887
3888                 if (no_zero_fill)
3889                         io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | bflag;
3890                 else
3891                         io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | CL_PRESERVE | bflag;
3892
3893                 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3894
3895                 /*
3896                  * update the uio structure
3897                  */
3898                 uio_update(uio, (user_size_t)io_size);
3899
3900                 io_req_size -= io_size;
3901
3902                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3903                              (int)upl, (int)uio->uio_offset, io_req_size, retval, 0);
3904
3905         } /* end while */
3906
3907         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
3908
3909                 retval = cluster_io_type(uio, read_type, read_length, 0);
3910
3911                 if (retval == 0 && *read_type == IO_DIRECT) {
3912
3913                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
3914                                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
3915
3916                         goto next_dread;
3917                 }
3918         }
3919
3920 wait_for_dreads:
3921         if (iostate.io_issued) {
3922                 /*
3923                  * make sure all async reads that are part of this stream
3924                  * have completed before we return
3925                  */
3926                 lck_mtx_lock(cl_mtxp);
3927
3928                 while (iostate.io_issued != iostate.io_completed) {
3929                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
3930                                         iostate.io_issued, iostate.io_completed, 0, 0, 0);
3931
3932                         iostate.io_wanted = 1;
3933                         msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL);
3934
3935                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
3936                                         iostate.io_issued, iostate.io_completed, 0, 0, 0);
3937                 }
3938                 lck_mtx_unlock(cl_mtxp);
3939         }
3940
3941         if (iostate.io_error)
3942                 retval = iostate.io_error;
3943
3944         if (io_req_size && retval == 0) {
3945                 /*
3946                  * we couldn't handle the tail of this request in DIRECT mode
3947                  * so fire it through the copy path
3948                  */
3949                 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
3950
3951                 *read_type = IO_UNKNOWN;
3952         }
3953         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3954                      (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
3955
3956         return (retval);
3957 }
3958
3959
/*
 * cluster_read_contig
 *
 * Read file data directly into a user buffer that is physically contiguous
 * (see the entry conditions noted below), one uio vector at a time.
 *
 * For each vector:
 *  - a UPL covering the target range is obtained with vm_map_get_upl() and
 *    the destination physical address is derived from its first page
 *  - a leading piece that does not start on a device block boundary (or a
 *    request smaller than one device block) goes through
 *    cluster_align_phys_io()
 *  - whole device blocks are handed to cluster_io() as asynchronous reads of
 *    at most MAX_IO_CONTIG_SIZE each, stalling while
 *    (io_issued - io_completed) exceeds 2 * MAX_IO_CONTIG_SIZE
 *  - a trailing partial device block is read with cluster_align_phys_io()
 *    once every asynchronous read has completed
 *
 * When a vector finishes cleanly with no tail, cluster_io_type() is asked
 * about the next vector; if it is also IO_CONTIG and fewer than MAX_VECTS
 * UPLs are in use, processing continues with it.
 *
 * On return *read_type / *read_length hold whatever cluster_io_type() last
 * stored, or *read_type is forced to IO_UNKNOWN when that call was skipped.
 * The error exits that jump straight to wait_for_creads leave both untouched.
 *
 * Returns 0 on success, EINVAL if the UPL cannot be obtained, comes back
 * short, or the buffer address violates the mount's alignment mask,
 * otherwise the error reported by cluster_align_phys_io(), cluster_io(),
 * cluster_io_type() or the shared iostate.
 */
3960 static int
3961 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
3962                     int (*callback)(buf_t, void *), void *callback_arg, int flags)
3963 {
3964         upl_page_info_t *pl;
3965         upl_t            upl[MAX_VECTS];        /* one UPL per vector processed */
3966         vm_offset_t      upl_offset;
3967         addr64_t         dst_paddr = 0;         /* physical address the next byte is read into */
3968         user_addr_t      iov_base;
3969         off_t            max_size;
3970         vm_size_t        upl_size;
3971         vm_size_t        upl_needed_size;
3972         mach_msg_type_number_t  pages_in_pl;
3973         int              upl_flags;
3974         kern_return_t    kret;
3975         struct clios     iostate;               /* issued/completed/error state shared with cluster_io */
3976         int              error= 0;
3977         int              cur_upl = 0;           /* index into upl[] for the vector being processed */
3978         int              num_upl = 0;           /* number of UPLs created so far (all released on exit) */
3979         int              n;
3980         u_int32_t        xsize;
3981         u_int32_t        io_size;
3982         u_int32_t        devblocksize;
3983         u_int32_t        mem_alignment_mask;
3984         u_int32_t        tail_size = 0;         /* bytes left over past the last whole device block */
3985         int              bflag;
3986
3987         if (flags & IO_PASSIVE)
3988                 bflag = CL_PASSIVE;
3989         else
3990                 bflag = 0;
3991
3992         /*
3993          * When we enter this routine, we know
3994          *  -- the read_length will not exceed the current iov_len
3995          *  -- the target address is physically contiguous for read_length
3996          */
3997         cluster_syncup(vp, filesize, callback, callback_arg);
3998
3999         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4000         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4001
4002         iostate.io_completed = 0;
4003         iostate.io_issued = 0;
4004         iostate.io_error = 0;
4005         iostate.io_wanted = 0;
4006
4007 next_cread:
             /*
              * size this pass: the length reported for the current vector,
              * clipped so that we never read past the end of the file
              */
4008         io_size = *read_length;
4009
4010         max_size = filesize - uio->uio_offset;
4011
4012         if (io_size > max_size)
4013                 io_size = max_size;
4014
4015         iov_base = uio_curriovbase(uio);
4016
             /*
              * the UPL request is made from the page containing iov_base, so it
              * has to cover the offset within that page plus the I/O itself
              */
4017         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4018         upl_needed_size = upl_offset + io_size;
4019
4020         pages_in_pl = 0;
4021         upl_size = upl_needed_size;
4022         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4023
4024
4025         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
4026                      (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
4027
4028         kret = vm_map_get_upl(current_map(),
4029                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4030                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
4031
4032         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
4033                      (int)upl_offset, upl_size, io_size, kret, 0);
4034
4035         if (kret != KERN_SUCCESS) {
4036                 /*
4037                  * failed to get pagelist
4038                  */
4039                 error = EINVAL;
4040                 goto wait_for_creads;
4041         }
             /*
              * counted only once the UPL exists so that the release loop at the
              * bottom of the routine touches nothing but valid entries
              */
4042         num_upl++;
4043
4044         if (upl_size < upl_needed_size) {
4045                 /*
4046                  * The upl_size wasn't satisfied.
4047                  */
4048                 error = EINVAL;
4049                 goto wait_for_creads;
4050         }
4051         pl = ubc_upl_pageinfo(upl[cur_upl]);
4052
             /*
              * NOTE(review): the shift by 12 turns the page number of the first
              * page into a byte address, which presumes 4KB pages -- confirm
              */
4053         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
4054
             /*
              * while the file offset isn't on a device block boundary, or less
              * than a full device block remains, move the data one piece at a
              * time through cluster_align_phys_io
              */
4055         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
4056                 u_int32_t   head_size;
4057
4058                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
4059
4060                 if (head_size > io_size)
4061                         head_size = io_size;
4062
4063                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
4064
4065                 if (error)
4066                         goto wait_for_creads;
4067
4068                 upl_offset += head_size;
4069                 dst_paddr  += head_size;
4070                 io_size    -= head_size;
4071
4072                 iov_base   += head_size;
4073         }
4074         if ((u_int32_t)iov_base & mem_alignment_mask) {
4075                 /*
4076                  * request doesn't set up on a memory boundary
4077                  * the underlying DMA engine can handle...
4078                  * return an error instead of going through
4079                  * the slow copy path since the intent of this
4080                  * path is direct I/O to device memory
4081                  */
4082                 error = EINVAL;
4083                 goto wait_for_creads;
4084         }
4085
             /*
              * set aside the partial device block at the end of the request...
              * it is read at wait_for_creads after the async reads have drained
              */
4086         tail_size = io_size & (devblocksize - 1);
4087
4088         io_size  -= tail_size;
4089
4090         while (io_size && error == 0) {
4091
4092                 if (io_size > MAX_IO_CONTIG_SIZE)
4093                         xsize = MAX_IO_CONTIG_SIZE;
4094                 else
4095                         xsize = io_size;
4096                 /*
4097                  * request asynchronously so that we can overlap
4098                  * the preparation of the next I/O... we'll do
4099                  * the commit after all the I/O has completed
4100                  * since its all issued against the same UPL
4101                  * if there are already too many outstanding reads
4102                  * wait until some have completed before issuing the next
4103                  */
4104                 if (iostate.io_issued) {
4105                         lck_mtx_lock(cl_mtxp);
4106
4107                         while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) {
4108                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
4109                                                 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0);
4110
4111                                 iostate.io_wanted = 1;
4112                                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL);
4113
4114                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
4115                                                 iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0);
4116                         }
4117                         lck_mtx_unlock(cl_mtxp);
4118                 }
4119                 if (iostate.io_error) {
4120                         /*
4121                          * one of the earlier reads we issued ran into a hard error
4122                          * don't issue any more reads...
4123                          * go wait for any other reads to complete before
4124                          * returning the error to the caller
4125                          */
4126                         goto wait_for_creads;
4127                 }
4128                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
4129                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
4130                                    (buf_t)NULL, &iostate, callback, callback_arg);
4131                 /*
4132                  * The cluster_io read was issued successfully,
4133                  * update the uio structure
4134                  */
4135                 if (error == 0) {
4136                         uio_update(uio, (user_size_t)xsize);
4137
4138                         dst_paddr  += xsize;
4139                         upl_offset += xsize;
4140                         io_size    -= xsize;
4141                 }
4142         }
             /*
              * only look at the next vector if this one went through cleanly,
              * left no tail to finish, there's a free slot in upl[] and we
              * haven't reached the end of the file
              */
4143         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
4144
4145                 error = cluster_io_type(uio, read_type, read_length, 0);
4146
4147                 if (error == 0 && *read_type == IO_CONTIG) {
4148                         cur_upl++;
4149                         goto next_cread;
4150                 }
4151         } else
4152                 *read_type = IO_UNKNOWN;
4153
4154 wait_for_creads:
4155         /*
4156          * make sure all async reads that are part of this stream
4157          * have completed before we proceed
4158          */
4159         lck_mtx_lock(cl_mtxp);
4160
4161         while (iostate.io_issued != iostate.io_completed) {
4162                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
4163                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
4164
4165                 iostate.io_wanted = 1;
4166                 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL);
4167
4168                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
4169                                 iostate.io_issued, iostate.io_completed, 0, 0, 0);
4170         }
4171         lck_mtx_unlock(cl_mtxp);
4172
             /*
              * an error recorded against one of the async reads takes
              * precedence over whatever is currently held in 'error'
              */
4173         if (iostate.io_error)
4174                 error = iostate.io_error;
4175
4176         if (error == 0 && tail_size)
4177                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
4178
4179         for (n = 0; n < num_upl; n++)
4180                 /*
4181                  * just release our hold on each physically contiguous
4182                  * region without changing any state
4183                  */
4184                 ubc_upl_abort(upl[n], 0);
4185
4186         return (error);
4187 }
4188
4189
4190 static int
4191 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
4192 {
4193         user_size_t      iov_len;
4194         user_addr_t      iov_base = 0;
4195         upl_t            upl;
4196         vm_size_t        upl_size;
4197         int              upl_flags;
4198         int              retval = 0;
4199
4200         /*
4201          * skip over any emtpy vectors
4202          */
4203         uio_update(uio, (user_size_t)0);
4204
4205         iov_len = uio_curriovlen(uio);
4206
4207         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, (int)uio, (int)iov_len, 0, 0, 0);
4208
4209         if (iov_len) {
4210                 iov_base = uio_curriovbase(uio);
4211                 /*
4212                  * make sure the size of the vector isn't too big...
4213                  * internally, we want to handle all of the I/O in
4214                  * chunk sizes that fit in a 32 bit int
4215                  */
4216                 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
4217                         upl_size = MAX_IO_REQUEST_SIZE;
4218                 else
4219                         upl_size = (u_int32_t)iov_len;
4220
4221                 upl_flags = UPL_QUERY_OBJECT_TYPE;
4222
4223                 if ((vm_map_get_upl(current_map(),
4224                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4225                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
4226                         /*
4227                          * the user app must have passed in an invalid address
4228                          */
4229                         retval = EFAULT;
4230                 }
4231                 if (upl_size == 0)
4232                         retval = EFAULT;
4233
4234                 *io_length = upl_size;
4235
4236                 if (upl_flags & UPL_PHYS_CONTIG)
4237                         *io_type = IO_CONTIG;
4238                 else if (iov_len >= min_length)
4239                         *io_type = IO_DIRECT;
4240                 else
4241                         *io_type = IO_COPY;
4242         } else {
4243                 /*
4244                  * nothing left to do for this uio
4245                  */
4246                 *io_length = 0;
4247                 *io_type   = IO_UNKNOWN;
4248         }
4249         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, (int)iov_base, *io_type, *io_length, retval, 0);
4250
4251         return (retval);
4252 }
4253
4254
4255 /*
4256  * generate advisory I/O's in the largest chunks possible
4257  * the completed pages will be released into the VM cache
4258  */
4259 int
4260 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
4261 {
4262         return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
4263 }
4264
4265 int
4266 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
4267 {
4268         upl_page_info_t *pl;
4269         upl_t            upl;
4270         vm_offset_t      upl_offset;
4271         int              upl_size;
4272         off_t            upl_f_offset;
4273         int              start_offset;
4274         int              start_pg;
4275         int              last_pg;
4276         int              pages_in_upl;
4277         off_t            max_size;
4278         int              io_size;
4279         kern_return_t    kret;
4280         int              retval = 0;
4281         int              issued_io;
4282         int              skip_range;
4283         uint32_t         max_io_size;
4284
4285
4286         if ( !UBCINFOEXISTS(vp))
4287                 return(EINVAL);
4288
4289         if (resid < 0)
4290                 return(EINVAL);
4291
4292         max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4293
4294         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
4295                         (int)f_offset, resid, (int)filesize, 0, 0);
4296
4297         while (resid && f_offset < filesize && retval == 0) {
4298                 /*
4299                  * compute the size of the upl needed to encompass
4300                  * the requested read... limit each call to cluster_io
4301                  * to the maximum UPL size... cluster_io will clip if
4302                  * this exceeds the maximum io_size for the device,
4303                  * make sure to account for
4304                  * a starting offset that's not page aligned
4305                  */
4306                 start_offset = (int)(f_offset & PAGE_MASK_64);
4307                 upl_f_offset = f_offset - (off_t)start_offset;
4308                 max_size     = filesize - f_offset;
4309
4310                 if (resid < max_size)
4311                         io_size = resid;
4312                 else
4313                         io_size = max_size;
4314
4315                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4316                 if ((uint32_t)upl_size > max_io_size)
4317                         upl_size = max_io_size;
4318
4319                 skip_range = 0;
4320                 /*
4321                  * return the number of contiguously present pages in the cache
4322                  * starting at upl_f_offset within the file
4323                  */
4324                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
4325
4326                 if (skip_range) {
4327                         /*
4328                          * skip over pages already present in the cache
4329                          */
4330                         io_size = skip_range - start_offset;
4331
4332                         f_offset += io_size;
4333                         resid    -= io_size;
4334
4335                         if (skip_range == upl_size)
4336                                 continue;
4337                         /*
4338                          * have to issue some real I/O
4339                          * at this point, we know it's starting on a page boundary
4340                          * because we've skipped over at least the first page in the request
4341                          */
4342                         start_offset = 0;
4343                         upl_f_offset += skip_range;
4344                         upl_size     -= skip_range;
4345                 }
4346                 pages_in_upl = upl_size / PAGE_SIZE;
4347
4348                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
4349                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
4350
4351                 kret = ubc_create_upl(vp,
4352                                       upl_f_offset,
4353                                       upl_size,
4354                                       &upl,
4355                                       &pl,
4356                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
4357                 if (kret != KERN_SUCCESS)
4358                         return(retval);
4359                 issued_io = 0;
4360
4361                 /*
4362                  * before we start marching forward, we must make sure we end on
4363                  * a present page, otherwise we will be working with a freed
4364                  * upl
4365                  */
4366                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4367                         if (upl_page_present(pl, last_pg))
4368                                 break;
4369                 }
4370                 pages_in_upl = last_pg + 1;
4371
4372
4373                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
4374                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
4375
4376
4377                 for (last_pg = 0; last_pg < pages_in_upl; ) {
4378                         /*
4379                          * scan from the beginning of the upl looking for the first
4380                          * page that is present.... this will become the first page in
4381                          * the request we're going to make to 'cluster_io'... if all
4382                          * of the pages are absent, we won't call through to 'cluster_io'
4383                          */
4384                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4385                                 if (upl_page_present(pl, start_pg))
4386                                         break;
4387                         }
4388
4389                         /*
4390                          * scan from the starting present page looking for an absent
4391                          * page before the end of the upl is reached, if we
4392                          * find one, then it will terminate the range of pages being
4393                          * presented to 'cluster_io'
4394                          */
4395                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4396                                 if (!upl_page_present(pl, last_pg))
4397                                         break;
4398                         }
4399
4400                         if (last_pg > start_pg) {
4401                                 /*
4402                                  * we found a range of pages that must be filled
4403                                  * if the last page in this range is the last page of the file
4404                                  * we may have to clip the size of it to keep from reading past
4405                                  * the end of the last physical block associated with the file
4406                                  */
4407                                 upl_offset = start_pg * PAGE_SIZE;
4408                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
4409
4410                                 if ((upl_f_offset + upl_offset + io_size) > filesize)
4411                                         io_size = filesize - (upl_f_offset + upl_offset);
4412
4413                                 /*
4414                                  * issue an asynchronous read to cluster_io
4415                                  */
4416                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4417                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4418
4419                                 issued_io = 1;
4420                         }
4421                 }
4422                 if (issued_io == 0)
4423                         ubc_upl_abort(upl, 0);
4424
4425                 io_size = upl_size - start_offset;
4426
4427                 if (io_size > resid)
4428                         io_size = resid;
4429                 f_offset += io_size;
4430                 resid    -= io_size;
4431         }
4432
4433         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
4434                      (int)f_offset, resid, retval, 0, 0);
4435
4436         return(retval);
4437 }
4438
4439
4440 int
4441 cluster_push(vnode_t vp, int flags)
4442 {
4443         return cluster_push_ext(vp, flags, NULL, NULL);
4444 }
4445
4446
4447 int
4448 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4449 {
4450         int     retval;
4451         struct  cl_writebehind *wbp;
4452
4453         if ( !UBCINFOEXISTS(vp)) {
4454                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
4455                 return (0);
4456         }
4457         /* return if deferred write is set */
4458         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
4459                 return (0);
4460         }
4461         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
4462                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
4463                 return (0);
4464         }
4465         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
4466                 lck_mtx_unlock(&wbp->cl_lockw);
4467
4468                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
4469                 return(0);
4470         }
4471         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
4472                      (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
4473
4474         if (wbp->cl_scmap) {
4475                 sparse_cluster_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg);
4476
4477                 retval = 1;
4478         } else
4479                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg);
4480
4481         lck_mtx_unlock(&wbp->cl_lockw);
4482
4483         if (flags & IO_SYNC)
4484                 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
4485
4486         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
4487                      (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
4488
4489         return (retval);
4490 }
4491
4492
4493 __private_extern__ void
4494 cluster_release(struct ubc_info *ubc)
4495 {
4496         struct cl_writebehind *wbp;
4497         struct cl_readahead   *rap;
4498
4499         if ((wbp = ubc->cl_wbehind)) {
4500
4501                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4502
4503                 if (wbp->cl_scmap)
4504                         vfs_drt_control(&(wbp->cl_scmap), 0);
4505         } else {
4506                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
4507         }
4508
4509         rap = ubc->cl_rahead;
4510
4511         if (wbp != NULL) {
4512                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
4513                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
4514         }
4515         if ((rap = ubc->cl_rahead)) {
4516                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
4517                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
4518         }
4519         ubc->cl_rahead  = NULL;
4520         ubc->cl_wbehind = NULL;
4521
4522         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
4523 }
4524
4525
4526 static int
4527 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg)
4528 {
4529         int cl_index;
4530         int cl_index1;
4531         int min_index;
4532         int cl_len;
4533         int cl_pushed = 0;
4534         struct cl_wextent l_clusters[MAX_CLUSTERS];
4535         u_int  max_cluster_pgcount;
4536
4537
4538         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
4539         /*
4540          * the write behind context exists and has
4541          * already been locked...
4542          */
4543         if (wbp->cl_number == 0)
4544                 /*
4545                  * no clusters to push
4546                  * return number of empty slots
4547                  */
4548                 return (MAX_CLUSTERS);
4549
4550         /*
4551          * make a local 'sorted' copy of the clusters
4552          * and clear wbp->cl_number so that new clusters can
4553          * be developed
4554          */
4555         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4556                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
4557                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
4558                                 continue;
4559                         if (min_index == -1)
4560                                 min_index = cl_index1;
4561                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
4562                                 min_index = cl_index1;
4563                 }
4564                 if (min_index == -1)
4565                         break;
4566                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
4567                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
4568                 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
4569
4570                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
4571         }
4572         wbp->cl_number = 0;
4573
4574         cl_len = cl_index;
4575
4576         if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) {
4577                 int   i;
4578
4579                 /*
4580                  * determine if we appear to be writing the file sequentially
4581                  * if not, by returning without having pushed any clusters
4582                  * we will cause this vnode to be pushed into the sparse cluster mechanism
4583                  * used for managing more random I/O patterns
4584                  *
4585                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
4586                  * that's why we're in try_push with PUSH_DELAY...
4587                  *
4588                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
4589                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
4590                  * so we can just make a simple pass through, up to, but not including the last one...
4591                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
4592                  * are sequential
4593                  *
4594                  * we let the last one be partial as long as it was adjacent to the previous one...
4595                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
4596                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
4597                  */
4598                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
4599                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
4600                                 goto dont_try;
4601                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
4602                                 goto dont_try;
4603                 }
4604         }
4605         for (cl_index = 0; cl_index < cl_len; cl_index++) {
4606                 int     flags;
4607                 struct  cl_extent cl;
4608
4609                 /*
4610                  * try to push each cluster in turn...
4611                  */
4612                 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
4613                         flags = IO_NOCACHE;
4614                 else
4615                         flags = 0;
4616
4617                 if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE))
4618                         flags |= IO_PASSIVE;
4619
4620                 if (push_flag & PUSH_SYNC)
4621                         flags |= IO_SYNC;
4622
4623                 cl.b_addr = l_clusters[cl_index].b_addr;
4624                 cl.e_addr = l_clusters[cl_index].e_addr;
4625
4626                 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
4627
4628                 l_clusters[cl_index].b_addr = 0;
4629                 l_clusters[cl_index].e_addr = 0;
4630
4631                 cl_pushed++;
4632
4633                 if ( !(push_flag & PUSH_ALL) )
4634                         break;
4635         }
4636 dont_try:
4637         if (cl_len > cl_pushed) {
4638                /*
4639                 * we didn't push all of the clusters, so
4640                 * lets try to merge them back in to the vnode
4641                 */
4642                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
4643                         /*
4644                          * we picked up some new clusters while we were trying to
4645                          * push the old ones... this can happen because I've dropped
4646                          * the vnode lock... the sum of the
4647                          * leftovers plus the new cluster count exceeds our ability
4648                          * to represent them, so switch to the sparse cluster mechanism
4649                          *
4650                          * collect the active public clusters...
4651                          */
4652                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
4653
4654                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
4655                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
4656                                         continue;
4657                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
4658                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
4659                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
4660
4661                                 cl_index1++;
4662                         }
4663                         /*
4664                          * update the cluster count
4665                          */
4666                         wbp->cl_number = cl_index1;
4667
4668                         /*
4669                          * and collect the original clusters that were moved into the
4670                          * local storage for sorting purposes
4671                          */
4672                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
4673
4674                 } else {
4675                         /*
4676                          * we've got room to merge the leftovers back in
4677                          * just append them starting at the next 'hole'
4678                          * represented by wbp->cl_number
4679                          */
4680                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
4681                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
4682                                         continue;
4683
4684                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
4685                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
4686                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
4687
4688                                 cl_index1++;
4689                         }
4690                         /*
4691                          * update the cluster count
4692                          */
4693                         wbp->cl_number = cl_index1;
4694                 }
4695         }
4696         return (MAX_CLUSTERS - wbp->cl_number);
4697 }
4698
4699
4700
4701 static int
4702 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4703 {
4704         upl_page_info_t *pl;
4705         upl_t            upl;
4706         vm_offset_t      upl_offset;
4707         int              upl_size;
4708         off_t            upl_f_offset;
4709         int              pages_in_upl;
4710         int              start_pg;
4711         int              last_pg;
4712         int              io_size;
4713         int              io_flags;
4714         int              upl_flags;
4715         int              bflag;
4716         int              size;
4717         int              error = 0;
4718         int              retval;
4719         kern_return_t    kret;
4720
4721         if (flags & IO_PASSIVE)
4722             bflag = CL_PASSIVE;
4723         else
4724             bflag = 0;
4725
4726         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4727                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4728
4729         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4730                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4731
4732                 return (0);
4733         }
4734         upl_size = pages_in_upl * PAGE_SIZE;
4735         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4736
4737         if (upl_f_offset + upl_size >= EOF) {
4738
4739                 if (upl_f_offset >= EOF) {
4740                         /*
4741                          * must have truncated the file and missed
4742                          * clearing a dangling cluster (i.e. it's completely
4743                          * beyond the new EOF
4744                          */
4745                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4746
4747                         return(0);
4748                 }
4749                 size = EOF - upl_f_offset;
4750
4751                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4752                 pages_in_upl = upl_size / PAGE_SIZE;
4753         } else
4754                 size = upl_size;
4755
4756         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4757
4758         /*
4759          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4760          *
4761          * - only pages that are currently dirty are returned... these are the ones we need to clean
4762          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4763          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4764          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4765          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
4766          *
4767          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4768          */
4769
4770         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4771                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4772         else
4773                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4774
4775         kret = ubc_create_upl(vp,
4776                                 upl_f_offset,
4777                                 upl_size,
4778                                 &upl,
4779                                 &pl,
4780                                 upl_flags);
4781         if (kret != KERN_SUCCESS)
4782                 panic("cluster_push: failed to get pagelist");
4783
4784         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4785
4786         /*
4787          * since we only asked for the dirty pages back
4788          * it's possible that we may only get a few or even none, so...
4789          * before we start marching forward, we must make sure we know
4790          * where the last present page is in the UPL, otherwise we could
4791          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4792          * employed by commit_range and abort_range.
4793          */
4794         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4795                 if (upl_page_present(pl, last_pg))
4796                         break;
4797         }
4798         pages_in_upl = last_pg + 1;
4799
4800         if (pages_in_upl == 0) {
4801                 ubc_upl_abort(upl, 0);
4802
4803                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4804                 return(0);
4805         }
4806
4807         for (last_pg = 0; last_pg < pages_in_upl; ) {
4808                 /*
4809                  * find the next dirty page in the UPL
4810                  * this will become the first page in the
4811                  * next I/O to generate
4812                  */
4813                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4814                         if (upl_dirty_page(pl, start_pg))
4815                                 break;
4816                         if (upl_page_present(pl, start_pg))
4817                                 /*
4818                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4819                                  * just release these unchanged since we're not going
4820                                  * to steal them or change their state
4821                                  */
4822                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4823                 }
4824                 if (start_pg >= pages_in_upl)
4825                         /*
4826                          * done... no more dirty pages to push
4827                          */
4828                         break;
4829                 if (start_pg > last_pg)
4830                         /*
4831                          * skipped over some non-dirty pages
4832                          */
4833                         size -= ((start_pg - last_pg) * PAGE_SIZE);
4834
4835                 /*
4836                  * find a range of dirty pages to write
4837                  */
4838                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4839                         if (!upl_dirty_page(pl, last_pg))
4840                                 break;
4841                 }
4842                 upl_offset = start_pg * PAGE_SIZE;
4843
4844                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4845
4846                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
4847
4848                 if ( !(flags & IO_SYNC))
4849                         io_flags |= CL_ASYNC;
4850
4851                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4852                                     io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4853
4854                 if (error == 0 && retval)
4855                         error = retval;
4856
4857                 size -= io_size;
4858         }
4859         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4860
4861         return(error);
4862 }
4863
4864
4865 /*
4866  * sparse_cluster_switch is called with the write behind lock held
4867  */
4868 static void
4869 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
4870 {
4871         int     cl_index;
4872
4873         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4874
4875         if (wbp->cl_scmap == NULL)
4876                 wbp->cl_scdirty = 0;
4877
4878         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4879                 int       flags;
4880                 struct cl_extent cl;
4881
4882                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4883
4884                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
4885                                 if (flags & UPL_POP_DIRTY) {
4886                                         cl.e_addr = cl.b_addr + 1;
4887
4888                                         sparse_cluster_add(wbp, vp, &cl, EOF, callback, callback_arg);
4889                                 }
4890                         }
4891                 }
4892         }
4893         wbp->cl_number = 0;
4894
4895         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4896 }
4897
4898
4899 /*
4900  * sparse_cluster_push is called with the write behind lock held
4901  */
4902 static void
4903 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg)
4904 {
4905         struct cl_extent cl;
4906         off_t           offset;
4907         u_int           length;
4908
4909         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_flag, 0);
4910
4911         if (push_flag & PUSH_ALL)
4912                 vfs_drt_control(&(wbp->cl_scmap), 1);
4913
4914         for (;;) {
4915                 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4916                         break;
4917
4918                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4919                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4920
4921                 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4922
4923                 cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg);
4924
4925                 if ( !(push_flag & PUSH_ALL) )
4926                         break;
4927         }
4928         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4929 }
4930
4931
4932 /*
4933  * sparse_cluster_add is called with the write behind lock held
4934  */
4935 static void
4936 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
4937 {
4938         u_int   new_dirty;
4939         u_int   length;
4940         off_t   offset;
4941
4942         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4943
4944         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4945         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4946
4947         while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4948                 /*
4949                  * no room left in the map
4950                  * only a partial update was done
4951                  * push out some pages and try again
4952                  */
4953                 wbp->cl_scdirty += new_dirty;
4954
4955                 sparse_cluster_push(wbp, vp, EOF, 0, callback, callback_arg);
4956
4957                 offset += (new_dirty * PAGE_SIZE_64);
4958                 length -= (new_dirty * PAGE_SIZE);
4959         }
4960         wbp->cl_scdirty += new_dirty;
4961
4962         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4963 }
4964
4965
4966 static int
4967 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4968 {
4969         upl_page_info_t  *pl;
4970         upl_t            upl;
4971         addr64_t         ubc_paddr;
4972         kern_return_t    kret;
4973         int              error = 0;
4974         int              did_read = 0;
4975         int              abort_flags;
4976         int              upl_flags;
4977         int              bflag;
4978
4979         if (flags & IO_PASSIVE)
4980             bflag = CL_PASSIVE;
4981         else
4982             bflag = 0;
4983
4984         upl_flags = UPL_SET_LITE;
4985
4986         if ( !(flags & CL_READ) ) {
4987                 /*
4988                  * "write" operation:  let the UPL subsystem know
4989                  * that we intend to modify the buffer cache pages
4990                  * we're gathering.
4991                  */
4992                 upl_flags |= UPL_WILL_MODIFY;
4993         } else {
4994                 /*
4995                  * indicate that there is no need to pull the
4996                  * mapping for this page... we're only going
4997                  * to read from it, not modify it.
4998                  */
4999                 upl_flags |= UPL_FILE_IO;
5000         }
5001         kret = ubc_create_upl(vp,
5002                               uio->uio_offset & ~PAGE_MASK_64,
5003                               PAGE_SIZE,
5004                               &upl,
5005                               &pl,
5006                               upl_flags);
5007
5008         if (kret != KERN_SUCCESS)
5009                 return(EINVAL);
5010
5011         if (!upl_valid_page(pl, 0)) {
5012                 /*
5013                  * issue a synchronous read to cluster_io
5014                  */
5015                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5016                                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5017                 if (error) {
5018                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
5019
5020                           return(error);
5021                 }
5022                 did_read = 1;
5023         }
5024         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
5025
5026 /*
5027  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
5028  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
5029  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
5030  *      way to do so without exporting them to kexts as well.
5031  */
5032         if (flags & CL_READ)
5033 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
5034                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
5035         else
5036 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
5037                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
5038
5039         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
5040                 /*
5041                  * issue a synchronous write to cluster_io
5042                  */
5043                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5044                                    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5045         }
5046         if (error == 0)
5047                 uio_update(uio, (user_size_t)xsize);
5048
5049         if (did_read)
5050                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
5051         else
5052                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
5053
5054         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
5055
5056         return (error);
5057 }
5058
5059
5060
5061 int
5062 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
5063 {
5064         int       pg_offset;
5065         int       pg_index;
5066         int       csize;
5067         int       segflg;
5068         int       retval = 0;
5069         int       xsize;
5070         upl_page_info_t *pl;
5071
5072         xsize = *io_resid;
5073
5074         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5075                      (int)uio->uio_offset, upl_offset, xsize, 0, 0);
5076
5077         segflg = uio->uio_segflg;
5078
5079         switch(segflg) {
5080
5081           case UIO_USERSPACE32:
5082           case UIO_USERISPACE32:
5083                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5084                 break;
5085
5086           case UIO_USERSPACE:
5087           case UIO_USERISPACE:
5088                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5089                 break;
5090
5091           case UIO_USERSPACE64:
5092           case UIO_USERISPACE64:
5093                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5094                 break;
5095
5096           case UIO_SYSSPACE32:
5097                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
5098                 break;
5099
5100           case UIO_SYSSPACE:
5101                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5102                 break;
5103
5104           case UIO_SYSSPACE64:
5105                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
5106                 break;
5107         }
5108         pl = ubc_upl_pageinfo(upl);
5109
5110         pg_index  = upl_offset / PAGE_SIZE;
5111         pg_offset = upl_offset & PAGE_MASK;
5112         csize     = min(PAGE_SIZE - pg_offset, xsize);
5113
5114         while (xsize && retval == 0) {
5115                 addr64_t  paddr;
5116
5117                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
5118
5119                 retval = uiomove64(paddr, csize, uio);
5120
5121                 pg_index += 1;
5122                 pg_offset = 0;
5123                 xsize    -= csize;
5124                 csize     = min(PAGE_SIZE, xsize);
5125         }
5126         *io_resid = xsize;
5127
5128         uio->uio_segflg = segflg;
5129
5130         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5131                      (int)uio->uio_offset, xsize, retval, segflg, 0);
5132
5133         return (retval);
5134 }
5135
5136
5137 int
5138 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
5139 {
5140
5141         return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
5142 }
5143
5144
5145 static int
5146 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
5147 {
5148         int       segflg;
5149         int       io_size;
5150         int       xsize;
5151         int       start_offset;
5152         int       retval = 0;
5153         memory_object_control_t  control;
5154
5155         io_size = *io_resid;
5156
5157         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5158                      (int)uio->uio_offset, 0, io_size, 0, 0);
5159
5160         control = ubc_getobject(vp, UBC_FLAGS_NONE);
5161
5162         if (control == MEMORY_OBJECT_CONTROL_NULL) {
5163                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5164                              (int)uio->uio_offset, io_size, retval, 3, 0);
5165
5166                 return(0);
5167         }
5168         segflg = uio->uio_segflg;
5169
5170         switch(segflg) {
5171
5172           case UIO_USERSPACE32:
5173           case UIO_USERISPACE32:
5174                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5175                 break;
5176
5177           case UIO_USERSPACE64:
5178           case UIO_USERISPACE64:
5179                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5180                 break;
5181
5182           case UIO_SYSSPACE32:
5183                 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
5184                 break;
5185
5186           case UIO_SYSSPACE64:
5187                 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
5188                 break;
5189
5190           case UIO_USERSPACE:
5191           case UIO_USERISPACE:
5192                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5193                 break;
5194
5195           case UIO_SYSSPACE:
5196                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5197                 break;
5198         }
5199
5200         if ( (io_size = *io_resid) ) {
5201                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
5202                 xsize = uio_resid(uio);
5203
5204                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
5205                                                        start_offset, io_size, mark_dirty, take_reference);
5206                 xsize -= uio_resid(uio);
5207                 io_size -= xsize;
5208         }
5209         uio->uio_segflg = segflg;
5210         *io_resid       = io_size;
5211
5212         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5213                      (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
5214
5215         return(retval);
5216 }
5217
5218
5219 int
5220 is_file_clean(vnode_t vp, off_t filesize)
5221 {
5222         off_t f_offset;
5223         int   flags;
5224         int   total_dirty = 0;
5225
5226         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
5227                 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
5228                         if (flags & UPL_POP_DIRTY) {
5229                                 total_dirty++;
5230                         }
5231                 }
5232         }
5233         if (total_dirty)
5234                 return(EINVAL);
5235
5236         return (0);
5237 }
5238
5239
5240
5241 /*
5242  * Dirty region tracking/clustering mechanism.
5243  *
5244  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
5245  * dirty regions within a larger space (file).  It is primarily intended to
5246  * support clustering in large files with many dirty areas.
5247  *
5248  * The implementation assumes that the dirty regions are pages.
5249  *
5250  * To represent dirty pages within the file, we store bit vectors in a
5251  * variable-size circular hash.
5252  */
5253
5254 /*
5255  * Bitvector size.  This determines the number of pages we group in a
5256  * single hashtable entry.  Each hashtable entry is aligned to this
5257  * size within the file.
5258  */
5259 #define DRT_BITVECTOR_PAGES             256
5260
5261 /*
5262  * File offset handling.
5263  *
5264  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
5265  * the correct formula is  ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
      *
      * The literal (1 << 20) used below equals DRT_BITVECTOR_PAGES * PAGE_SIZE
      * only when PAGE_SIZE is 4096 -- NOTE(review): confirm for every target.
      *
      * DRT_ALIGN_ADDRESS clears the low 20 bits of a file offset, i.e. it
      * rounds the offset down to a multiple of (1 << 20).
5266  */
5267 #define DRT_ADDRESS_MASK                (~((1 << 20) - 1))
5268 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
5269
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space: dhe_control holds the aligned file address in the bits selected
 * by DRT_ADDRESS_MASK and a count in the bits selected by
 * DRT_HASH_COUNT_MASK.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  That value is
 * DRT_HASH_COUNT_MASK itself (see DRT_HASH_VACATE / DRT_HASH_VACANT).
 *
 * All the statement-like macros below expand to a single
 * do { ... } while (0) WITHOUT a trailing semicolon, so that
 * "MACRO(...);" is exactly one statement and can be used safely as the
 * body of an if/else.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
/* wipe both the address and the count of a bucket */
#define DRT_HASH_CLEAR(scm, i)												\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control = 0;								\
	} while (0)
/* a bucket is vacant when its count field holds the reserved all-ones value */
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/* copy bucket 'oi' of map 'oscm' (control word and bitvector) into bucket 'i' of map 'scm' */
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
5304
5305
5306 /*
5307  * Hash table moduli.
5308  *
5309  * Since the hashtable entry's size is dependent on the size of
5310  * the bitvector, and since the hashtable size is constrained to
5311  * both being prime and fitting within the desired allocation
5312  * size, these values need to be manually determined.
5313  *
5314  * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes
      * (an 8 byte dhe_control plus 256 / 32 = 8 bitvector words of 4 bytes).
5315  *
5316  * The small hashtable allocation is 1024 bytes, so the modulus is 23.
5317  * The large hashtable allocation is 16384 bytes, so the modulus is 401.
      *
      * The "bytes spare" figures below are the allocation size minus
      * modulus * 40 (1024 - 23 * 40 = 104, 16384 - 401 * 40 = 344).
      * NOTE(review): presumably the fixed vfs_drt_clustermap header has to
      * fit in that spare space as well -- confirm against the allocator.
5318  */
5319 #define DRT_HASH_SMALL_MODULUS  23
5320 #define DRT_HASH_LARGE_MODULUS  401
5321
5322 #define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare */
5323 #define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare */
5324
5325 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
5326
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long: bit number 'bit' of bucket 'i' lives
 * in word (bit / 32) of dhe_bitvector, at position (bit % 32).
 *
 * The single-bit masks are built from an unsigned constant (1U) so that
 * selecting position 31 does not shift a signed 1 into the sign bit,
 * which is undefined behavior in C.
 */

/* set bit 'bit' in bucket 'i' */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

/* clear bit 'bit' in bucket 'i' */
#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

/* non-zero if bit 'bit' in bucket 'i' is set */
#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* zero the whole bitvector of bucket 'i' */
#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* copy the whole bitvector of bucket 'oi' in 'oscm' to bucket 'i' in 'scm' */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
5349
5350
5351
5352 /*
5353  * Hashtable entry.
      *
      * dhe_control packs two fields that are accessed through the
      * DRT_HASH_GET/SET_ADDRESS and DRT_HASH_GET/SET_COUNT macros: the
      * aligned file address (bits selected by DRT_ADDRESS_MASK) and a
      * count (bits selected by DRT_HASH_COUNT_MASK).
      *
      * dhe_bitvector holds one bit per page, 32 pages per word, and is
      * manipulated with DRT_HASH_SET_BIT / CLEAR_BIT / TEST_BIT.
5354  */
5355 struct vfs_drt_hashentry {
5356         u_int64_t       dhe_control;
5357         u_int32_t       dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
5358 };
5359
5360 /*
5361  * Dirty Region Tracking structure.
5362  *
5363  * The hashtable is allocated entirely inside the DRT structure.
5364  *
5365  * The hash is a simple circular prime modulus arrangement, the structure
5366  * is resized from small to large if it overflows.
      *
      * scm_modulus is the divisor used by DRT_HASH and DRT_HASH_NEXT to map
      * a value onto a slot of scm_hashtable.
5367  */
5368
5369 struct vfs_drt_clustermap {
5370         u_int32_t               scm_magic;      /* sanity/detection */
5371 #define DRT_SCM_MAGIC           0x12020003
5372         u_int32_t               scm_modulus;    /* current ring size */
5373         u_int32_t               scm_buckets;    /* number of occupied buckets */
5374         u_int32_t               scm_lastclean;  /* last entry we cleaned */
5375         u_int32_t               scm_iskips;     /* number of slot skips */
5376
5377         struct vfs_drt_hashentry scm_hashtable[0];      /* zero-length trailing array: the entries follow the header */
5378 };
5379
5380
/* Home slot for a key: the key reduced modulo the current ring size. */
#define DRT_HASH(map, key)              ((key) % (map)->scm_modulus)
/* Slot following 'slot' on the ring, wrapping back to 0 at the end. */
#define DRT_HASH_NEXT(map, slot)        ((1 + (slot)) % (map)->scm_modulus)
5383
/*
 * Trace codes passed to vfs_drt_trace(), with the arguments that
 * accompany each one.
 *
 *   EMPTYFREE   nil
 *   RETCLUSTER  offset, length
 *   ALLOC       copycount
 *   INSERT      offset, iskip
 *   MARK        start: offset, length, dirty
 *               end:   0, setcount
 *                      1            (clean, no map)
 *                      2            (map alloc fail)
 *                      3, resid     (partial)
 *   SCMDATA     modulus, buckets, lastclean, iskips
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82))
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83))
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84))
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85))
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86))
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88))

5400
5401
5402 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
5403 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
5404 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
5405         u_int64_t offset, int *indexp);
5406 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
5407         u_int64_t offset,
5408         int *indexp,
5409         int recursed);
5410 static kern_return_t    vfs_drt_do_mark_pages(
5411         void            **cmapp,
5412         u_int64_t       offset,
5413         u_int           length,
5414         u_int           *setcountp,
5415         int             dirty);
5416 static void             vfs_drt_trace(
5417         struct vfs_drt_clustermap *cmap,
5418         int code,
5419         int arg1,
5420         int arg2,
5421         int arg3,
5422         int arg4);
5423
5424
5425 /*
5426  * Allocate and initialise a sparse cluster map.
5427  *
5428  * Will allocate a new map, resize or compact an existing map.
5429  *
5430  * XXX we should probably have at least one intermediate map size,
5431  * as the 1:16 ratio seems a bit drastic.
5432  */
5433 static kern_return_t
5434 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
5435 {
5436         struct vfs_drt_clustermap *cmap, *ocmap;
5437         kern_return_t   kret;
5438         u_int64_t       offset;
5439         u_int32_t       i;
5440         int             nsize, active_buckets, index, copycount;
5441
5442         ocmap = NULL;
5443         if (cmapp != NULL)
5444                 ocmap = *cmapp;
5445
5446         /*
5447          * Decide on the size of the new map.
5448          */
5449         if (ocmap == NULL) {
5450                 nsize = DRT_HASH_SMALL_MODULUS;
5451         } else {
5452                 /* count the number of active buckets in the old map */
5453                 active_buckets = 0;
5454                 for (i = 0; i < ocmap->scm_modulus; i++) {
5455                         if (!DRT_HASH_VACANT(ocmap, i) &&
5456                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
5457                                 active_buckets++;
5458                 }
5459                 /*
5460                  * If we're currently using the small allocation, check to
5461                  * see whether we should grow to the large one.
5462                  */
5463                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
5464                         /* if the ring is nearly full */
5465                         if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
5466                                 nsize = DRT_HASH_LARGE_MODULUS;
5467                         } else {
5468                                 nsize = DRT_HASH_SMALL_MODULUS;
5469                         }
5470                 } else {
5471                         /* already using the large modulus */
5472                         nsize = DRT_HASH_LARGE_MODULUS;
5473                         /*
5474                          * If the ring is completely full, there's
5475                          * nothing useful for us to do.  Behave as
5476                          * though we had compacted into the new
5477                          * array and return.
5478                          */
5479                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
5480                                 return(KERN_SUCCESS);
5481                 }
5482         }
5483
5484         /*
5485          * Allocate and initialise the new map.
5486          */
5487
5488         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
5489             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
5490         if (kret != KERN_SUCCESS)
5491                 return(kret);
5492         cmap->scm_magic = DRT_SCM_MAGIC;
5493         cmap->scm_modulus = nsize;
5494         cmap->scm_buckets = 0;
5495         cmap->scm_lastclean = 0;
5496         cmap->scm_iskips = 0;
5497         for (i = 0; i < cmap->scm_modulus; i++) {
5498                 DRT_HASH_CLEAR(cmap, i);
5499                 DRT_HASH_VACATE(cmap, i);
5500                 DRT_BITVECTOR_CLEAR(cmap, i);
5501         }
5502
5503         /*
5504          * If there's an old map, re-hash entries from it into the new map.
5505          */
5506         copycount = 0;
5507         if (ocmap != NULL) {
5508                 for (i = 0; i < ocmap->scm_modulus; i++) {
5509                         /* skip empty buckets */
5510                         if (DRT_HASH_VACANT(ocmap, i) ||
5511                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
5512                                 continue;
5513                         /* get new index */
5514                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
5515                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
5516                         if (kret != KERN_SUCCESS) {
5517                                 /* XXX need to bail out gracefully here */
5518                                 panic("vfs_drt: new cluster map mysteriously too small");
5519                                 index = 0;
5520                         }
5521                         /* copy */
5522                         DRT_HASH_COPY(ocmap, i, cmap, index);
5523                         copycount++;
5524                 }
5525         }
5526
5527         /* log what we've done */
5528         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
5529
5530         /*
5531          * It's important to ensure that *cmapp always points to
5532          * a valid map, so we must overwrite it before freeing
5533          * the old map.
5534          */
5535         *cmapp = cmap;
5536         if (ocmap != NULL) {
5537                 /* emit stats into trace buffer */
5538                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
5539                               ocmap->scm_modulus,
5540                               ocmap->scm_buckets,
5541                               ocmap->scm_lastclean,
5542                               ocmap->scm_iskips);
5543
5544                 vfs_drt_free_map(ocmap);
5545         }
5546         return(KERN_SUCCESS);
5547 }
5548
5549
5550 /*
5551  * Free a sparse cluster map.
5552  */
5553 static kern_return_t
5554 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
5555 {
5556         kmem_free(kernel_map, (vm_offset_t)cmap,
5557                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
5558         return(KERN_SUCCESS);
5559 }
5560
5561
5562 /*
5563  * Find the hashtable slot currently occupied by an entry for the supplied offset.
5564  */
5565 static kern_return_t
5566 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
5567 {
5568         int             index;
5569         u_int32_t       i;
5570
5571         offset = DRT_ALIGN_ADDRESS(offset);
5572         index = DRT_HASH(cmap, offset);
5573
5574         /* traverse the hashtable */
5575         for (i = 0; i < cmap->scm_modulus; i++) {
5576
5577                 /*
5578                  * If the slot is vacant, we can stop.
5579                  */
5580                 if (DRT_HASH_VACANT(cmap, index))
5581                         break;
5582
5583                 /*
5584                  * If the address matches our offset, we have success.
5585                  */
5586                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
5587                         *indexp = index;
5588                         return(KERN_SUCCESS);
5589                 }
5590
5591                 /*
5592                  * Move to the next slot, try again.
5593                  */
5594                 index = DRT_HASH_NEXT(cmap, index);
5595         }
5596         /*
5597          * It's not there.
5598          */
5599         return(KERN_FAILURE);
5600 }
5601
5602 /*
5603  * Find the hashtable slot for the supplied offset.  If we haven't allocated
5604  * one yet, allocate one and populate the address field.  Note that it will
5605  * not have a nonzero page count and thus will still technically be free, so
5606  * in the case where we are called to clean pages, the slot will remain free.
5607  */
5608 static kern_return_t
5609 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
5610 {
5611         struct vfs_drt_clustermap *cmap;
5612         kern_return_t   kret;
5613         u_int32_t       index;
5614         u_int32_t       i;
5615
5616         cmap = *cmapp;
5617
5618         /* look for an existing entry */
5619         kret = vfs_drt_search_index(cmap, offset, indexp);
5620         if (kret == KERN_SUCCESS)
5621                 return(kret);
5622
5623         /* need to allocate an entry */
5624         offset = DRT_ALIGN_ADDRESS(offset);
5625         index = DRT_HASH(cmap, offset);
5626
5627         /* scan from the index forwards looking for a vacant slot */
5628         for (i = 0; i < cmap->scm_modulus; i++) {
5629                 /* slot vacant? */
5630                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
5631                         cmap->scm_buckets++;
5632                         if (index < cmap->scm_lastclean)
5633                                 cmap->scm_lastclean = index;
5634                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
5635                         DRT_HASH_SET_COUNT(cmap, index, 0);
5636                         DRT_BITVECTOR_CLEAR(cmap, index);
5637                         *indexp = index;
5638                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
5639                         return(KERN_SUCCESS);
5640                 }
5641                 cmap->scm_iskips += i;
5642                 index = DRT_HASH_NEXT(cmap, index);
5643         }
5644
5645         /*
5646          * We haven't found a vacant slot, so the map is full.  If we're not
5647          * already recursed, try reallocating/compacting it.
5648          */
5649         if (recursed)
5650                 return(KERN_FAILURE);
5651         kret = vfs_drt_alloc_map(cmapp);
5652         if (kret == KERN_SUCCESS) {
5653                 /* now try to insert again */
5654                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
5655         }
5656         return(kret);
5657 }
5658
5659 /*
5660  * Implementation of set dirty/clean.
5661  *
5662  * In the 'clean' case, not finding a map is OK.
5663  */
5664 static kern_return_t
5665 vfs_drt_do_mark_pages(
5666         void            **private,
5667         u_int64_t       offset,
5668         u_int           length,
5669         u_int           *setcountp,
5670         int             dirty)
5671 {
5672         struct vfs_drt_clustermap *cmap, **cmapp;
5673         kern_return_t   kret;
5674         int             i, index, pgoff, pgcount, setcount, ecount;
5675
5676         cmapp = (struct vfs_drt_clustermap **)private;
5677         cmap = *cmapp;
5678
5679         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
5680
5681         if (setcountp != NULL)
5682                 *setcountp = 0;
5683
5684         /* allocate a cluster map if we don't already have one */
5685         if (cmap == NULL) {
5686                 /* no cluster map, nothing to clean */
5687                 if (!dirty) {
5688                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
5689                         return(KERN_SUCCESS);
5690                 }
5691                 kret = vfs_drt_alloc_map(cmapp);
5692                 if (kret != KERN_SUCCESS) {
5693                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
5694                         return(kret);
5695                 }
5696         }
5697         setcount = 0;
5698
5699         /*
5700          * Iterate over the length of the region.
5701          */
5702         while (length > 0) {
5703                 /*
5704                  * Get the hashtable index for this offset.
5705                  *
5706                  * XXX this will add blank entries if we are clearing a range
5707                  * that hasn't been dirtied.
5708                  */
5709                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
5710                 cmap = *cmapp;  /* may have changed! */
5711                 /* this may be a partial-success return */
5712                 if (kret != KERN_SUCCESS) {
5713                         if (setcountp != NULL)
5714                                 *setcountp = setcount;
5715                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
5716
5717                         return(kret);
5718                 }
5719
5720                 /*
5721                  * Work out how many pages we're modifying in this
5722                  * hashtable entry.
5723                  */
5724                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
5725                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
5726
5727                 /*
5728                  * Iterate over pages, dirty/clearing as we go.
5729                  */
5730                 ecount = DRT_HASH_GET_COUNT(cmap, index);
5731                 for (i = 0; i < pgcount; i++) {
5732                         if (dirty) {
5733                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5734                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
5735                                         ecount++;
5736                                         setcount++;
5737                                 }
5738                         } else {
5739                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5740                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5741                                         ecount--;
5742                                         setcount++;
5743                                 }
5744                         }
5745                 }
5746                 DRT_HASH_SET_COUNT(cmap, index, ecount);
5747
5748                 offset += pgcount * PAGE_SIZE;
5749                 length -= pgcount * PAGE_SIZE;
5750         }
5751         if (setcountp != NULL)
5752                 *setcountp = setcount;
5753
5754         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5755
5756         return(KERN_SUCCESS);
5757 }
5758
5759 /*
5760  * Mark a set of pages as dirty/clean.
5761  *
5762  * This is a public interface.
5763  *
5764  * cmapp
5765  *      Pointer to storage suitable for holding a pointer.  Note that
5766  *      this must either be NULL or a value set by this function.
5767  *
5768  * size
5769  *      Current file size in bytes.
5770  *
5771  * offset
5772  *      Offset of the first page to be marked as dirty, in bytes.  Must be
5773  *      page-aligned.
5774  *
5775  * length
5776  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
5777  *
5778  * setcountp
5779  *      Number of pages newly marked dirty by this call (optional).
5780  *
5781  * Returns KERN_SUCCESS if all the pages were successfully marked.
5782  */
5783 static kern_return_t
5784 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
5785 {
5786         /* XXX size unused, drop from interface */
5787         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5788 }
5789
#if 0
/*
 * Mark a set of pages as clean.  Currently compiled out.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
        kern_return_t   status;

        /* no change count wanted; the trailing 0 selects "clean" */
        status = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        return(status);
}
#endif
5797
5798 /*
5799  * Get a cluster of dirty pages.
5800  *
5801  * This is a public interface.
5802  *
5803  * cmapp
5804  *      Pointer to storage managed by drt_mark_pages.  Note that this must
5805  *      be NULL or a value set by drt_mark_pages.
5806  *
5807  * offsetp
5808  *      Returns the byte offset into the file of the first page in the cluster.
5809  *
5810  * lengthp
5811  *      Returns the length in bytes of the cluster of dirty pages.
5812  *
5813  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
5814  * are no dirty pages meeting the minmum size criteria.  Private storage will
5815  * be released if there are no more dirty pages left in the map
5816  *
5817  */
5818 static kern_return_t
5819 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5820 {
5821         struct vfs_drt_clustermap *cmap;
5822         u_int64_t       offset;
5823         u_int           length;
5824         u_int32_t       j;
5825         int             index, i, fs, ls;
5826
5827         /* sanity */
5828         if ((cmapp == NULL) || (*cmapp == NULL))
5829                 return(KERN_FAILURE);
5830         cmap = *cmapp;
5831
5832         /* walk the hashtable */
5833         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5834                 index = DRT_HASH(cmap, offset);
5835
5836                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5837                         continue;
5838
5839                 /* scan the bitfield for a string of bits */
5840                 fs = -1;
5841
5842                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5843                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5844                                 fs = i;
5845                                 break;
5846                         }
5847                 }
5848                 if (fs == -1) {
5849                         /*  didn't find any bits set */
5850                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
5851                 }
5852                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5853                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
5854                                 break;
5855                 }
5856
5857                 /* compute offset and length, mark pages clean */
5858                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5859                 length = ls * PAGE_SIZE;
5860                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5861                 cmap->scm_lastclean = index;
5862
5863                 /* return successful */
5864                 *offsetp = (off_t)offset;
5865                 *lengthp = length;
5866
5867                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5868                 return(KERN_SUCCESS);
5869         }
5870         /*
5871          * We didn't find anything... hashtable is empty
5872          * emit stats into trace buffer and
5873          * then free it
5874          */
5875         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5876                       cmap->scm_modulus,
5877                       cmap->scm_buckets,
5878                       cmap->scm_lastclean,
5879                       cmap->scm_iskips);
5880
5881         vfs_drt_free_map(cmap);
5882         *cmapp = NULL;
5883
5884         return(KERN_FAILURE);
5885 }
5886
5887
5888 static kern_return_t
5889 vfs_drt_control(void **cmapp, int op_type)
5890 {
5891         struct vfs_drt_clustermap *cmap;
5892
5893         /* sanity */
5894         if ((cmapp == NULL) || (*cmapp == NULL))
5895                 return(KERN_FAILURE);
5896         cmap = *cmapp;
5897
5898         switch (op_type) {
5899         case 0:
5900                 /* emit stats into trace buffer */
5901                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5902                               cmap->scm_modulus,
5903                               cmap->scm_buckets,
5904                               cmap->scm_lastclean,
5905                               cmap->scm_iskips);
5906
5907                 vfs_drt_free_map(cmap);
5908                 *cmapp = NULL;
5909                 break;
5910
5911         case 1:
5912                 cmap->scm_lastclean = 0;
5913                 break;
5914         }
5915         return(KERN_SUCCESS);
5916 }
5917
5918
5919
5920 /*
5921  * Emit a summary of the state of the clustermap into the trace buffer
5922  * along with some caller-provided data.
5923  */
5924 #if KDEBUG
5925 static void
5926 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5927 {
5928         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5929 }
5930 #else
5931 static void
5932 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5933                           __unused int arg1, __unused int arg2, __unused int arg3,
5934                           __unused int arg4)
5935 {
5936 }
5937 #endif
5938
#if 0
/*
 * Consistency check, currently compiled out: for every non-vacant entry,
 * the number of bits set in its bitvector must equal the entry's summary
 * count.  Panics on the first mismatch.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
        int slot, page, popcount;

        for (slot = 0; slot < cmap->scm_modulus; slot++) {
                if (!DRT_HASH_VACANT(cmap, slot)) {
                        /* count the dirty bits actually present */
                        popcount = 0;
                        for (page = 0; page < DRT_BITVECTOR_PAGES; page++) {
                                if (DRT_HASH_TEST_BIT(cmap, slot, page))
                                        popcount++;
                        }
                        if (popcount != DRT_HASH_GET_COUNT(cmap, slot))
                                panic("bits_on = %d,  index = %d\n", popcount, slot);
                }
        }
}
#endif