/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <kern/kalloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <vfs/vfs_disk_conditioner.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_READ         0x01
#define CL_WRITE        0x02
#define CL_ASYNC        0x04
#define CL_COMMIT       0x08
#define CL_PAGEOUT      0x10
#define CL_AGE          0x20
#define CL_NOZERO       0x40
#define CL_PAGEIN       0x80
#define CL_DEV_MEMORY   0x100
#define CL_PRESERVE     0x200
#define CL_THROTTLE     0x400
#define CL_KEEPCACHED   0x800
#define CL_DIRECT_IO    0x1000
#define CL_PASSIVE      0x2000
#define CL_IOSTREAMING  0x4000
#define CL_CLOSE        0x8000
#define CL_ENCRYPTED    0x10000
#define CL_RAW_ENCRYPTED 0x20000
#define CL_NOCACHE      0x40000

#define MAX_VECTOR_UPL_ELEMENTS 8
#define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING      ((buf_t)1)
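
/*
 * Illustrative sketch (not compiled): the CL_* bits above are OR'd together to
 * describe one cluster_io() request.  This mirrors how the pageout path later
 * in this file builds its flags; the variable names here are only for
 * illustration, not part of this file.
 */
#if 0
    int cl_flags = CL_PAGEOUT | CL_THROTTLE;    /* a pageout, subject to I/O throttling */

    if ((upl_request_flags & UPL_IOSYNC) == 0) {
        cl_flags |= CL_ASYNC;                   /* caller does not wait for completion */
    }
    if ((upl_request_flags & UPL_NOCOMMIT) == 0) {
        cl_flags |= CL_COMMIT;                  /* cluster_io commits/aborts the UPL when done */
    }
#endif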
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
struct clios {
    lck_mtx_t io_mtxp;
    u_int     io_completed;     /* amount of io that has currently completed */
    u_int     io_issued;        /* amount of io that was successfully issued */
    int       io_error;         /* error code of first error encountered */
    int       io_wanted;        /* someone is sleeping waiting for a change in state */
};
struct cl_direct_read_lock {
    LIST_ENTRY(cl_direct_read_lock)     chain;
    int32_t                             ref_count;
    vnode_t                             vp;
    lck_rw_t                            rw_lock;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
    cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);

static ZONE_DECLARE(cl_rd_zone, "cluster_read",
    sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);

static ZONE_DECLARE(cl_wr_zone, "cluster_write",
    sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);

#define PUSH_DELAY      0x01
#define PUSH_ALL        0x02
#define PUSH_SYNC       0x04
static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
    int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
    int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
    off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
    int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
    int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));

static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
    off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
    void *callback_arg, int *err, boolean_t vm_initiated);

static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
    int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained in the cache.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
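
/*
 * Illustrative sketch (not compiled): how an external cache driver would be
 * expected to publish and retract the hook above.  Only the
 * bootcache_contains_block contract is real; the my_cache_* names below are
 * assumptions made for the example.
 */
#if 0
extern int (*bootcache_contains_block)(dev_t device, u_int64_t blkno);

static int
my_cache_contains_block(dev_t device, u_int64_t blkno)
{
    /* return 1 if (device, blkno) is resident in the cache, 0 otherwise */
    return 0;
}

static void
my_cache_activate(void)
{
    bootcache_contains_block = my_cache_contains_block;
}

static void
my_cache_jettison(void)
{
    bootcache_contains_block = NULL;
}
#endif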
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES

/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE   (16384)

#define WRITE_THROTTLE          6
#define WRITE_THROTTLE_SSD      2
#define WRITE_BEHIND            1
#define WRITE_BEHIND_SSD        1
#if !defined(XNU_TARGET_OS_OSX)
#define PREFETCH_SSD            1
uint32_t speculative_prefetch_max = (2048 * 1024);              /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead */
#else /* XNU_TARGET_OS_OSX */
#define PREFETCH_SSD            2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead on SSDs */
#endif /* ! XNU_TARGET_OS_OSX */

#define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))

int speculative_reads_disabled = 0;
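
/*
 * Illustrative sketch (not compiled): how the knobs above bound a speculative
 * read-ahead.  This mirrors the computation done in cluster_read_ahead() later
 * in this file; "vp" is assumed to be a vnode on the mount being read.
 */
#if 0
    u_int max_prefetch;

    max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ),
        disk_conditioner_mount_is_ssd(vp->v_mount));

    if (max_prefetch > speculative_prefetch_max) {
        max_prefetch = speculative_prefetch_max;        /* global cap, in bytes */
    }
#endif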
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT 0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
void
cluster_init(void)
{
    for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
        LIST_INIT(&cl_direct_read_locks[i]);
    }
}
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
    uint32_t max_io_size;
    uint32_t segcnt;
    uint32_t maxcnt;

    switch (type) {
    case CL_READ:
        segcnt = mp->mnt_segreadcnt;
        maxcnt = mp->mnt_maxreadcnt;
        break;
    case CL_WRITE:
        segcnt = mp->mnt_segwritecnt;
        maxcnt = mp->mnt_maxwritecnt;
        break;
    default:
        segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
        maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
        break;
    }
    if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
        /*
         * don't allow a size beyond the max UPL size we can create
         */
        segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
    }
    max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

    if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
        /*
         * don't allow a size smaller than the old fixed limit
         */
        max_io_size = MAX_UPL_TRANSFER_BYTES;
    } else {
        /*
         * make sure the size specified is a multiple of PAGE_SIZE
         */
        max_io_size &= ~PAGE_MASK;
    }
    return max_io_size;
}
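
/*
 * Illustrative sketch (not compiled): callers use cluster_max_io_size() to
 * bound a single transfer for a mount, e.g. the MAX_CLUSTER_SIZE() macro above
 * uses the CL_WRITE value and cluster_read_ahead() uses the CL_READ value.
 * The clipping loop below is an assumed example, not a copy of any call site;
 * start_offset and total_bytes are assumed caller-provided.
 */
#if 0
    uint32_t max_io = cluster_max_io_size(vp->v_mount, CL_READ);
    off_t    offset = start_offset;
    off_t    resid  = total_bytes;

    while (resid > 0) {
        uint32_t this_io = (resid > max_io) ? max_io : (uint32_t)resid;

        /* issue an I/O of at most max_io bytes starting at offset... */
        offset += this_io;
        resid  -= this_io;
    }
#endif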
#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
#define CLW_IONOCACHE           0x04
#define CLW_IOPASSIVE           0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
    struct ubc_info         *ubc;
    struct cl_readahead     *rap;

    ubc = vp->v_ubcinfo;

    if ((rap = ubc->cl_rahead) == NULL) {
        rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);

        lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);

        vnode_lock(vp);

        if (ubc->cl_rahead == NULL) {
            ubc->cl_rahead = rap;
        } else {
            lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
            zfree(cl_rd_zone, rap);
            rap = ubc->cl_rahead;
        }
        vnode_unlock(vp);
    }
    if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
        return rap;
    }

    return (struct cl_readahead *)NULL;
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
    struct ubc_info         *ubc;
    struct cl_writebehind   *wbp;

    ubc = vp->v_ubcinfo;

    if ((wbp = ubc->cl_wbehind) == NULL) {
        if (!(flags & CLW_ALLOCATE)) {
            return (struct cl_writebehind *)NULL;
        }
        wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);

        lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);

        vnode_lock(vp);

        if (ubc->cl_wbehind == NULL) {
            ubc->cl_wbehind = wbp;
        } else {
            lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
            zfree(cl_wr_zone, wbp);
            wbp = ubc->cl_wbehind;
        }
        vnode_unlock(vp);
    }
    if (flags & CLW_RETURNLOCKED) {
        lck_mtx_lock(&wbp->cl_lockw);
    }
    return wbp;
}
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
    struct cl_writebehind *wbp;

    if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
        if (wbp->cl_number) {
            lck_mtx_lock(&wbp->cl_lockw);

            cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);

            lck_mtx_unlock(&wbp->cl_lockw);
        }
    }
}
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
    daddr64_t blkno;
    size_t    io_size;
    int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

    if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
        if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
            return 0;
        }

        if (io_size == 0) {
            return 0;
        }

        if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
            return 1;
        }
    }
    return 0;
}
static int
cluster_is_throttled(vnode_t vp)
{
    return throttle_io_will_be_throttled(-1, vp->v_mount);
}
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
    lck_mtx_lock(&iostate->io_mtxp);

    while ((iostate->io_issued - iostate->io_completed) > target) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
            iostate->io_issued, iostate->io_completed, target, 0, 0);

        iostate->io_wanted = 1;
        msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
            iostate->io_issued, iostate->io_completed, target, 0, 0);
    }
    lck_mtx_unlock(&iostate->io_mtxp);
}
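
/*
 * Illustrative sketch (not compiled): how a struct clios ties an issuer to
 * cluster_iodone().  The direct read/write paths in this file follow this
 * pattern; the initialization and variable names shown here are assumptions
 * made for the example rather than a copy of any one call site.
 */
#if 0
    struct clios iostate;
    int          error;

    bzero(&iostate, sizeof(iostate));
    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

    /* each cluster_io() call bumps iostate.io_issued by the bytes it issues   */
    /* cluster_iodone() bumps io_completed and wakes anyone sleeping on io_wanted */

    /* wait until no more than 'target' bytes remain in flight */
    cluster_iostate_wait(&iostate, target, "example_wait");

    /* drain everything, then pick up the first error seen */
    cluster_iostate_wait(&iostate, 0, "example_drain");
    error = iostate.io_error;
#endif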
static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size)
{
    upl_t associated_upl = upl_associated_upl(upl);

    if (!associated_upl) {
        return;
    }

#if 0
    printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

    /*
     * The associated UPL is page aligned to file offsets whereas the
     * UPL it's attached to has different alignment requirements.  The
     * upl_offset that we have refers to @upl.  The code that follows
     * has to deal with the first and last pages in this transaction
     * which might straddle pages in the associated UPL.  To keep
     * track of these pages, we use the mark bits: if the mark bit is
     * set, we know another transaction has completed its part of that
     * page and so we can unlock that page here.
     *
     * The following illustrates what we have to deal with:
     *
     *  MEM u <------------ 1 PAGE ------------> e
     *        +-------------+----------------------+-----------------
     *        |             |######################|#################
     *        +-------------+----------------------+-----------------
     *  FILE | <--- a ---> o <------------ 1 PAGE ------------>
     *
     * So here we show a write to offset @o.  The data that is to be
     * written is in a buffer that is not page aligned; it has offset
     * @a in the page.  The upl that carries the data starts in memory
     * at @u.  The associated upl starts in the file at offset @o.  A
     * transaction will always end on a page boundary (like @e above)
     * except for the very last transaction in the group.  We cannot
     * unlock the page at @o in the associated upl until both the
     * transaction ending at @e and the following transaction (that
     * starts at @e) has completed.
     */

    /*
     * We record whether or not the two UPLs are aligned as the mark
     * bit in the first page of @upl.
     */
    upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
    bool is_unaligned = upl_page_get_mark(pl, 0);

    if (is_unaligned) {
        upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

        upl_offset_t upl_end = upl_offset + size;
        assert(upl_end >= PAGE_SIZE);

        upl_size_t assoc_upl_size = upl_get_size(associated_upl);

        /*
         * In the very first transaction in the group, upl_offset will
         * not be page aligned, but after that it will be and in that
         * case we want the preceding page in the associated UPL hence
         * the minus one.
         */
        if (upl_offset) {
            upl_offset = trunc_page_32(upl_offset - 1);
        }

        lck_mtx_lock_spin(&iostate->io_mtxp);

        // Look at the first page...
        if (upl_offset
            && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
            /*
             * The first page isn't marked so let another transaction
             * completion handle it.
             */
            upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
            upl_offset += PAGE_SIZE;
        }

        // And now the last page...

        /*
         * This needs to be > rather than >= because if it's equal, it
         * means there's another transaction that is sharing the last
         * page.
         */
        if (upl_end > assoc_upl_size) {
            upl_end = assoc_upl_size;
        } else {
            upl_end = trunc_page_32(upl_end);
            const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

            if (!upl_page_get_mark(assoc_pl, last_pg)) {
                /*
                 * The last page isn't marked so mark the page and let another
                 * transaction completion handle it.
                 */
                upl_page_set_mark(assoc_pl, last_pg, true);
                upl_end -= PAGE_SIZE;
            }
        }

        lck_mtx_unlock(&iostate->io_mtxp);

#if 0
        printf("2: %d %d\n", upl_offset, upl_end);
#endif

        if (upl_end <= upl_offset) {
            return;
        }

        size = upl_end - upl_offset;
    } else {
        assert(!(upl_offset & PAGE_MASK));
        assert(!(size & PAGE_MASK));
    }

    boolean_t empty;

    /*
     * We can unlock these pages now and as this is for a
     * direct/uncached write, we want to dump the pages too.
     */
    kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
        UPL_ABORT_DUMP_PAGES, &empty);

    if (!kr && empty) {
        upl_set_associated_upl(upl, NULL);
        upl_deallocate(associated_upl);
    }
}
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
    int upl_abort_code = 0;
    int page_in  = 0;
    int page_out = 0;

    if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
        /*
         * direct write of any flavor, or a direct read that wasn't aligned
         */
        ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
    } else {
        if (io_flags & B_PAGEIO) {
            if (io_flags & B_READ) {
                page_in  = 1;
            } else {
                page_out = 1;
            }
        }
        if (io_flags & B_CACHE) {
            /*
             * leave pages in the cache unchanged on error
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
            /*
             * transient error on pageout/write path... leave pages unchanged
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        } else if (page_in) {
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
        } else {
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
        }
        ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
    }
    return upl_abort_code;
}
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
    int     b_flags;
    int     error;
    int     total_size;
    int     total_resid;
    int     upl_offset;
    int     zero_offset;
    int     pg_offset = 0;
    int     commit_size = 0;
    int     upl_flags = 0;
    int     transaction_size = 0;
    upl_t   upl;
    buf_t   cbp;
    buf_t   cbp_head;
    buf_t   cbp_next;
    buf_t   real_bp;
    vnode_t vp;
    struct clios *iostate;
    boolean_t transaction_complete = FALSE;

    __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
        cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
        lck_mtx_lock_spin(&cl_transaction_mtxp);

        bp->b_flags |= B_TDONE;

        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
            /*
             * all I/O requests that are part of this transaction
             * have to complete before we can process it
             */
            if (!(cbp->b_flags & B_TDONE)) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

                lck_mtx_unlock(&cl_transaction_mtxp);

                return 0;
            }

            if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

                lck_mtx_unlock(&cl_transaction_mtxp);
                wakeup(cbp);

                return 0;
            }

            if (cbp->b_flags & B_EOT) {
                transaction_complete = TRUE;
            }
        }
        lck_mtx_unlock(&cl_transaction_mtxp);

        if (transaction_complete == FALSE) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                cbp_head, 0, 0, 0, 0);
            return 0;
        }
    }
    error       = 0;
    total_size  = 0;
    total_resid = 0;

    cbp         = cbp_head;
    vp          = cbp->b_vp;
    upl_offset  = cbp->b_uploffset;
    upl         = cbp->b_upl;
    b_flags     = cbp->b_flags;
    real_bp     = cbp->b_real_bp;
    zero_offset = cbp->b_validend;
    iostate     = (struct clios *)cbp->b_iostate;

    if (real_bp) {
        real_bp->b_dev = cbp->b_dev;
    }

    while (cbp) {
        if ((cbp->b_flags & B_ERROR) && error == 0) {
            error = cbp->b_error;
        }
        total_resid += cbp->b_resid;
        total_size  += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;

        if (cbp_next == NULL) {
            /*
             * compute the overall size of the transaction
             * in case we created one that has 'holes' in it
             * 'total_size' represents the amount of I/O we
             * did, not the span of the transaction w/r to the UPL
             */
            transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
        }

        if (cbp != cbp_head) {
            free_io_buf(cbp);
        }

        cbp = cbp_next;
    }

    if (ISSET(b_flags, B_COMMIT_UPL)) {
        cluster_handle_associated_upl(iostate,
            upl, upl_offset, transaction_size);
    }

    if (error == 0 && total_resid) {
        error = EIO;
    }

    if (error == 0) {
        int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

        if (cliodone_func != NULL) {
            cbp_head->b_bcount = transaction_size;

            error = (*cliodone_func)(cbp_head, callback_arg);
        }
    }
    if (zero_offset) {
        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
    }

    free_io_buf(cbp_head);

    if (iostate) {
        int need_wakeup = 0;

        /*
         * someone has issued multiple I/Os asynchronously
         * and is waiting for them to complete (streaming)
         */
        lck_mtx_lock_spin(&iostate->io_mtxp);

        if (error && iostate->io_error == 0) {
            iostate->io_error = error;
        }

        iostate->io_completed += total_size;

        if (iostate->io_wanted) {
            /*
             * someone is waiting for the state of
             * this io stream to change
             */
            iostate->io_wanted = 0;
            need_wakeup = 1;
        }
        lck_mtx_unlock(&iostate->io_mtxp);

        if (need_wakeup) {
            wakeup((caddr_t)&iostate->io_wanted);
        }
    }

    if (b_flags & B_COMMIT_UPL) {
        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (error) {
            upl_set_iodone_error(upl, error);

            upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
        } else {
            upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
                upl_flags |= UPL_COMMIT_SET_DIRTY;
            }

            if (b_flags & B_AGE) {
                upl_flags |= UPL_COMMIT_INACTIVATE;
            }

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
        }
    }
    if (real_bp) {
        if (error) {
            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;
        }
        real_bp->b_resid = total_resid;

        buf_biodone(real_bp);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
        upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

    return error;
}
int
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
    if (cluster_is_throttled(vp)) {
        *limit = THROTTLE_MAX_IOSIZE;
        return 1;
    }
    return 0;
}
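
/*
 * Illustrative sketch (not compiled): a filesystem can ask whether this vnode
 * is currently being throttled and clamp its own transfer size accordingly.
 * The desired_io_size variable is an assumption made for the example.
 */
#if 0
    uint32_t io_limit = 0;
    uint32_t io_size  = desired_io_size;

    if (cluster_throttle_io_limit(vp, &io_limit) && io_size > io_limit) {
        io_size = io_limit;
    }
#endif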
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
    upl_page_info_t *pl;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
        upl_offset, size, bp, 0, 0);

    if (bp == NULL || bp->b_datap == 0) {
        pl = ubc_upl_pageinfo(upl);

        if (upl_device_page(pl) == TRUE) {
            addr64_t zero_addr;

            zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

            bzero_phys_nc(zero_addr, size);
        } else {
            while (size) {
                int      page_index;
                int      page_offset;
                int      zero_cnt;
                addr64_t zero_addr;

                page_index  = upl_offset / PAGE_SIZE;
                page_offset = upl_offset & PAGE_MASK;

                zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
                zero_cnt  = min(PAGE_SIZE - page_offset, size);

                bzero_phys(zero_addr, zero_cnt);

                size       -= zero_cnt;
                upl_offset += zero_cnt;
            }
        }
    } else {
        bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
        upl_offset, size, 0, 0, 0);
}
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
    cbp_head->b_validend = zero_offset;
    cbp_tail->b_flags |= B_EOT;
}
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
    buf_t   cbp;

    if (async) {
        /*
         * Async callback completion will not normally generate a
         * wakeup upon I/O completion.  To get woken up, we set
         * b_trans_next (which is safe for us to modify) on the last
         * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
         * to wake us up when all buffers as part of this transaction
         * are completed.  This is done under the umbrella of
         * cl_transaction_mtxp which is also taken in cluster_iodone.
         */
        bool done = true;
        buf_t last = NULL;

        lck_mtx_lock_spin(&cl_transaction_mtxp);

        for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
            if (!ISSET(cbp->b_flags, B_TDONE)) {
                done = false;
            }
        }

        if (!done) {
            last->b_trans_next = CLUSTER_IO_WAITING;

            DTRACE_IO1(wait__start, buf_t, last);
            do {
                msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

                /*
                 * We should only have been woken up if all the
                 * buffers are completed, but just in case...
                 */
                done = true;
                for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
                    if (!ISSET(cbp->b_flags, B_TDONE)) {
                        done = false;
                        break;
                    }
                }
            } while (!done);
            DTRACE_IO1(wait__done, buf_t, last);

            last->b_trans_next = NULL;
        }

        lck_mtx_unlock(&cl_transaction_mtxp);
    } else { // !async
        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
            buf_biowait(cbp);
        }
    }
}
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
    buf_t   cbp;
    int     error;
    boolean_t isswapout = FALSE;

    /*
     * cluster_complete_transaction will
     * only be called if we've issued a complete chain in synchronous mode
     * or, we've already done a cluster_wait_IO on an incomplete chain
     */
    if (needwait) {
        for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
            buf_biowait(cbp);
        }
    }
    /*
     * we've already waited on all of the I/Os in this transaction,
     * so mark all of the buf_t's in this transaction as B_TDONE
     * so that cluster_iodone sees the transaction as completed
     */
    for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
        cbp->b_flags |= B_TDONE;
    }
    cbp = *cbp_head;

    if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
        isswapout = TRUE;
    }

    error = cluster_iodone(cbp, callback_arg);

    if (!(flags & CL_ASYNC) && error && *retval == 0) {
        if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
            *retval = error;
        } else if (isswapout == TRUE) {
            *retval = error;
        }
    }
    *cbp_head = (buf_t)NULL;
}
1093 cluster_io(vnode_t vp
, upl_t upl
, vm_offset_t upl_offset
, off_t f_offset
, int non_rounded_size
,
1094 int flags
, buf_t real_bp
, struct clios
*iostate
, int (*callback
)(buf_t
, void *), void *callback_arg
)
1103 buf_t cbp_head
= NULL
;
1104 buf_t cbp_tail
= NULL
;
1105 int trans_count
= 0;
1106 int max_trans_count
;
1112 int zero_offset
= 0;
1113 int async_throttle
= 0;
1115 vm_offset_t upl_end_offset
;
1116 boolean_t need_EOT
= FALSE
;
1119 * we currently don't support buffers larger than a page
1121 if (real_bp
&& non_rounded_size
> PAGE_SIZE
) {
1122 panic("%s(): Called with real buffer of size %d bytes which "
1123 "is greater than the maximum allowed size of "
1124 "%d bytes (the system PAGE_SIZE).\n",
1125 __FUNCTION__
, non_rounded_size
, PAGE_SIZE
);
1131 * we don't want to do any funny rounding of the size for IO requests
1132 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1133 * belong to us... we can't extend (nor do we need to) the I/O to fill
1136 if (mp
->mnt_devblocksize
> 1 && !(flags
& (CL_DEV_MEMORY
| CL_DIRECT_IO
))) {
1138 * round the requested size up so that this I/O ends on a
1139 * page boundary in case this is a 'write'... if the filesystem
1140 * has blocks allocated to back the page beyond the EOF, we want to
1141 * make sure to write out the zero's that are sitting beyond the EOF
1142 * so that in case the filesystem doesn't explicitly zero this area
1143 * if a hole is created via a lseek/write beyond the current EOF,
1144 * it will return zeros when it's read back from the disk. If the
1145 * physical allocation doesn't extend for the whole page, we'll
1146 * only write/read from the disk up to the end of this allocation
1147 * via the extent info returned from the VNOP_BLOCKMAP call.
1149 pg_offset
= upl_offset
& PAGE_MASK
;
1151 size
= (((non_rounded_size
+ pg_offset
) + (PAGE_SIZE
- 1)) & ~PAGE_MASK
) - pg_offset
;
1154 * anyone advertising a blocksize of 1 byte probably
1155 * can't deal with us rounding up the request size
1156 * AFP is one such filesystem/device
1158 size
= non_rounded_size
;
1160 upl_end_offset
= upl_offset
+ size
;
1162 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_START
, (int)f_offset
, size
, upl_offset
, flags
, 0);
1165 * Set the maximum transaction size to the maximum desired number of
1168 max_trans_count
= 8;
1169 if (flags
& CL_DEV_MEMORY
) {
1170 max_trans_count
= 16;
1173 if (flags
& CL_READ
) {
1175 bmap_flags
= VNODE_READ
;
1177 max_iosize
= mp
->mnt_maxreadcnt
;
1178 max_vectors
= mp
->mnt_segreadcnt
;
1181 bmap_flags
= VNODE_WRITE
;
1183 max_iosize
= mp
->mnt_maxwritecnt
;
1184 max_vectors
= mp
->mnt_segwritecnt
;
1186 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_NONE
, max_iosize
, max_vectors
, mp
->mnt_devblocksize
, 0, 0);
1189 * make sure the maximum iosize is a
1190 * multiple of the page size
1192 max_iosize
&= ~PAGE_MASK
;
1195 * Ensure the maximum iosize is sensible.
1198 max_iosize
= PAGE_SIZE
;
1201 if (flags
& CL_THROTTLE
) {
1202 if (!(flags
& CL_PAGEOUT
) && cluster_is_throttled(vp
)) {
1203 if (max_iosize
> THROTTLE_MAX_IOSIZE
) {
1204 max_iosize
= THROTTLE_MAX_IOSIZE
;
1206 async_throttle
= THROTTLE_MAXCNT
;
1208 if ((flags
& CL_DEV_MEMORY
)) {
1209 async_throttle
= IO_SCALE(vp
, VNODE_ASYNC_THROTTLE
);
1212 u_int max_cluster_size
;
1215 if (vp
->v_mount
->mnt_minsaturationbytecount
) {
1216 max_cluster_size
= vp
->v_mount
->mnt_minsaturationbytecount
;
1220 max_cluster_size
= MAX_CLUSTER_SIZE(vp
);
1222 if (disk_conditioner_mount_is_ssd(vp
->v_mount
)) {
1223 scale
= WRITE_THROTTLE_SSD
;
1225 scale
= WRITE_THROTTLE
;
1228 if (max_iosize
> max_cluster_size
) {
1229 max_cluster
= max_cluster_size
;
1231 max_cluster
= max_iosize
;
1234 if (size
< max_cluster
) {
1238 if (flags
& CL_CLOSE
) {
1239 scale
+= MAX_CLUSTERS
;
1242 async_throttle
= min(IO_SCALE(vp
, VNODE_ASYNC_THROTTLE
), ((scale
* max_cluster_size
) / max_cluster
) - 1);
1246 if (flags
& CL_AGE
) {
1249 if (flags
& (CL_PAGEIN
| CL_PAGEOUT
)) {
1250 io_flags
|= B_PAGEIO
;
1252 if (flags
& (CL_IOSTREAMING
)) {
1253 io_flags
|= B_IOSTREAMING
;
1255 if (flags
& CL_COMMIT
) {
1256 io_flags
|= B_COMMIT_UPL
;
1258 if (flags
& CL_DIRECT_IO
) {
1261 if (flags
& (CL_PRESERVE
| CL_KEEPCACHED
)) {
1262 io_flags
|= B_CACHE
;
1264 if (flags
& CL_PASSIVE
) {
1265 io_flags
|= B_PASSIVE
;
1267 if (flags
& CL_ENCRYPTED
) {
1268 io_flags
|= B_ENCRYPTED_IO
;
1271 if (vp
->v_flag
& VSYSTEM
) {
1275 if ((flags
& CL_READ
) && ((upl_offset
+ non_rounded_size
) & PAGE_MASK
) && (!(flags
& CL_NOZERO
))) {
1277 * then we are going to end up
1278 * with a page that we can't complete (the file size wasn't a multiple
1279 * of PAGE_SIZE and we're trying to read to the end of the file
1280 * so we'll go ahead and zero out the portion of the page we can't
1281 * read in from the file
1283 zero_offset
= (int)(upl_offset
+ non_rounded_size
);
1284 } else if (!ISSET(flags
, CL_READ
) && ISSET(flags
, CL_DIRECT_IO
)) {
1285 assert(ISSET(flags
, CL_COMMIT
));
1287 // For a direct/uncached write, we need to lock pages...
1292 * Create a UPL to lock the pages in the cache whilst the
1293 * write is in progress.
1295 ubc_create_upl_kernel(vp
, f_offset
, non_rounded_size
, &cached_upl
,
1296 NULL
, UPL_SET_LITE
, VM_KERN_MEMORY_FILE
);
1299 * Attach this UPL to the other UPL so that we can find it
1302 upl_set_associated_upl(upl
, cached_upl
);
1304 if (upl_offset
& PAGE_MASK
) {
1306 * The two UPLs are not aligned, so mark the first page in
1307 * @upl so that cluster_handle_associated_upl can handle
1310 upl_page_info_t
*pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
1311 upl_page_set_mark(pl
, 0, true);
1318 u_int io_size_wanted
;
1321 if (size
> max_iosize
) {
1322 io_size
= max_iosize
;
1327 io_size_wanted
= io_size
;
1328 io_size_tmp
= (size_t)io_size
;
1330 if ((error
= VNOP_BLOCKMAP(vp
, f_offset
, io_size
, &blkno
, &io_size_tmp
, NULL
, bmap_flags
, NULL
))) {
1334 if (io_size_tmp
> io_size_wanted
) {
1335 io_size
= io_size_wanted
;
1337 io_size
= (u_int
)io_size_tmp
;
1340 if (real_bp
&& (real_bp
->b_blkno
== real_bp
->b_lblkno
)) {
1341 real_bp
->b_blkno
= blkno
;
1344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 24)) | DBG_FUNC_NONE
,
1345 (int)f_offset
, (int)(blkno
>> 32), (int)blkno
, io_size
, 0);
1349 * vnop_blockmap didn't return an error... however, it did
1350 * return an extent size of 0 which means we can't
1351 * make forward progress on this I/O... a hole in the
1352 * file would be returned as a blkno of -1 with a non-zero io_size
1353 * a real extent is returned with a blkno != -1 and a non-zero io_size
1358 if (!(flags
& CL_READ
) && blkno
== -1) {
1362 if (upl_get_internal_vectorupl(upl
)) {
1363 panic("Vector UPLs should not take this code-path\n");
1366 * we're writing into a 'hole'
1368 if (flags
& CL_PAGEOUT
) {
1370 * if we got here via cluster_pageout
1371 * then just error the request and return
1372 * the 'hole' should already have been covered
1378 * we can get here if the cluster code happens to
1379 * pick up a page that was dirtied via mmap vs
1380 * a 'write' and the page targets a 'hole'...
1381 * i.e. the writes to the cluster were sparse
1382 * and the file was being written for the first time
1384 * we can also get here if the filesystem supports
1385 * 'holes' that are less than PAGE_SIZE.... because
1386 * we can't know if the range in the page that covers
1387 * the 'hole' has been dirtied via an mmap or not,
1388 * we have to assume the worst and try to push the
1389 * entire page to storage.
1391 * Try paging out the page individually before
1392 * giving up entirely and dumping it (the pageout
1393 * path will insure that the zero extent accounting
1394 * has been taken care of before we get back into cluster_io)
1396 * go direct to vnode_pageout so that we don't have to
1397 * unbusy the page from the UPL... we used to do this
1398 * so that we could call ubc_msync, but that results
1399 * in a potential deadlock if someone else races us to acquire
1400 * that page and wins and in addition needs one of the pages
1401 * we're continuing to hold in the UPL
1403 pageout_flags
= UPL_MSYNC
| UPL_VNODE_PAGER
| UPL_NESTED_PAGEOUT
;
1405 if (!(flags
& CL_ASYNC
)) {
1406 pageout_flags
|= UPL_IOSYNC
;
1408 if (!(flags
& CL_COMMIT
)) {
1409 pageout_flags
|= UPL_NOCOMMIT
;
1414 uint32_t bytes_in_last_page
;
1417 * first we have to wait for the the current outstanding I/Os
1418 * to complete... EOT hasn't been set yet on this transaction
1419 * so the pages won't be released
1421 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1423 bytes_in_last_page
= cbp_head
->b_uploffset
& PAGE_MASK
;
1424 for (cbp
= cbp_head
; cbp
; cbp
= cbp
->b_trans_next
) {
1425 bytes_in_last_page
+= cbp
->b_bcount
;
1427 bytes_in_last_page
&= PAGE_MASK
;
1429 while (bytes_in_last_page
) {
1431 * we've got a transcation that
1432 * includes the page we're about to push out through vnode_pageout...
1433 * find the bp's in the list which intersect this page and either
1434 * remove them entirely from the transaction (there could be multiple bp's), or
1435 * round it's iosize down to the page boundary (there can only be one)...
1437 * find the last bp in the list and act on it
1439 for (prev_cbp
= cbp
= cbp_head
; cbp
->b_trans_next
; cbp
= cbp
->b_trans_next
) {
1443 if (bytes_in_last_page
>= cbp
->b_bcount
) {
1445 * this buf no longer has any I/O associated with it
1447 bytes_in_last_page
-= cbp
->b_bcount
;
1452 if (cbp
== cbp_head
) {
1453 assert(bytes_in_last_page
== 0);
1455 * the buf we just freed was the only buf in
1456 * this transaction... so there's no I/O to do
1462 * remove the buf we just freed from
1463 * the transaction list
1465 prev_cbp
->b_trans_next
= NULL
;
1466 cbp_tail
= prev_cbp
;
1470 * this is the last bp that has I/O
1471 * intersecting the page of interest
1472 * only some of the I/O is in the intersection
1473 * so clip the size but keep it in the transaction list
1475 cbp
->b_bcount
-= bytes_in_last_page
;
1477 bytes_in_last_page
= 0;
1482 * there was more to the current transaction
1483 * than just the page we are pushing out via vnode_pageout...
1484 * mark it as finished and complete it... we've already
1485 * waited for the I/Os to complete above in the call to cluster_wait_IO
1487 cluster_EOT(cbp_head
, cbp_tail
, 0);
1489 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 0);
1494 if (vnode_pageout(vp
, upl
, (upl_offset_t
)trunc_page(upl_offset
), trunc_page_64(f_offset
), PAGE_SIZE
, pageout_flags
, NULL
) != PAGER_SUCCESS
) {
1497 e_offset
= round_page_64(f_offset
+ 1);
1498 io_size
= (u_int
)(e_offset
- f_offset
);
1500 f_offset
+= io_size
;
1501 upl_offset
+= io_size
;
1503 if (size
>= io_size
) {
1509 * keep track of how much of the original request
1510 * that we've actually completed... non_rounded_size
1511 * may go negative due to us rounding the request
1512 * to a page size multiple (i.e. size > non_rounded_size)
1514 non_rounded_size
-= io_size
;
1516 if (non_rounded_size
<= 0) {
1518 * we've transferred all of the data in the original
1519 * request, but we were unable to complete the tail
1520 * of the last page because the file didn't have
1521 * an allocation to back that portion... this is ok.
1527 flags
&= ~CL_COMMIT
;
1533 lblkno
= (daddr64_t
)(f_offset
/ 0x1000);
1535 * we have now figured out how much I/O we can do - this is in 'io_size'
1536 * pg_offset is the starting point in the first page for the I/O
1537 * pg_count is the number of full and partial pages that 'io_size' encompasses
1539 pg_offset
= upl_offset
& PAGE_MASK
;
1541 if (flags
& CL_DEV_MEMORY
) {
1543 * treat physical requests as one 'giant' page
1547 pg_count
= (io_size
+ pg_offset
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1550 if ((flags
& CL_READ
) && blkno
== -1) {
1551 vm_offset_t commit_offset
;
1553 int complete_transaction_now
= 0;
1556 * if we're reading and blkno == -1, then we've got a
1557 * 'hole' in the file that we need to deal with by zeroing
1558 * out the affected area in the upl
1560 if (io_size
>= (u_int
)non_rounded_size
) {
1562 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1563 * than 'zero_offset' will be non-zero
1564 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1565 * (indicated by the io_size finishing off the I/O request for this UPL)
1566 * than we're not going to issue an I/O for the
1567 * last page in this upl... we need to zero both the hole and the tail
1568 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1570 bytes_to_zero
= non_rounded_size
;
1571 if (!(flags
& CL_NOZERO
)) {
1572 bytes_to_zero
= (int)((((upl_offset
+ io_size
) + (PAGE_SIZE
- 1)) & ~PAGE_MASK
) - upl_offset
);
1577 bytes_to_zero
= io_size
;
1582 cluster_zero(upl
, (upl_offset_t
)upl_offset
, bytes_to_zero
, real_bp
);
1588 * if there is a current I/O chain pending
1589 * then the first page of the group we just zero'd
1590 * will be handled by the I/O completion if the zero
1591 * fill started in the middle of the page
1593 commit_offset
= (upl_offset
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1595 pg_resid
= (int)(commit_offset
- upl_offset
);
1597 if (bytes_to_zero
>= pg_resid
) {
1599 * the last page of the current I/O
1600 * has been completed...
1601 * compute the number of fully zero'd
1602 * pages that are beyond it
1603 * plus the last page if its partial
1604 * and we have no more I/O to issue...
1605 * otherwise a partial page is left
1606 * to begin the next I/O
1608 if ((int)io_size
>= non_rounded_size
) {
1609 pg_count
= (bytes_to_zero
- pg_resid
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1611 pg_count
= (bytes_to_zero
- pg_resid
) / PAGE_SIZE
;
1614 complete_transaction_now
= 1;
1618 * no pending I/O to deal with
1619 * so, commit all of the fully zero'd pages
1620 * plus the last page if its partial
1621 * and we have no more I/O to issue...
1622 * otherwise a partial page is left
1623 * to begin the next I/O
1625 if ((int)io_size
>= non_rounded_size
) {
1626 pg_count
= (pg_offset
+ bytes_to_zero
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1628 pg_count
= (pg_offset
+ bytes_to_zero
) / PAGE_SIZE
;
1631 commit_offset
= upl_offset
& ~PAGE_MASK
;
1634 // Associated UPL is currently only used in the direct write path
1635 assert(!upl_associated_upl(upl
));
1637 if ((flags
& CL_COMMIT
) && pg_count
) {
1638 ubc_upl_commit_range(upl
, (upl_offset_t
)commit_offset
,
1639 pg_count
* PAGE_SIZE
,
1640 UPL_COMMIT_CLEAR_DIRTY
| UPL_COMMIT_FREE_ON_EMPTY
);
1642 upl_offset
+= io_size
;
1643 f_offset
+= io_size
;
1647 * keep track of how much of the original request
1648 * that we've actually completed... non_rounded_size
1649 * may go negative due to us rounding the request
1650 * to a page size multiple (i.e. size > non_rounded_size)
1652 non_rounded_size
-= io_size
;
1654 if (non_rounded_size
<= 0) {
1656 * we've transferred all of the data in the original
1657 * request, but we were unable to complete the tail
1658 * of the last page because the file didn't have
1659 * an allocation to back that portion... this is ok.
1663 if (cbp_head
&& (complete_transaction_now
|| size
== 0)) {
1664 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1666 cluster_EOT(cbp_head
, cbp_tail
, size
== 0 ? zero_offset
: 0);
1668 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 0);
1674 if (pg_count
> max_vectors
) {
1675 if (((pg_count
- max_vectors
) * PAGE_SIZE
) > io_size
) {
1676 io_size
= PAGE_SIZE
- pg_offset
;
1679 io_size
-= (pg_count
- max_vectors
) * PAGE_SIZE
;
1680 pg_count
= max_vectors
;
1684 * If the transaction is going to reach the maximum number of
1685 * desired elements, truncate the i/o to the nearest page so
1686 * that the actual i/o is initiated after this buffer is
1687 * created and added to the i/o chain.
1689 * I/O directed to physically contiguous memory
1690 * doesn't have a requirement to make sure we 'fill' a page
1692 if (!(flags
& CL_DEV_MEMORY
) && trans_count
>= max_trans_count
&&
1693 ((upl_offset
+ io_size
) & PAGE_MASK
)) {
1694 vm_offset_t aligned_ofs
;
1696 aligned_ofs
= (upl_offset
+ io_size
) & ~PAGE_MASK
;
1698 * If the io_size does not actually finish off even a
1699 * single page we have to keep adding buffers to the
1700 * transaction despite having reached the desired limit.
1702 * Eventually we get here with the page being finished
1703 * off (and exceeded) and then we truncate the size of
1704 * this i/o request so that it is page aligned so that
1705 * we can finally issue the i/o on the transaction.
1707 if (aligned_ofs
> upl_offset
) {
1708 io_size
= (u_int
)(aligned_ofs
- upl_offset
);
1713 if (!(mp
->mnt_kern_flag
& MNTK_VIRTUALDEV
)) {
1715 * if we're not targeting a virtual device i.e. a disk image
1716 * it's safe to dip into the reserve pool since real devices
1717 * can complete this I/O request without requiring additional
1718 * bufs from the alloc_io_buf pool
1721 } else if ((flags
& CL_ASYNC
) && !(flags
& CL_PAGEOUT
) && !cbp_head
) {
1723 * Throttle the speculative IO
1725 * We can only throttle this if it is the first iobuf
1726 * for the transaction. alloc_io_buf implements
1727 * additional restrictions for diskimages anyway.
1734 cbp
= alloc_io_buf(vp
, priv
);
1736 if (flags
& CL_PAGEOUT
) {
1740 * since blocks are in offsets of 0x1000, scale
1741 * iteration to (PAGE_SIZE * pg_count) of blks.
1743 for (i
= 0; i
< (PAGE_SIZE
* pg_count
) / 0x1000; i
++) {
1744 if (buf_invalblkno(vp
, lblkno
+ i
, 0) == EBUSY
) {
1745 panic("BUSY bp found in cluster_io");
1749 if (flags
& CL_ASYNC
) {
1750 if (buf_setcallback(cbp
, (void *)cluster_iodone
, callback_arg
)) {
1751 panic("buf_setcallback failed\n");
1754 cbp
->b_cliodone
= (void *)callback
;
1755 cbp
->b_flags
|= io_flags
;
1756 if (flags
& CL_NOCACHE
) {
1757 cbp
->b_attr
.ba_flags
|= BA_NOCACHE
;
1760 cbp
->b_lblkno
= lblkno
;
1761 cbp
->b_blkno
= blkno
;
1762 cbp
->b_bcount
= io_size
;
1764 if (buf_setupl(cbp
, upl
, (uint32_t)upl_offset
)) {
1765 panic("buf_setupl failed\n");
1768 upl_set_blkno(upl
, upl_offset
, io_size
, blkno
);
1770 cbp
->b_trans_next
= (buf_t
)NULL
;
1772 if ((cbp
->b_iostate
= (void *)iostate
)) {
1774 * caller wants to track the state of this
1775 * io... bump the amount issued against this stream
1777 iostate
->io_issued
+= io_size
;
1780 if (flags
& CL_READ
) {
1781 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 26)) | DBG_FUNC_NONE
,
1782 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
1784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 27)) | DBG_FUNC_NONE
,
1785 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
1789 cbp_tail
->b_trans_next
= cbp
;
1795 if ((cbp_head
->b_real_bp
= real_bp
)) {
1796 real_bp
= (buf_t
)NULL
;
1799 *(buf_t
*)(&cbp
->b_trans_head
) = cbp_head
;
1803 upl_offset
+= io_size
;
1804 f_offset
+= io_size
;
1807 * keep track of how much of the original request
1808 * that we've actually completed... non_rounded_size
1809 * may go negative due to us rounding the request
1810 * to a page size multiple (i.e. size > non_rounded_size)
1812 non_rounded_size
-= io_size
;
1814 if (non_rounded_size
<= 0) {
1816 * we've transferred all of the data in the original
1817 * request, but we were unable to complete the tail
1818 * of the last page because the file didn't have
1819 * an allocation to back that portion... this is ok.
1825 * we have no more I/O to issue, so go
1826 * finish the final transaction
1829 } else if (((flags
& CL_DEV_MEMORY
) || (upl_offset
& PAGE_MASK
) == 0) &&
1830 ((flags
& CL_ASYNC
) || trans_count
> max_trans_count
)) {
1832 * I/O directed to physically contiguous memory...
1833 * which doesn't have a requirement to make sure we 'fill' a page
1835 * the current I/O we've prepared fully
1836 * completes the last page in this request
1838 * it's either an ASYNC request or
1839 * we've already accumulated more than 8 I/O's into
1840 * this transaction so mark it as complete so that
1841 * it can finish asynchronously or via the cluster_complete_transaction
1842 * below if the request is synchronous
1846 if (need_EOT
== TRUE
) {
1847 cluster_EOT(cbp_head
, cbp_tail
, size
== 0 ? zero_offset
: 0);
1850 if (flags
& CL_THROTTLE
) {
1851 (void)vnode_waitforwrites(vp
, async_throttle
, 0, 0, "cluster_io");
1854 if (!(io_flags
& B_READ
)) {
1855 vnode_startwrite(vp
);
1858 if (flags
& CL_RAW_ENCRYPTED
) {
1860 * User requested raw encrypted bytes.
1861 * Twiddle the bit in the ba_flags for the buffer
1863 cbp
->b_attr
.ba_flags
|= BA_RAW_ENCRYPTED_IO
;
1866 (void) VNOP_STRATEGY(cbp
);
1868 if (need_EOT
== TRUE
) {
1869 if (!(flags
& CL_ASYNC
)) {
1870 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 1);
1885 * Wait until all of the outstanding I/O
1886 * for this partial transaction has completed
1888 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1891 * Rewind the upl offset to the beginning of the
1894 upl_offset
= cbp_head
->b_uploffset
;
1897 if (ISSET(flags
, CL_COMMIT
)) {
1898 cluster_handle_associated_upl(iostate
, upl
,
1899 (upl_offset_t
)upl_offset
,
1900 (upl_size_t
)(upl_end_offset
- upl_offset
));
1903 // Free all the IO buffers in this transaction
1904 for (cbp
= cbp_head
; cbp
;) {
1907 size
+= cbp
->b_bcount
;
1908 io_size
+= cbp
->b_bcount
;
1910 cbp_next
= cbp
->b_trans_next
;
1916 int need_wakeup
= 0;
1919 * update the error condition for this stream
1920 * since we never really issued the io
1921 * just go ahead and adjust it back
1923 lck_mtx_lock_spin(&iostate
->io_mtxp
);
1925 if (iostate
->io_error
== 0) {
1926 iostate
->io_error
= error
;
1928 iostate
->io_issued
-= io_size
;
1930 if (iostate
->io_wanted
) {
1932 * someone is waiting for the state of
1933 * this io stream to change
1935 iostate
->io_wanted
= 0;
1938 lck_mtx_unlock(&iostate
->io_mtxp
);
1941 wakeup((caddr_t
)&iostate
->io_wanted
);
1945 if (flags
& CL_COMMIT
) {
1948 pg_offset
= upl_offset
& PAGE_MASK
;
1949 abort_size
= (int)((upl_end_offset
- upl_offset
+ PAGE_MASK
) & ~PAGE_MASK
);
1951 upl_flags
= cluster_ioerror(upl
, (int)(upl_offset
- pg_offset
),
1952 abort_size
, error
, io_flags
, vp
);
1954 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 28)) | DBG_FUNC_NONE
,
1955 upl
, upl_offset
- pg_offset
, abort_size
, (error
<< 24) | upl_flags
, 0);
1960 } else if (cbp_head
) {
1961 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__
);
1966 * can get here if we either encountered an error
1967 * or we completely zero-filled the request and
1971 real_bp
->b_flags
|= B_ERROR
;
1972 real_bp
->b_error
= error
;
1974 buf_biodone(real_bp
);
1976 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_END
, (int)f_offset
, size
, upl_offset
, retval
, 0);
#define reset_vector_run_state()                                                                                \
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
    int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
    vector_upl_set_pagelist(vector_upl);

    if (io_flag & CL_READ) {
        if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
            io_flag &= ~CL_PRESERVE; /*don't zero fill*/
        } else {
            io_flag |= CL_PRESERVE; /*zero fill*/
        }
    }
    return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
}
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    int pages_in_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
        (int)f_offset, size, (int)filesize, 0, 0);

    if (f_offset >= filesize) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
            (int)f_offset, 0, 0, 0, 0);
        return 0;
    }
    if ((off_t)size > (filesize - f_offset)) {
        size = (u_int)(filesize - f_offset);
    }
    pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

    advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
        (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

    return pages_in_prefetch;
}
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    daddr64_t r_addr;
    off_t     f_offset;
    int       size_of_prefetch;
    u_int     max_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
        (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

    if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
        return;
    }
    if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
        rap->cl_ralen = 0;
        rap->cl_maxra = 0;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

        return;
    }
    max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));

    if (max_prefetch > speculative_prefetch_max) {
        max_prefetch = speculative_prefetch_max;
    }

    if (max_prefetch <= PAGE_SIZE) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
        return;
    }
    if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
        if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
            return;
        }
    }
    r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
    f_offset = (off_t)(r_addr * PAGE_SIZE_64);

    size_of_prefetch = 0;

    ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

    if (size_of_prefetch) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
        return;
    }
    if (f_offset < filesize) {
        daddr64_t read_size;

        rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

        read_size = (extent->e_addr + 1) - extent->b_addr;

        if (read_size > rap->cl_ralen) {
            if (read_size > max_prefetch / PAGE_SIZE) {
                rap->cl_ralen = max_prefetch / PAGE_SIZE;
            } else {
                rap->cl_ralen = (int)read_size;
            }
        }
        size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

        if (size_of_prefetch) {
            rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
        }
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
        rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
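/*
 * Illustrative sketch (not from the original sources): a worked example of
 * the read-ahead window arithmetic above, assuming 4KB pages and a
 * max_prefetch of 256KB (so max_prefetch / PAGE_SIZE == 64).
 *
 *   1st sequential miss:  cl_ralen 0 -> 1   (prefetch 1 page)
 *   2nd:                  cl_ralen 1 -> 2   (prefetch 2 pages)
 *   3rd:                  cl_ralen 2 -> 4
 *   ...
 *   after ~7 reads:       cl_ralen stays capped at min(64, previous << 1) == 64
 *
 * Each prefetch starts at r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1,
 * i.e. one page past whichever is further along: the caller's request or the
 * read-ahead already in flight; cl_maxra then records the last page issued.
 */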
int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
    return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int    io_size;
    int    rounded_size;
    off_t  max_size;
    int    local_flags;

    local_flags = CL_PAGEOUT | CL_THROTTLE;

    if ((flags & UPL_IOSYNC) == 0) {
        local_flags |= CL_ASYNC;
    }
    if ((flags & UPL_NOCOMMIT) == 0) {
        local_flags |= CL_COMMIT;
    }
    if ((flags & UPL_KEEPCACHED)) {
        local_flags |= CL_KEEPCACHED;
    }
    if (flags & UPL_PAGING_ENCRYPTED) {
        local_flags |= CL_ENCRYPTED;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * If they didn't specify any I/O, then we are done...
     * we can't issue an abort because we don't know how
     * big the upl really is
     */
    if (size <= 0) {
        return EINVAL;
    }

    if (vp->v_mount->mnt_flag & MNT_RDONLY) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
        }
        return EROFS;
    }
    /*
     * can't page-out from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
        }
        return EINVAL;
    }
    max_size = filesize - f_offset;

    if (size < max_size) {
        io_size = size;
    } else {
        io_size = (int)max_size;
    }

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
                UPL_ABORT_FREE_ON_EMPTY);
        }
    }
    return cluster_io(vp, upl, upl_offset, f_offset, io_size,
        local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
}
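/*
 * Illustrative sketch (hypothetical, not part of the original file): how a
 * filesystem's pageout path might forward a UPL to cluster_pageout().  The
 * helper name is made up; only the cluster_pageout() call reflects the
 * interface defined above, and the EOF is assumed to come from ubc_getsize().
 */
#if 0
static int
example_fs_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset,
    off_t f_offset, int size, int flags)
{
    off_t filesize = ubc_getsize(vp);   /* current EOF known to the UBC */

    /* let the cluster layer clip to EOF, throttle, and issue the I/O */
    return cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, flags);
}
#endif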
int
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
    return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
int
cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int    io_size;
    int    rounded_size;
    off_t  max_size;
    int    retval;
    int    local_flags = 0;

    if (upl == NULL || size < 0) {
        panic("cluster_pagein: NULL upl passed in");
    }

    if ((flags & UPL_IOSYNC) == 0) {
        local_flags |= CL_ASYNC;
    }
    if ((flags & UPL_NOCOMMIT) == 0) {
        local_flags |= CL_COMMIT;
    }
    if (flags & UPL_IOSTREAMING) {
        local_flags |= CL_IOSTREAMING;
    }
    if (flags & UPL_PAGING_ENCRYPTED) {
        local_flags |= CL_ENCRYPTED;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * can't page-in from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
        }
        return EINVAL;
    }
    max_size = filesize - f_offset;

    if (size < max_size) {
        io_size = size;
    } else {
        io_size = (int)max_size;
    }

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size && (local_flags & CL_COMMIT)) {
        ubc_upl_abort_range(upl, upl_offset + rounded_size,
            size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
    }

    retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
        local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

    return retval;
}
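/*
 * Illustrative sketch (hypothetical, not part of the original file): a minimal
 * pagein-style wrapper built on cluster_pagein().  The helper name is made up;
 * the cluster_pagein() signature is the one defined above, and ubc_getsize()
 * is assumed to supply the EOF.
 */
#if 0
static int
example_fs_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset,
    off_t f_offset, int size, int flags)
{
    off_t filesize = ubc_getsize(vp);

    /* cluster_pagein validates alignment, clips to EOF, and issues CL_READ | CL_PAGEIN */
    return cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, flags);
}
#endif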
int
cluster_bp(buf_t bp)
{
    return cluster_bp_ext(bp, NULL, NULL);
}
int
cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
{
    off_t  f_offset;
    int    flags;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
        bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (bp->b_flags & B_READ) {
        flags = CL_ASYNC | CL_READ;
    } else {
        flags = CL_ASYNC;
    }
    if (bp->b_flags & B_PASSIVE) {
        flags |= CL_PASSIVE;
    }

    f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

    return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset,
        bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
}
int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
    return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
}
int
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
    int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
    user_ssize_t  cur_resid;
    int           retval = 0;
    int           flags;
    int           zflags;
    int           bflag;
    int           write_type = IO_COPY;
    u_int32_t     write_length;

    flags = xflags;

    if (flags & IO_PASSIVE) {
        bflag = CL_PASSIVE;
    } else {
        bflag = 0;
    }

    if (vp->v_flag & VNOCACHE_DATA) {
        flags |= IO_NOCACHE;
        bflag |= CL_NOCACHE;
    }
    if (uio == NULL) {
        /*
         * this call is being made to zero-fill some range in the file
         */
        retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

        return retval;
    }
    /*
     * do a write through the cache if one of the following is true....
     *   NOCACHE is not true or NODIRECT is true
     *   the uio request doesn't target USERSPACE
     * otherwise, find out if we want the direct or contig variant for
     * the first vector in the uio request
     */
    if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
        retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
    }

    if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
        /*
         * must go through the cached variant in this case
         */
        write_type = IO_COPY;
    }

    while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
        switch (write_type) {
        case IO_COPY:
            /*
             * make sure the uio_resid isn't too big...
             * internally, we want to handle all of the I/O in
             * chunk sizes that fit in a 32 bit int
             */
            if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
                /*
                 * we're going to have to call cluster_write_copy
                 * more than once...
                 *
                 * only want the last call to cluster_write_copy to
                 * have the IO_TAILZEROFILL flag set and only the
                 * first call should have IO_HEADZEROFILL
                 */
                zflags = flags & ~IO_TAILZEROFILL;
                flags &= ~IO_HEADZEROFILL;

                write_length = MAX_IO_REQUEST_SIZE;
            } else {
                /*
                 * last call to cluster_write_copy
                 */
                zflags = flags;

                write_length = (u_int32_t)cur_resid;
            }
            retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
            break;

        case IO_CONTIG:
            zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

            if (flags & IO_HEADZEROFILL) {
                /*
                 * only do this once per request
                 */
                flags &= ~IO_HEADZEROFILL;

                retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
                    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

                if (retval) {
                    break;
                }
            }
            retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

            if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
                /*
                 * we're done with the data from the user specified buffer(s)
                 * and we've been requested to zero fill at the tail
                 * treat this as an IO_HEADZEROFILL which doesn't require a uio
                 * by rearranging the args and passing in IO_HEADZEROFILL
                 */
                retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
                    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
            }
            break;

        case IO_DIRECT:
            /*
             * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
             */
            retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
            break;

        case IO_UNKNOWN:
            retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
            break;
        }
        /*
         * in case we end up calling cluster_write_copy (from cluster_write_direct)
         * multiple times to service a multi-vector request that is not aligned properly
         * we need to update the oldEOF so that we
         * don't zero-fill the head of a page if we've successfully written
         * data to that area... 'cluster_write_copy' will zero-fill the head of a
         * page that is beyond the oldEOF if the write is unaligned... we only
         * want that to happen for the very first page of the cluster_write,
         * NOT the first page of each vector making up a multi-vector write.
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }
    }
    return retval;
}
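/*
 * Illustrative sketch (hypothetical, not part of the original file): the
 * typical shape of a filesystem write path built on cluster_write().  The
 * helper name, locking, and block allocation are placeholders; only the
 * cluster_write() call reflects the interface defined above, and ubc_getsize()
 * is assumed to supply the current EOF.
 */
#if 0
static int
example_fs_write(vnode_t vp, struct uio *uio, int xflags)
{
    off_t oldEOF = ubc_getsize(vp);
    off_t newEOF = oldEOF;

    if (uio->uio_offset + uio_resid(uio) > newEOF) {
        newEOF = uio->uio_offset + uio_resid(uio);
    }
    /* ... extend the on-disk allocation to newEOF and ubc_setsize(vp, newEOF) ... */

    /*
     * headOff/tailOff of 0 means no explicit zero-fill ranges; xflags carries
     * IO_SYNC, IO_NOCACHE, etc. as chosen by the filesystem.
     */
    return cluster_write(vp, uio, oldEOF, newEOF, (off_t)0, (off_t)0, xflags);
}
#endif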
static int
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    upl_t            upl;
    upl_page_info_t  *pl;
    vm_offset_t      upl_offset;
    vm_offset_t      vector_upl_offset = 0;
    u_int32_t        io_req_size;
    u_int32_t        offset_in_file;
    u_int32_t        offset_in_iovbase;
    u_int32_t        io_size;
    int              io_flag = 0;
    int              retval = 0;
    int              first_IO = 1;
    upl_size_t       upl_size, vector_upl_size = 0;
    vm_size_t        upl_needed_size;
    mach_msg_type_number_t pages_in_pl;
    upl_control_flags_t upl_flags;
    kern_return_t    kret;
    mach_msg_type_number_t i;
    int              force_data_sync;
    struct clios     iostate;
    user_addr_t      iov_base;
    u_int32_t        mem_alignment_mask;
    u_int32_t        devblocksize;
    u_int32_t        max_io_size;
    u_int32_t        max_upl_size;
    u_int32_t        max_vector_size;
    u_int32_t        bytes_outstanding_limit;
    boolean_t        io_throttled = FALSE;

    u_int32_t        vector_upl_iosize = 0;
    int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
    off_t            v_upl_uio_offset = 0;
    int              vector_upl_index = 0;
    upl_t            vector_upl = NULL;

    /*
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     */
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
        (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

    assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);

    max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

    io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

    if (flags & IO_PASSIVE) {
        io_flag |= CL_PASSIVE;
    }

    if (flags & IO_NOCACHE) {
        io_flag |= CL_NOCACHE;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        io_flag |= CL_ENCRYPTED;
    }

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

    mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
    devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

    if (devblocksize == 1) {
        /*
         * the AFP client advertises a devblocksize of 1
         * however, its BLOCKMAP routine maps to physical
         * blocks that are PAGE_SIZE in size...
         * therefore we can't ask for I/Os that aren't page aligned
         * or aren't multiples of PAGE_SIZE in size
         * by setting devblocksize to PAGE_SIZE, we re-instate
         * the old behavior we had before the mem_alignment_mask
         * changes went in...
         */
        devblocksize = PAGE_SIZE;
    }

next_dwrite:
    io_req_size = *write_length;
    iov_base = uio_curriovbase(uio);

    offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
    offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

    if (offset_in_file || offset_in_iovbase) {
        /*
         * one of the 2 important offsets is misaligned
         * so fire an I/O through the cache for this entire vector
         */
        goto wait_for_dwrites;
    }
    if (iov_base & (devblocksize - 1)) {
        /*
         * the offset in memory must be on a device block boundary
         * so that we can guarantee that we can generate an
         * I/O that ends on a page boundary in cluster_io
         */
        goto wait_for_dwrites;
    }

    task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
    while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
        int throttle_type;

        if ((throttle_type = cluster_is_throttled(vp))) {
            /*
             * we're in the throttle window, at the very least
             * we want to limit the size of the I/O we're about
             * to issue
             */
            if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
                /*
                 * we're in the throttle window and at least 1 I/O
                 * has already been issued by a throttleable thread
                 * in this window, so return with EAGAIN to indicate
                 * to the FS issuing the cluster_write call that it
                 * should now throttle after dropping any locks
                 */
                throttle_info_update_by_mount(vp->v_mount);

                io_throttled = TRUE;
                goto wait_for_dwrites;
            }
            max_vector_size = THROTTLE_MAX_IOSIZE;
            max_io_size = THROTTLE_MAX_IOSIZE;
        } else {
            max_vector_size = MAX_VECTOR_UPL_SIZE;
            max_io_size = max_upl_size;
        }

        if (first_IO) {
            cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
            first_IO = 0;
        }
        io_size = io_req_size & ~PAGE_MASK;
        iov_base = uio_curriovbase(uio);

        if (io_size > max_io_size) {
            io_size = max_io_size;
        }

        if (useVectorUPL && (iov_base & PAGE_MASK)) {
            /*
             * We have an iov_base that's not page-aligned.
             * Issue all I/O's that have been collected within
             * this Vectored UPL.
             */
            if (vector_upl_index) {
                retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
                reset_vector_run_state();
            }

            /*
             * After this point, if we are using the Vector UPL path and the base is
             * not page-aligned then the UPL with that base will be the first in the vector UPL.
             */
        }

        upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
            (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

        vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
            pages_in_pl = 0;
            upl_size = (upl_size_t)upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

            kret = vm_map_get_upl(map,
                vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
                &upl_size,
                &upl,
                NULL,
                &pages_in_pl,
                &upl_flags,
                VM_KERN_MEMORY_FILE,
                force_data_sync);

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                    0, 0, 0, kret, 0);
                /*
                 * failed to get pagelist
                 *
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_dwrites;
            }
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
            pages_in_pl = upl_size / PAGE_SIZE;

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i)) {
                    break;
                }
            }
            if (i == pages_in_pl) {
                break;
            }

            /*
             * didn't get all the pages back that we
             * needed... release this upl and try again
             */
            ubc_upl_abort(upl, 0);
        }
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                i, pages_in_pl, upl_size, kret, 0);
            /*
             * for some reason, we couldn't acquire a hold on all
             * the pages needed in the user's address space
             *
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_dwrites;
        }

        /*
         * Consider the possibility that upl_size wasn't satisfied.
         */
        if (upl_size < upl_needed_size) {
            if (upl_size && upl_offset == 0) {
                io_size = upl_size;
            } else {
                io_size = 0;
            }
            if (io_size == 0) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                    (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

                ubc_upl_abort(upl, 0);
                /*
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_dwrites;
            }
        }

        if (useVectorUPL) {
            vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
            if (end_off) {
                issueVectorUPL = 1;
            }
            /*
             * After this point, if we are using a vector UPL, then
             * either all the UPL elements end on a page boundary OR
             * this UPL is the last element because it does not end
             * on a page boundary.
             */
        }

        /*
         * we want push out these writes asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding writes
         * wait until some complete before issuing the next
         */
        if (vp->v_mount->mnt_minsaturationbytecount) {
            bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
        } else {
            bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
        }

        cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");

        if (iostate.io_error) {
            /*
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes, cleanup the UPL
             * that was just created but not used, then
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
             */
            ubc_upl_abort(upl, 0);

            goto wait_for_dwrites;
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
            (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

        if (!useVectorUPL) {
            retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
        } else {
            if (!vector_upl_index) {
                vector_upl = vector_upl_create(upl_offset);
                v_upl_uio_offset = uio->uio_offset;
                vector_upl_offset = upl_offset;
            }

            vector_upl_set_subupl(vector_upl, upl, upl_size);
            vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
            vector_upl_index++;
            vector_upl_iosize += io_size;
            vector_upl_size += upl_size;

            if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
                retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
                reset_vector_run_state();
            }
        }

        /*
         * update the uio structure to
         * reflect the I/O that we just issued
         */
        uio_update(uio, (user_size_t)io_size);

        /*
         * in case we end up calling through to cluster_write_copy to finish
         * the tail of this request, we need to update the oldEOF so that we
         * don't zero-fill the head of a page if we've successfully written
         * data to that area... 'cluster_write_copy' will zero-fill the head of a
         * page that is beyond the oldEOF if the write is unaligned... we only
         * want that to happen for the very first page of the cluster_write,
         * NOT the first page of each vector making up a multi-vector write.
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }

        io_req_size -= io_size;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
            (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
    }

    if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
        retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

        if (retval == 0 && *write_type == IO_DIRECT) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
                (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

            goto next_dwrite;
        }
    }

wait_for_dwrites:

    if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
        retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
        reset_vector_run_state();
    }
    /*
     * make sure all async writes issued as part of this stream
     * have completed before we return
     */
    cluster_iostate_wait(&iostate, 0, "cluster_write_direct");

    if (iostate.io_error) {
        retval = iostate.io_error;
    }

    lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

    if (io_throttled == TRUE && retval == 0) {
        retval = EAGAIN;
    }

    if (io_req_size && retval == 0) {
        /*
         * we couldn't handle the tail of this request in DIRECT mode
         * so fire it through the copy path
         *
         * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
         * so we can just pass 0 in for the headOff and tailOff
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }

        retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

        *write_type = IO_UNKNOWN;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
        (int)uio->uio_offset, io_req_size, retval, 4, 0);

    return retval;
}
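/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * alignment gates at the top of cluster_write_direct, assuming PAGE_SIZE 4096,
 * mnt_alignmentmask 3 (4-byte DMA alignment) and devblocksize 4096:
 *
 *   uio_offset 0x10000, iov_base 0x20000000 -> offset_in_file 0, offset_in_iovbase 0,
 *       iov_base & (devblocksize - 1) == 0  -> eligible for the direct path
 *   uio_offset 0x10200                      -> offset_in_file 0x200 != 0
 *                                           -> falls through to cluster_write_copy
 *   iov_base 0x20000100                     -> iov_base & (devblocksize - 1) != 0
 *                                           -> falls through to cluster_write_copy
 */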
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    upl_page_info_t *pl;
    addr64_t         src_paddr = 0;
    upl_t            upl[MAX_VECTS];
    vm_offset_t      upl_offset;
    u_int32_t        tail_size = 0;
    u_int32_t        io_size;
    u_int32_t        xsize;
    upl_size_t       upl_size;
    vm_size_t        upl_needed_size;
    mach_msg_type_number_t pages_in_pl;
    upl_control_flags_t upl_flags;
    kern_return_t    kret;
    struct clios     iostate;
    int              error = 0;
    int              cur_upl = 0;
    int              num_upl = 0;
    int              n;
    user_addr_t      iov_base;
    u_int32_t        devblocksize;
    u_int32_t        mem_alignment_mask;

    /*
     * When we enter this routine, we know
     *  -- the io_req_size will not exceed iov_len
     *  -- the target address is physically contiguous
     */
    cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

    devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
    mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cwrite:
    io_size = *write_length;

    iov_base = uio_curriovbase(uio);

    upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
    upl_needed_size = upl_offset + io_size;

    pages_in_pl = 0;
    upl_size = (upl_size_t)upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
        UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

    vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
    kret = vm_map_get_upl(map,
        vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
        &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

    if (kret != KERN_SUCCESS) {
        /*
         * failed to get pagelist
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }
    num_upl++;

    /*
     * Consider the possibility that upl_size wasn't satisfied.
     */
    if (upl_size < upl_needed_size) {
        /*
         * This is a failure in the physical memory case.
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }
    pl = ubc_upl_pageinfo(upl[cur_upl]);

    src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
        u_int32_t head_size;

        head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size) {
            head_size = io_size;
        }

        error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

        if (error) {
            goto wait_for_cwrites;
        }

        upl_offset += head_size;
        src_paddr += head_size;
        io_size -= head_size;

        iov_base += head_size;
    }
    if ((u_int32_t)iov_base & mem_alignment_mask) {
        /*
         * request doesn't set up on a memory boundary
         * the underlying DMA engine can handle...
         * return an error instead of going through
         * the slow copy path since the intent of this
         * path is direct I/O from device memory
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }

    tail_size = io_size & (devblocksize - 1);
    io_size -= tail_size;

    while (io_size && error == 0) {
        if (io_size > MAX_IO_CONTIG_SIZE) {
            xsize = MAX_IO_CONTIG_SIZE;
        } else {
            xsize = io_size;
        }
        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O... we'll do
         * the commit after all the I/O has completed
         * since its all issued against the same UPL
         * if there are already too many outstanding writes
         * wait until some have completed before issuing the next
         */
        cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

        if (iostate.io_error) {
            /*
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes...
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
             */
            goto wait_for_cwrites;
        }
        /*
         * issue an asynchronous write to cluster_io
         */
        error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
            xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

        if (error == 0) {
            /*
             * The cluster_io write completed successfully,
             * update the uio structure
             */
            uio_update(uio, (user_size_t)xsize);

            upl_offset += xsize;
            src_paddr += xsize;
            io_size -= xsize;
        }
    }
    if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
        error = cluster_io_type(uio, write_type, write_length, 0);

        if (error == 0 && *write_type == IO_CONTIG) {
            cur_upl++;
            goto next_cwrite;
        }
    } else {
        *write_type = IO_UNKNOWN;
    }

wait_for_cwrites:
    /*
     * make sure all async writes that are part of this stream
     * have completed before we proceed
     */
    cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

    if (iostate.io_error) {
        error = iostate.io_error;
    }

    lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

    if (error == 0 && tail_size) {
        error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
    }

    for (n = 0; n < num_upl; n++) {
        /*
         * just release our hold on each physically contiguous
         * region without changing any state
         */
        ubc_upl_abort(upl[n], 0);
    }

    return error;
}
/*
 * need to avoid a race between an msync of a range of pages dirtied via mmap
 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
 *
 * we should never force-zero-fill pages that are already valid in the cache...
 * the entire page contains valid data (either from disk, zero-filled or dirtied
 * via an mmap) so we can only do damage by trying to zero-fill
 */
static int
cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
{
    int zero_pg_index;
    boolean_t need_cluster_zero = TRUE;

    if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
        bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
        zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

        if (upl_valid_page(pl, zero_pg_index)) {
            /*
             * never force zero valid pages - dirty or clean
             * we'll leave these in the UPL for cluster_write_copy to deal with
             */
            need_cluster_zero = FALSE;
        }
    }
    if (need_cluster_zero == TRUE) {
        cluster_zero(upl, io_offset, bytes_to_zero, NULL);
    }

    return bytes_to_zero;
}
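/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * clamping above.  With IO_NOZEROVALID set, a zero_off of 0x1300 (offset 0x300
 * within its page) and a requested bytes_to_zero of 0x2000 gets clamped to
 * PAGE_SIZE - 0x300 = 0xD00, so the zeroing never crosses into the next page;
 * the page is then skipped entirely if upl_valid_page() says it already holds
 * valid data.
 */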
void
cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
{
    struct cl_extent cl;
    boolean_t first_pass = TRUE;

    assert(s_offset < e_offset);
    assert((s_offset & PAGE_MASK_64) == 0);
    assert((e_offset & PAGE_MASK_64) == 0);

    cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
    cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);

    cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
        vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
}
static void
cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
    boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
    struct cl_writebehind *wbp;
    int    cl_index;
    int    ret_cluster_try_push;
    u_int  max_cluster_pgcount;

    max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;

    /*
     * take the lock to protect our accesses
     * of the writebehind and sparse cluster state
     */
    wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

    if (wbp->cl_scmap) {
        if (!(flags & IO_NOCACHE)) {
            /*
             * we've fallen into the sparse
             * cluster method of delaying dirty pages
             */
            sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

            lck_mtx_unlock(&wbp->cl_lockw);
            return;
        }
        /*
         * must have done cached writes that fell into
         * the sparse cluster mechanism... we've switched
         * to uncached writes on the file, so go ahead
         * and push whatever's in the sparse map
         * and switch back to normal clustering
         */
        wbp->cl_number = 0;

        sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
        /*
         * no clusters of either type present at this point
         * so just go directly to start_new_cluster since
         * we know we need to delay this I/O since we've
         * already released the pages back into the cache
         * to avoid the deadlock with sparse_cluster_push
         */
        goto start_new_cluster;
    }
    if (*first_pass == TRUE) {
        if (write_off == wbp->cl_last_write) {
            wbp->cl_seq_written += write_cnt;
        } else {
            wbp->cl_seq_written = write_cnt;
        }

        wbp->cl_last_write = write_off + write_cnt;

        *first_pass = FALSE;
    }
    if (wbp->cl_number == 0) {
        /*
         * no clusters currently present
         */
        goto start_new_cluster;
    }

    for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
        /*
         * check each cluster that we currently hold
         * try to merge some or all of this write into
         * one or more of the existing clusters... if
         * any portion of the write remains, start a
         * new cluster
         */
        if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
            /*
             * the current write starts at or after the current cluster
             */
            if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
                /*
                 * we have a write that fits entirely
                 * within the existing cluster limits
                 */
                if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
                    /*
                     * update our idea of where the cluster ends
                     */
                    wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
                }
                break;
            }
            if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
                /*
                 * we have a write that starts in the middle of the current cluster
                 * but extends beyond the cluster's limit... we know this because
                 * of the previous checks
                 * we'll extend the current cluster to the max
                 * and update the b_addr for the current write to reflect that
                 * the head of it was absorbed into this cluster...
                 * note that we'll always have a leftover tail in this case since
                 * full absorbtion would have occurred in the clause above
                 */
                wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

                cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
            }
            /*
             * we come here for the case where the current write starts
             * beyond the limit of the existing cluster or we have a leftover
             * tail after a partial absorbtion
             *
             * in either case, we'll check the remaining clusters before
             * starting a new one
             */
        } else {
            /*
             * the current write starts in front of the cluster we're currently considering
             */
            if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
                /*
                 * we can just merge the new request into
                 * this cluster and leave it in the cache
                 * since the resulting cluster is still
                 * less than the maximum allowable size
                 */
                wbp->cl_clusters[cl_index].b_addr = cl->b_addr;

                if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
                    /*
                     * the current write completely
                     * envelops the existing cluster and since
                     * each write is limited to at most max_cluster_pgcount pages
                     * we can just use the start and last blocknos of the write
                     * to generate the cluster limits
                     */
                    wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
                }
                break;
            }
            /*
             * if we were to combine this write with the current cluster
             * we would exceed the cluster size limit.... so,
             * let's see if there's any overlap of the new I/O with
             * the cluster we're currently considering... in fact, we'll
             * stretch the cluster out to it's full limit and see if we
             * get an intersection with the current write
             */
            if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
                /*
                 * the current write extends into the proposed cluster
                 * clip the length of the current write after first combining it's
                 * tail with the newly shaped cluster
                 */
                wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

                cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
            }
            /*
             * if we get here, there was no way to merge
             * any portion of this write with this cluster
             * or we could only merge part of it which
             * will leave a tail...
             * we'll check the remaining clusters before starting a new one
             */
        }
    }
    if (cl_index < wbp->cl_number) {
        /*
         * we found an existing cluster(s) that we
         * could entirely merge this I/O into
         */
        goto delay_io;
    }

    if (defer_writes == FALSE &&
        wbp->cl_number == MAX_CLUSTERS &&
        wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
        uint32_t n;

        if (vp->v_mount->mnt_minsaturationbytecount) {
            n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

            if (n > MAX_CLUSTERS) {
                n = MAX_CLUSTERS;
            }
        } else {
            n = 0;
        }

        if (n == 0) {
            if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
                n = WRITE_BEHIND_SSD;
            } else {
                n = WRITE_BEHIND;
            }
        }
        while (n--) {
            cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
        }
    }
    if (wbp->cl_number < MAX_CLUSTERS) {
        /*
         * we didn't find an existing cluster to
         * merge into, but there's room to start
         * a new one
         */
        goto start_new_cluster;
    }
    /*
     * no exisitng cluster to merge with and no
     * room to start a new one... we'll try
     * pushing one of the existing ones... if none of
     * them are able to be pushed, we'll switch
     * to the sparse cluster mechanism
     * cluster_try_push updates cl_number to the
     * number of remaining clusters... and
     * returns the number of currently unused clusters
     */
    ret_cluster_try_push = 0;

    /*
     * if writes are not deferred, call cluster push immediately
     */
    if (defer_writes == FALSE) {
        ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
    }
    /*
     * execute following regardless of writes being deferred or not
     */
    if (ret_cluster_try_push == 0) {
        /*
         * no more room in the normal cluster mechanism
         * so let's switch to the more expansive but expensive
         * sparse mechanism....
         */
        sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
        sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

        lck_mtx_unlock(&wbp->cl_lockw);
        return;
    }

start_new_cluster:
    wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
    wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;

    wbp->cl_clusters[wbp->cl_number].io_flags = 0;

    if (flags & IO_NOCACHE) {
        wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
    }

    if (flags & IO_PASSIVE) {
        wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
    }

    wbp->cl_number++;
delay_io:
    lck_mtx_unlock(&wbp->cl_lockw);
}
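/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * merge limits above, assuming MAX_CLUSTER_SIZE(vp) of 1MB and 4KB pages, so
 * max_cluster_pgcount == 256:
 *
 *   existing cluster [b_addr=1000, e_addr=1100)
 *   write [1050, 1200)  -> 1200 <= 1000 + 256, so it fits; the cluster
 *                          grows to [1000, 1200)
 *   write [1200, 1400)  -> 1400 > 1000 + 256, so the cluster is stretched
 *                          to its limit [1000, 1256) and the write's b_addr
 *                          is bumped to 1256, leaving a tail to try against
 *                          the other clusters or to start a new cluster with.
 */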
static int
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    upl_page_info_t *pl;
    upl_t            upl;
    vm_offset_t      upl_offset = 0;
    vm_size_t        upl_size;
    off_t            upl_f_offset;
    int              pages_in_upl;
    int              start_offset;
    int              xfer_resid;
    int              io_size;
    int              io_offset;
    int              read_size;
    int              bytes_to_zero;
    int              bytes_to_move;
    kern_return_t    kret;
    int              retval = 0;
    int              io_resid;
    long long        total_size;
    long long        zero_cnt;
    off_t            zero_off;
    long long        zero_cnt1;
    off_t            zero_off1;
    off_t            write_off = 0;
    int              write_cnt = 0;
    boolean_t        first_pass = FALSE;
    struct cl_extent cl;
    int              bflag;
    u_int32_t        max_io_size;

    if (uio) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
            (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

        io_resid = io_req_size;
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
            0, 0, (int)oldEOF, (int)newEOF, 0);

        io_resid = 0;
    }
    if (flags & IO_PASSIVE) {
        bflag = CL_PASSIVE;
    } else {
        bflag = 0;
    }
    if (flags & IO_NOCACHE) {
        bflag |= CL_NOCACHE;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        bflag |= CL_ENCRYPTED;
    }

    zero_cnt = 0;
    zero_cnt1 = 0;
    zero_off = 0;
    zero_off1 = 0;

    max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

    if (flags & IO_HEADZEROFILL) {
        /*
         * some filesystems (HFS is one) don't support unallocated holes within a file...
         * so we zero fill the intervening space between the old EOF and the offset
         * where the next chunk of real data begins.... ftruncate will also use this
         * routine to zero fill to the new EOF when growing a file... in this case, the
         * uio structure will not be provided
         */
        if (uio) {
            if (headOff < uio->uio_offset) {
                zero_cnt = uio->uio_offset - headOff;
                zero_off = headOff;
            }
        } else if (headOff < newEOF) {
            zero_cnt = newEOF - headOff;
            zero_off = headOff;
        }
    } else {
        if (uio && uio->uio_offset > oldEOF) {
            zero_off = uio->uio_offset & ~PAGE_MASK_64;

            if (zero_off >= oldEOF) {
                zero_cnt = uio->uio_offset - zero_off;

                flags |= IO_HEADZEROFILL;
            }
        }
    }
    if (flags & IO_TAILZEROFILL) {
        if (uio) {
            zero_off1 = uio->uio_offset + io_req_size;

            if (zero_off1 < tailOff) {
                zero_cnt1 = tailOff - zero_off1;
            }
        }
    } else {
        if (uio && newEOF > oldEOF) {
            zero_off1 = uio->uio_offset + io_req_size;

            if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
                zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

                flags |= IO_TAILZEROFILL;
            }
        }
    }
    if (zero_cnt == 0 && uio == (struct uio *) 0) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
            retval, 0, 0, 0, 0);
        return 0;
    }
    if (uio) {
        write_off = uio->uio_offset;
        write_cnt = (int)uio_resid(uio);
        /*
         * delay updating the sequential write info
         * in the control block until we've obtained
         * the lock for it
         */
        first_pass = TRUE;
    }
    while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
        /*
         * for this iteration of the loop, figure out where our starting point is
         */
        if (zero_cnt) {
            start_offset = (int)(zero_off & PAGE_MASK_64);
            upl_f_offset = zero_off - start_offset;
        } else if (io_resid) {
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;
        } else {
            start_offset = (int)(zero_off1 & PAGE_MASK_64);
            upl_f_offset = zero_off1 - start_offset;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
            (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

        if (total_size > max_io_size) {
            total_size = max_io_size;
        }

        cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

        if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
            /*
             * assumption... total_size <= io_resid
             * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
             */
            if ((start_offset + total_size) > max_io_size) {
                total_size = max_io_size - start_offset;
            }
            xfer_resid = (int)total_size;

            retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

            if (retval) {
                break;
            }

            io_resid -= (total_size - xfer_resid);
            total_size = xfer_resid;
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;

            if (total_size == 0) {
                if (start_offset) {
                    /*
                     * the write did not finish on a page boundary
                     * which will leave upl_f_offset pointing to the
                     * beginning of the last page written instead of
                     * the page beyond it... bump it in this case
                     * so that the cluster code records the last page
                     * written as dirty
                     */
                    upl_f_offset += PAGE_SIZE_64;
                }
                upl_size = 0;

                goto check_cluster;
            }
        }
        /*
         * compute the size of the upl needed to encompass
         * the requested write... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > max_io_size) {
            upl_size = max_io_size;
        }

        pages_in_upl = (int)(upl_size / PAGE_SIZE);
        io_size = (int)(upl_size - start_offset);

        if ((long long)io_size > total_size) {
            io_size = (int)total_size;
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);


        /*
         * Gather the pages from the buffer cache.
         * The UPL_WILL_MODIFY flag lets the UPL subsystem know
         * that we intend to modify these pages.
         */
        kret = ubc_create_upl_kernel(vp,
            upl_f_offset,
            (int)upl_size,
            &upl,
            &pl,
            UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
            VM_KERN_MEMORY_FILE);
        if (kret != KERN_SUCCESS) {
            panic("cluster_write_copy: failed to get pagelist");
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
            upl, (int)upl_f_offset, start_offset, 0, 0);

        if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
            /*
             * we're starting in the middle of the first page of the upl
             * and the page isn't currently valid, so we're going to have
             * to read it in first... this is a synchronous operation
             */
            read_size = PAGE_SIZE;

            if ((upl_f_offset + read_size) > oldEOF) {
                read_size = (int)(oldEOF - upl_f_offset);
            }

            retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
                CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
            if (retval) {
                /*
                 * we had an error during the read which causes us to abort
                 * the current cluster_write request... before we do, we need
                 * to release the rest of the pages in the upl without modifying
                 * there state and mark the failed page in error
                 */
                ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                if (upl_size > PAGE_SIZE) {
                    ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
                        UPL_ABORT_FREE_ON_EMPTY);
                }

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                    upl, 0, 0, retval, 0);
                break;
            }
        }
        if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
            /*
             * the last offset we're writing to in this upl does not end on a page
             * boundary... if it's not beyond the old EOF, then we'll also need to
             * pre-read this page in if it isn't already valid
             */
            upl_offset = upl_size - PAGE_SIZE;

            if ((upl_f_offset + start_offset + io_size) < oldEOF &&
                !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
                read_size = PAGE_SIZE;

                if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
                    read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
                }

                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
                if (retval) {
                    /*
                     * we had an error during the read which causes us to abort
                     * the current cluster_write request... before we do, we
                     * need to release the rest of the pages in the upl without
                     * modifying there state and mark the failed page in error
                     */
                    ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                    if (upl_size > PAGE_SIZE) {
                        ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
                    }

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                        upl, 0, 0, retval, 0);
                    break;
                }
            }
        }
        xfer_resid = io_size;
        io_offset = start_offset;

        while (zero_cnt && xfer_resid) {
            if (zero_cnt < (long long)xfer_resid) {
                bytes_to_zero = (int)zero_cnt;
            } else {
                bytes_to_zero = xfer_resid;
            }

            bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

            xfer_resid -= bytes_to_zero;
            zero_cnt -= bytes_to_zero;
            zero_off += bytes_to_zero;
            io_offset += bytes_to_zero;
        }
        if (xfer_resid && io_resid) {
            u_int32_t io_requested;

            bytes_to_move = min(io_resid, xfer_resid);
            io_requested = bytes_to_move;

            retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

            if (retval) {
                ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                    upl, 0, 0, retval, 0);
            } else {
                io_resid -= bytes_to_move;
                xfer_resid -= bytes_to_move;
                io_offset += bytes_to_move;
            }
        }
        while (xfer_resid && zero_cnt1 && retval == 0) {
            if (zero_cnt1 < (long long)xfer_resid) {
                bytes_to_zero = (int)zero_cnt1;
            } else {
                bytes_to_zero = xfer_resid;
            }

            bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

            xfer_resid -= bytes_to_zero;
            zero_cnt1 -= bytes_to_zero;
            zero_off1 += bytes_to_zero;
            io_offset += bytes_to_zero;
        }
        if (retval == 0) {
            int do_zeroing = 1;

            io_size += start_offset;

            /* Force more restrictive zeroing behavior only on APFS */
            if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
                do_zeroing = 0;
            }

            if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
                /*
                 * if we're extending the file with this write
                 * we'll zero fill the rest of the page so that
                 * if the file gets extended again in such a way as to leave a
                 * hole starting at this EOF, we'll have zero's in the correct spot
                 */
                cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
            }
            /*
             * release the upl now if we hold one since...
             * 1) pages in it may be present in the sparse cluster map
             *    and may span 2 separate buckets there... if they do and
             *    we happen to have to flush a bucket to make room and it intersects
             *    this upl, a deadlock may result on page BUSY
             * 2) we're delaying the I/O... from this point forward we're just updating
             *    the cluster state... no need to hold the pages, so commit them
             * 3) IO_SYNC is set...
             *    because we had to ask for a UPL that provides currenty non-present pages, the
             *    UPL has been automatically set to clear the dirty flags (both software and hardware)
             *    upon committing it... this is not the behavior we want since it's possible for
             *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
             *    we'll pick these pages back up later with the correct behavior specified.
             * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
             *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
             *    we hold since the flushing context is holding the cluster lock.
             */
            ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
check_cluster:
            /*
             * calculate the last logical block number
             * that this delayed I/O encompassed
             */
            cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

            if (flags & IO_SYNC) {
                /*
                 * if the IO_SYNC flag is set than we need to bypass
                 * any clustering and immediately issue the I/O
                 *
                 * we don't hold the lock at this point
                 *
                 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
                 * so that we correctly deal with a change in state of the hardware modify bit...
                 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
                 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
                 * responsible for generating the correct sized I/O(s)
                 */
                retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
            } else {
                boolean_t defer_writes = FALSE;

                if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
                    defer_writes = TRUE;
                }

                cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
                    write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
            }
        }
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);

    return retval;
}
int
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
    return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
}
int
cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int           retval = 0;
    int           flags;
    user_ssize_t  cur_resid;
    u_int32_t     io_size;
    u_int32_t     read_length = 0;
    int           read_type = IO_COPY;

    flags = xflags;

    if (vp->v_flag & VNOCACHE_DATA) {
        flags |= IO_NOCACHE;
    }
    if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
        flags |= IO_RAOFF;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        flags |= IO_ENCRYPTED;
    }

    /*
     * do a read through the cache if one of the following is true....
     *   NOCACHE is not true
     *   the uio request doesn't target USERSPACE
     * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
     * Reading encrypted data from a CP filesystem should never result in the data touching
     * the UBC.
     *
     * otherwise, find out if we want the direct or contig variant for
     * the first vector in the uio request
     */
    if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
        retval = cluster_io_type(uio, &read_type, &read_length, 0);
    }

    while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
        switch (read_type) {
        case IO_COPY:
            /*
             * make sure the uio_resid isn't too big...
             * internally, we want to handle all of the I/O in
             * chunk sizes that fit in a 32 bit int
             */
            if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
                io_size = MAX_IO_REQUEST_SIZE;
            } else {
                io_size = (u_int32_t)cur_resid;
            }

            retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
            break;

        case IO_DIRECT:
            retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
            break;

        case IO_CONTIG:
            retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
            break;

        case IO_UNKNOWN:
            retval = cluster_io_type(uio, &read_type, &read_length, 0);
            break;
        }
    }
    return retval;
}
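/*
 * Illustrative sketch (hypothetical, not part of the original file): the
 * typical shape of a filesystem read path built on cluster_read().  The helper
 * name and any locking are placeholders; ubc_getsize() is assumed to supply
 * the EOF that cluster_read() uses to clip the transfer.
 */
#if 0
static int
example_fs_read(vnode_t vp, struct uio *uio, int xflags)
{
    off_t filesize = ubc_getsize(vp);

    /* cluster_read copies from the UBC when it can and pages in the rest */
    return cluster_read(vp, uio, filesize, xflags);
}
#endif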
static void
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
{
    int range;
    int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

    if ((range = last_pg - start_pg)) {
        if (take_reference) {
            abort_flags |= UPL_ABORT_REFERENCE;
        }

        ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
    }
}
3902 cluster_read_copy(vnode_t vp
, struct uio
*uio
, u_int32_t io_req_size
, off_t filesize
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
3904 upl_page_info_t
*pl
;
3906 vm_offset_t upl_offset
;
3915 off_t last_ioread_offset
;
3916 off_t last_request_offset
;
3920 u_int32_t size_of_prefetch
;
3923 u_int32_t max_rd_size
;
3924 u_int32_t max_io_size
;
3925 u_int32_t max_prefetch
;
3926 u_int rd_ahead_enabled
= 1;
3927 u_int prefetch_enabled
= 1;
3928 struct cl_readahead
* rap
;
3929 struct clios iostate
;
3930 struct cl_extent extent
;
3932 int take_reference
= 1;
3933 int policy
= IOPOL_DEFAULT
;
3934 boolean_t iolock_inited
= FALSE
;
3936 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
3937 (int)uio
->uio_offset
, io_req_size
, (int)filesize
, flags
, 0);
3939 if (flags
& IO_ENCRYPTED
) {
3940 panic("encrypted blocks will hit UBC!");
3943 policy
= throttle_get_io_policy(NULL
);
3945 if (policy
== THROTTLE_LEVEL_TIER3
|| policy
== THROTTLE_LEVEL_TIER2
|| (flags
& IO_NOCACHE
)) {
3949 if (flags
& IO_PASSIVE
) {
3955 if (flags
& IO_NOCACHE
) {
3956 bflag
|= CL_NOCACHE
;
3959 if (flags
& IO_SKIP_ENCRYPTION
) {
3960 bflag
|= CL_ENCRYPTED
;
3963 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
3964 max_prefetch
= MAX_PREFETCH(vp
, max_io_size
, disk_conditioner_mount_is_ssd(vp
->v_mount
));
3965 max_rd_size
= max_prefetch
;
3967 last_request_offset
= uio
->uio_offset
+ io_req_size
;
3969 if (last_request_offset
> filesize
) {
3970 last_request_offset
= filesize
;
3973 if ((flags
& (IO_RAOFF
| IO_NOCACHE
)) || ((last_request_offset
& ~PAGE_MASK_64
) == (uio
->uio_offset
& ~PAGE_MASK_64
))) {
3974 rd_ahead_enabled
= 0;
3977 if (cluster_is_throttled(vp
)) {
3979 * we're in the throttle window, at the very least
3980 * we want to limit the size of the I/O we're about
3983 rd_ahead_enabled
= 0;
3984 prefetch_enabled
= 0;
3986 max_rd_size
= THROTTLE_MAX_IOSIZE
;
3988 if ((rap
= cluster_get_rap(vp
)) == NULL
) {
3989 rd_ahead_enabled
= 0;
3991 extent
.b_addr
= uio
->uio_offset
/ PAGE_SIZE_64
;
3992 extent
.e_addr
= (last_request_offset
- 1) / PAGE_SIZE_64
;
3995 if (rap
!= NULL
&& rap
->cl_ralen
&& (rap
->cl_lastr
== extent
.b_addr
|| (rap
->cl_lastr
+ 1) == extent
.b_addr
)) {
3997 * determine if we already have a read-ahead in the pipe courtesy of the
3998 * last read systemcall that was issued...
3999 * if so, pick up it's extent to determine where we should start
4000 * with respect to any read-ahead that might be necessary to
4001 * garner all the data needed to complete this read systemcall
4003 last_ioread_offset
= (rap
->cl_maxra
* PAGE_SIZE_64
) + PAGE_SIZE_64
;
4005 if (last_ioread_offset
< uio
->uio_offset
) {
4006 last_ioread_offset
= (off_t
)0;
4007 } else if (last_ioread_offset
> last_request_offset
) {
4008 last_ioread_offset
= last_request_offset
;
4011 last_ioread_offset
= (off_t
)0;
4014 while (io_req_size
&& uio
->uio_offset
< filesize
&& retval
== 0) {
4015 max_size
= filesize
- uio
->uio_offset
;
4016 bool leftover_upl_aborted
= false;
4018 if ((off_t
)(io_req_size
) < max_size
) {
4019 io_size
= io_req_size
;
4021 io_size
= (u_int32_t
)max_size
;
4024 if (!(flags
& IO_NOCACHE
)) {
4027 u_int32_t io_requested
;
4030 * if we keep finding the pages we need already in the cache, then
4031 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4032 * to determine that we have all the pages we need... once we miss in
4033 * the cache and have issued an I/O, than we'll assume that we're likely
4034 * to continue to miss in the cache and it's to our advantage to try and prefetch
4036 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (u_int32_t
)(last_request_offset
- last_ioread_offset
))) {
4037 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
4039 * we've already issued I/O for this request and
4040 * there's still work to do and
4041 * our prefetch stream is running dry, so issue a
4042 * pre-fetch I/O... the I/O latency will overlap
4043 * with the copying of the data
4045 if (size_of_prefetch
> max_rd_size
) {
4046 size_of_prefetch
= max_rd_size
;
4049 size_of_prefetch
= cluster_read_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, callback
, callback_arg
, bflag
);
4051 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
4053 if (last_ioread_offset
> last_request_offset
) {
4054 last_ioread_offset
= last_request_offset
;
4059 * limit the size of the copy we're about to do so that
4060 * we can notice that our I/O pipe is running dry and
4061 * get the next I/O issued before it does go dry
4063 if (last_ioread_offset
&& io_size
> (max_io_size
/ 4)) {
4064 io_resid
= (max_io_size
/ 4);
4069 io_requested
= io_resid
;
4071 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_resid
, 0, take_reference
);
4073 xsize
= io_requested
- io_resid
;
4076 io_req_size
-= xsize
;
4078 if (retval
|| io_resid
) {
4080 * if we run into a real error or
4081 * a page that is not in the cache
4082 * we need to leave streaming mode
4087 if (rd_ahead_enabled
&& (io_size
== 0 || last_ioread_offset
== last_request_offset
)) {
4089 * we're already finished the I/O for this read request
4090 * let's see if we should do a read-ahead
4092 cluster_read_ahead(vp
, &extent
, filesize
, rap
, callback
, callback_arg
, bflag
);
4100 if (extent
.e_addr
< rap
->cl_lastr
) {
4103 rap
->cl_lastr
= extent
.e_addr
;
4108 * recompute max_size since cluster_copy_ubc_data_internal
4109 * may have advanced uio->uio_offset
4111 max_size
= filesize
- uio
->uio_offset
;
4114 iostate
.io_completed
= 0;
4115 iostate
.io_issued
= 0;
4116 iostate
.io_error
= 0;
4117 iostate
.io_wanted
= 0;
4119 if ((flags & IO_RETURN_ON_THROTTLE)) {
4120 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4121 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4123 * we're in the throttle window and at least 1 I/O
4124 * has already been issued by a throttleable thread
4125 * in this window, so return with EAGAIN to indicate
4126 * to the FS issuing the cluster_read call that it
4127 * should now throttle after dropping any locks
4129 throttle_info_update_by_mount(vp->v_mount);
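/*
 * illustrative sketch (not part of this file): a filesystem that passes
 * IO_RETURN_ON_THROTTLE is expected to treat the EAGAIN produced here as
 * "drop your locks and wait out the throttle window", roughly:
 *
 *	error = cluster_read_ext(vp, uio, filesize, ioflag | IO_RETURN_ON_THROTTLE, callback, callback_arg);
 *	if (error == EAGAIN) {
 *		fs_unlock(cp);             // hypothetical FS-specific unlock
 *		throttle_lowpri_io(1);     // block until the throttle window passes
 *		goto retry;
 *	}
 */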
4138 * compute the size of the upl needed to encompass
4139 * the requested read... limit each call to cluster_io
4140 * to the maximum UPL size... cluster_io will clip if
4141 * this exceeds the maximum io_size for the device,
4142 * make sure to account for
4143 * a starting offset that's not page aligned
4145 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4146 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4148 if (io_size > max_rd_size) {
4149 io_size = max_rd_size;
4152 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4154 if (flags & IO_NOCACHE) {
4155 if (upl_size > max_io_size) {
4156 upl_size = max_io_size;
4159 if (upl_size > max_io_size / 4) {
4160 upl_size = max_io_size / 4;
4161 upl_size &= ~PAGE_MASK;
4163 if (upl_size == 0) {
4164 upl_size = PAGE_SIZE;
4168 pages_in_upl = upl_size / PAGE_SIZE;
4170 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4171 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4173 kret = ubc_create_upl_kernel(vp,
4178 UPL_FILE_IO | UPL_SET_LITE,
4179 VM_KERN_MEMORY_FILE);
4180 if (kret != KERN_SUCCESS) {
4181 panic("cluster_read_copy: failed to get pagelist");
4184 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4185 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4188 * scan from the beginning of the upl looking for the first
4189 * non-valid page.... this will become the first page in
4190 * the request we're going to make to 'cluster_io'... if all
4191 * of the pages are valid, we won't call through to 'cluster_io'
4193 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4194 if (!upl_valid_page(pl, start_pg)) {
4200 * scan from the starting invalid page looking for a valid
4201 * page before the end of the upl is reached, if we
4202 * find one, then it will be the last page of the request to
4205 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4206 if (upl_valid_page(pl, last_pg)) {
4211 if (start_pg < last_pg) {
4213 * we found a range of 'invalid' pages that must be filled
4214 * if the last page in this range is the last page of the file
4215 * we may have to clip the size of it to keep from reading past
4216 * the end of the last physical block associated with the file
4218 if (iolock_inited == FALSE) {
4219 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4221 iolock_inited = TRUE;
4223 upl_offset = start_pg * PAGE_SIZE;
4224 io_size = (last_pg - start_pg) * PAGE_SIZE;
4226 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4227 io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4231 * Find out if this needs verification; we'll have to manage the UPL
4232 * differently if so. Note that this call only lets us know if
4233 * verification is enabled on this mount point, the actual verification
4234 * is performed in the file system.
4236 size_t verify_block_size = 0;
4237 if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4238 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4239 if (!upl_valid_page(pl, uio_last)) {
4243 if (uio_last < pages_in_upl) {
4245 * there were some invalid pages beyond the valid pages
4246 * that we didn't issue an I/O for, just release them
4247 * unchanged now, so that any prefetch/read-ahead can
4250 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4251 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4252 leftover_upl_aborted = true;
4257 * issue an asynchronous read to cluster_io
4260 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4261 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4264 if (extent.e_addr < rap->cl_maxra) {
4266 * we've just issued a read for a block that should have been
4267 * in the cache courtesy of the read-ahead engine... something
4268 * has gone wrong with the pipeline, so reset the read-ahead
4269 * logic which will cause us to restart from scratch
4277 * if the read completed successfully, or there was no I/O request
4278 * issued, then copy the data into user land via 'cluster_upl_copy_data'
4279 * we'll first add on any 'valid'
4280 * pages that were present in the upl when we acquired it.
4284 if (!leftover_upl_aborted) {
4285 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4286 if (!upl_valid_page(pl, uio_last)) {
4290 if (uio_last < pages_in_upl) {
4292 * there were some invalid pages beyond the valid pages
4293 * that we didn't issue an I/O for, just release them
4294 * unchanged now, so that any prefetch/read-ahead can
4297 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4298 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4303 * compute size to transfer this round, if io_req_size is
4304 * still non-zero after this attempt, we'll loop around and
4305 * set up for another I/O.
4307 val_size = (uio_last * PAGE_SIZE) - start_offset;
4309 if (val_size > max_size) {
4310 val_size = (u_int)max_size;
4313 if (val_size > io_req_size) {
4314 val_size = io_req_size;
4317 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4318 last_ioread_offset = uio->uio_offset + val_size;
4321 if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4322 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4324 * if there's still I/O left to do for this request, and...
4325 * we're not in hard throttle mode, and...
4326 * we're close to using up the previous prefetch, then issue a
4327 * new pre-fetch I/O... the I/O latency will overlap
4328 * with the copying of the data
4330 if (size_of_prefetch > max_rd_size) {
4331 size_of_prefetch = max_rd_size;
4334 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4336 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4338 if (last_ioread_offset > last_request_offset) {
4339 last_ioread_offset = last_request_offset;
4342 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4344 * this transfer will finish this request, so...
4345 * let's try to read ahead if we're in
4346 * a sequential access pattern and we haven't
4347 * explicitly disabled it
4349 if (rd_ahead_enabled) {
4350 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4354 if (extent.e_addr < rap->cl_lastr) {
4357 rap->cl_lastr = extent.e_addr;
4360 if (iolock_inited == TRUE) {
4361 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4364 if (iostate.io_error) {
4365 error = iostate.io_error;
4367 u_int32_t io_requested;
4369 io_requested = val_size;
4371 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4373 io_req_size -= (val_size - io_requested);
4376 if (iolock_inited == TRUE) {
4377 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4380 if (start_pg < last_pg) {
4382 * compute the range of pages that we actually issued an I/O for
4383 * and either commit them as valid if the I/O succeeded
4384 * or abort them if the I/O failed or we're not supposed to
4385 * keep them in the cache
4387 io_size = (last_pg - start_pg) * PAGE_SIZE;
4389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4391 if (error || (flags & IO_NOCACHE)) {
4392 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4393 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4395 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4397 if (take_reference) {
4398 commit_flags |= UPL_COMMIT_INACTIVATE;
4400 commit_flags |= UPL_COMMIT_SPECULATE;
4403 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
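/*
 * note on the commit above: when take_reference is set the pages are aged
 * onto the inactive queue (UPL_COMMIT_INACTIVATE); otherwise they are
 * committed as speculative (UPL_COMMIT_SPECULATE), which lets the VM reclaim
 * read-ahead pages cheaply if they are never touched again
 */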
4405 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4407 if ((last_pg - start_pg) < pages_in_upl) {
4409 * the set of pages that we issued an I/O for did not encompass
4410 * the entire upl... so just release these without modifying
4414 if (leftover_upl_aborted) {
4415 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4416 UPL_ABORT_FREE_ON_EMPTY);
4418 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4422 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4425 * handle any valid pages at the beginning of
4426 * the upl... release these appropriately
4428 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4431 * handle any valid pages immediately after the
4432 * pages we issued I/O for... release these appropriately
4434 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4436 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4444 if (cluster_is_throttled(vp)) {
4446 * we're in the throttle window, at the very least
4447 * we want to limit the size of the I/O we're about
4450 rd_ahead_enabled = 0;
4451 prefetch_enabled = 0;
4452 max_rd_size = THROTTLE_MAX_IOSIZE;
4454 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4456 * coming out of throttled state
4458 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4460 rd_ahead_enabled = 1;
4462 prefetch_enabled = 1;
4464 max_rd_size = max_prefetch;
4465 last_ioread_offset = 0;
4470 if (iolock_inited == TRUE) {
4472 * cluster_io returned an error after it
4473 * had already issued some I/O. we need
4474 * to wait for that I/O to complete before
4475 * we can destroy the iostate mutex...
4476 * 'retval' already contains the early error
4477 * so no need to pick it up from iostate.io_error
4479 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4481 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4485 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4487 lck_mtx_unlock(&rap->cl_lockr);
4489 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4490 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4497 * We don't want another read/write lock for every vnode in the system
4498 * so we keep a hash of them here. There should never be very many of
4499 * these around at any point in time.
4501 cl_direct_read_lock_t *
4502 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4504 struct cl_direct_read_locks *head
4505 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4506 % CL_DIRECT_READ_LOCK_BUCKETS];
4508 struct cl_direct_read_lock *lck, *new_lck = NULL;
4511 lck_spin_lock(&cl_direct_read_spin_lock);
4513 LIST_FOREACH(lck, head, chain) {
4514 if (lck->vp == vp) {
4516 lck_spin_unlock(&cl_direct_read_spin_lock);
4518 // Someone beat us to it, ditch the allocation
4519 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4520 kheap_free(KHEAP_DEFAULT, new_lck, sizeof(cl_direct_read_lock_t));
4522 lck_rw_lock(&lck->rw_lock, type);
4528 // Use the lock we allocated
4529 LIST_INSERT_HEAD(head, new_lck, chain);
4530 lck_spin_unlock(&cl_direct_read_spin_lock);
4531 lck_rw_lock(&new_lck->rw_lock, type);
4535 lck_spin_unlock(&cl_direct_read_spin_lock);
4537 // Allocate a new lock
4538 new_lck = kheap_alloc(KHEAP_DEFAULT, sizeof(cl_direct_read_lock_t),
4540 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4542 new_lck->ref_count = 1;
4544 // Got to go round again
4549 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4551 lck_rw_done(&lck->rw_lock);
4553 lck_spin_lock(&cl_direct_read_spin_lock);
4554 if (lck->ref_count == 1) {
4555 LIST_REMOVE(lck, chain);
4556 lck_spin_unlock(&cl_direct_read_spin_lock);
4557 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4558 kheap_free(KHEAP_DEFAULT, lck, sizeof(cl_direct_read_lock_t));
4561 lck_spin_unlock(&cl_direct_read_spin_lock);
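/*
 * typical pairing of the two routines above (sketch of the usage that
 * appears below in cluster_read_direct):
 *
 *	cl_direct_read_lock_t *lock;
 *
 *	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
 *	// ... check the cache with ubc_range_op() and issue the direct I/O ...
 *	cluster_unlock_direct_read(lock);
 */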
4566 cluster_read_direct(vnode_t vp
, struct uio
*uio
, off_t filesize
, int *read_type
, u_int32_t
*read_length
,
4567 int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
4570 upl_page_info_t
*pl
;
4572 vm_offset_t upl_offset
, vector_upl_offset
= 0;
4573 upl_size_t upl_size
, vector_upl_size
= 0;
4574 vm_size_t upl_needed_size
;
4575 unsigned int pages_in_pl
;
4576 upl_control_flags_t upl_flags
;
4579 int force_data_sync
;
4581 int no_zero_fill
= 0;
4584 struct clios iostate
;
4585 user_addr_t iov_base
;
4586 u_int32_t io_req_size
;
4587 u_int32_t offset_in_file
;
4588 u_int32_t offset_in_iovbase
;
4592 u_int32_t devblocksize
;
4593 u_int32_t mem_alignment_mask
;
4594 u_int32_t max_upl_size
;
4595 u_int32_t max_rd_size
;
4596 u_int32_t max_rd_ahead
;
4597 u_int32_t max_vector_size
;
4598 boolean_t io_throttled
= FALSE
;
4600 u_int32_t vector_upl_iosize
= 0;
4601 int issueVectorUPL
= 0, useVectorUPL
= (uio
->uio_iovcnt
> 1);
4602 off_t v_upl_uio_offset
= 0;
4603 int vector_upl_index
= 0;
4604 upl_t vector_upl
= NULL
;
4605 cl_direct_read_lock_t
*lock
= NULL
;
4607 user_addr_t orig_iov_base
= 0;
4608 user_addr_t last_iov_base
= 0;
4609 user_addr_t next_iov_base
= 0;
4611 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT
);
4613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_START
,
4614 (int)uio
->uio_offset
, (int)filesize
, *read_type
, *read_length
, 0);
4616 max_upl_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
4618 max_rd_size
= max_upl_size
;
4619 max_rd_ahead
= max_rd_size
* IO_SCALE(vp
, 2);
4621 io_flag
= CL_COMMIT
| CL_READ
| CL_ASYNC
| CL_NOZERO
| CL_DIRECT_IO
;
4623 if (flags
& IO_PASSIVE
) {
4624 io_flag
|= CL_PASSIVE
;
4627 if (flags
& IO_ENCRYPTED
) {
4628 io_flag
|= CL_RAW_ENCRYPTED
;
4631 if (flags
& IO_NOCACHE
) {
4632 io_flag
|= CL_NOCACHE
;
4635 if (flags
& IO_SKIP_ENCRYPTION
) {
4636 io_flag
|= CL_ENCRYPTED
;
4639 iostate
.io_completed
= 0;
4640 iostate
.io_issued
= 0;
4641 iostate
.io_error
= 0;
4642 iostate
.io_wanted
= 0;
4644 lck_mtx_init(&iostate
.io_mtxp
, &cl_mtx_grp
, LCK_ATTR_NULL
);
4646 devblocksize
= (u_int32_t
)vp
->v_mount
->mnt_devblocksize
;
4647 mem_alignment_mask
= (u_int32_t
)vp
->v_mount
->mnt_alignmentmask
;
4649 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_NONE
,
4650 (int)devblocksize
, (int)mem_alignment_mask
, 0, 0, 0);
4652 if (devblocksize
== 1) {
4654 * the AFP client advertises a devblocksize of 1
4655 * however, its BLOCKMAP routine maps to physical
4656 * blocks that are PAGE_SIZE in size...
4657 * therefore we can't ask for I/Os that aren't page aligned
4658 * or aren't multiples of PAGE_SIZE in size
4659 * by setting devblocksize to PAGE_SIZE, we re-instate
4660 * the old behavior we had before the mem_alignment_mask
4661 * changes went in...
4663 devblocksize
= PAGE_SIZE
;
4666 orig_iov_base
= uio_curriovbase(uio
);
4667 last_iov_base
= orig_iov_base
;
4670 io_req_size
= *read_length
;
4671 iov_base
= uio_curriovbase(uio
);
4673 offset_in_file
= (u_int32_t
)uio
->uio_offset
& (devblocksize
- 1);
4674 offset_in_iovbase
= (u_int32_t
)iov_base
& mem_alignment_mask
;
4676 if (vm_map_page_mask(current_map()) < PAGE_MASK
) {
4679 * Direct I/O might not work as expected from a 16k kernel space
4680 * to a 4k user space because each 4k chunk might point to
4681 * a different 16k physical page...
4682 * Let's go the "misaligned" way.
4685 DEBUG4K_VFS("forcing misaligned\n");
4690 if (offset_in_file
|| offset_in_iovbase
) {
4692 * one of the 2 important offsets is misaligned
4693 * so fire an I/O through the cache for this entire vector
4697 if (iov_base
& (devblocksize
- 1)) {
4699 * the offset in memory must be on a device block boundary
4700 * so that we can guarantee that we can generate an
4701 * I/O that ends on a page boundary in cluster_io
4706 max_io_size
= filesize
- uio
->uio_offset
;
4709 * The user must request IO in aligned chunks. If the
4710 * offset into the file is bad, or the userland pointer
4711 * is non-aligned, then we cannot service the encrypted IO request.
4713 if (flags
& IO_ENCRYPTED
) {
4714 if (misaligned
|| (io_req_size
& (devblocksize
- 1))) {
4718 max_io_size
= roundup(max_io_size
, devblocksize
);
4721 if ((off_t
)io_req_size
> max_io_size
) {
4722 io_req_size
= (u_int32_t
)max_io_size
;
4726 * When we get to this point, we know...
4727 * -- the offset into the file is on a devblocksize boundary
4730 while (io_req_size
&& retval
== 0) {
4733 if (cluster_is_throttled(vp
)) {
4735 * we're in the throttle window, at the very least
4736 * we want to limit the size of the I/O we're about
4739 max_rd_size
= THROTTLE_MAX_IOSIZE
;
4740 max_rd_ahead
= THROTTLE_MAX_IOSIZE
- 1;
4741 max_vector_size
= THROTTLE_MAX_IOSIZE
;
4743 max_rd_size
= max_upl_size
;
4744 max_rd_ahead
= max_rd_size
* IO_SCALE(vp
, 2);
4745 max_vector_size
= MAX_VECTOR_UPL_SIZE
;
4747 io_start
= io_size
= io_req_size
;
4750 * First look for pages already in the cache
4751 * and move them to user space. But only do this
4752 * check if we are not retrieving encrypted data directly
4753 * from the filesystem; those blocks should never
4756 * cluster_copy_ubc_data returns the resid
4759 if ((flags
& IO_ENCRYPTED
) == 0) {
4760 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_size
, 0, 0);
4763 * calculate the number of bytes actually copied
4764 * starting size - residual
4766 xsize
= io_start
- io_size
;
4768 io_req_size
-= xsize
;
4770 if (useVectorUPL
&& (xsize
|| (iov_base
& PAGE_MASK
))) {
4772 * We found something in the cache or we have an iov_base that's not
4775 * Issue all I/O's that have been collected within this Vectored UPL.
4777 if (vector_upl_index
) {
4778 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
4779 reset_vector_run_state();
4787 * After this point, if we are using the Vector UPL path and the base is
4788 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4793 * check to see if we are finished with this request.
4795 * If we satisfied this IO already, then io_req_size will be 0.
4796 * Otherwise, see if the IO was mis-aligned and needs to go through
4797 * the UBC to deal with the 'tail'.
4800 if (io_req_size
== 0 || (misaligned
)) {
4802 * see if there's another uio vector to
4803 * process that's of type IO_DIRECT
4805 * break out of while loop to get there
4810 * assume the request ends on a device block boundary
4812 io_min
= devblocksize
;
4815 * we can handle I/O's in multiples of the device block size
4816 * however, if io_size isn't a multiple of devblocksize we
4817 * want to clip it back to the nearest page boundary since
4818 * we are going to have to go through cluster_read_copy to
4819 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4820 * multiple, we avoid asking the drive for the same physical
4821 * blocks twice.. once for the partial page at the end of the
4822 * request and a 2nd time for the page we read into the cache
4823 * (which overlaps the end of the direct read) in order to
4824 * get at the overhang bytes
4826 if (io_size
& (devblocksize
- 1)) {
4827 assert(!(flags
& IO_ENCRYPTED
));
4829 * Clip the request to the previous page size boundary
4830 * since request does NOT end on a device block boundary
4832 io_size
&= ~PAGE_MASK
;
4835 if (retval
|| io_size
< io_min
) {
4837 * either an error or we only have the tail left to
4838 * complete via the copy path...
4839 * we may have already spun some portion of this request
4840 * off as async requests... we need to wait for the I/O
4841 * to complete before returning
4843 goto wait_for_dreads
;
4847 * Don't re-check the UBC data if we are looking for uncached IO
4848 * or asking for encrypted blocks.
4850 if ((flags
& IO_ENCRYPTED
) == 0) {
4851 if ((xsize
= io_size
) > max_rd_size
) {
4852 xsize
= max_rd_size
;
4859 * We hold a lock here between the time we check the
4860 * cache and the time we issue I/O. This saves us
4861 * from having to lock the pages in the cache. Not
4862 * all clients will care about this lock but some
4863 * clients may want to guarantee stability between
4864 * here and when the I/O is issued in which case they
4865 * will take the lock exclusively.
4867 lock
= cluster_lock_direct_read(vp
, LCK_RW_TYPE_SHARED
);
4870 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ xsize
, UPL_ROP_ABSENT
, (int *)&io_size
);
4874 * a page must have just come into the cache
4875 * since the first page in this range is no
4876 * longer absent, go back and re-evaluate
4881 if ((flags
& IO_RETURN_ON_THROTTLE
)) {
4882 if (cluster_is_throttled(vp
) == THROTTLE_NOW
) {
4883 if (!cluster_io_present_in_BC(vp
, uio
->uio_offset
)) {
4885 * we're in the throttle window and at least 1 I/O
4886 * has already been issued by a throttleable thread
4887 * in this window, so return with EAGAIN to indicate
4888 * to the FS issuing the cluster_read call that it
4889 * should now throttle after dropping any locks
4891 throttle_info_update_by_mount(vp
->v_mount
);
4893 io_throttled
= TRUE
;
4894 goto wait_for_dreads
;
4898 if (io_size
> max_rd_size
) {
4899 io_size
= max_rd_size
;
4902 iov_base
= uio_curriovbase(uio
);
4904 upl_offset
= (vm_offset_t
)((u_int32_t
)iov_base
& PAGE_MASK
);
4905 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
4907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_START
,
4908 (int)upl_offset
, upl_needed_size
, (int)iov_base
, io_size
, 0);
4910 if (upl_offset
== 0 && ((io_size
& PAGE_MASK
) == 0)) {
4916 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
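/*
 * the loop below makes up to three attempts to build a UPL in which every
 * page is present; when a page is missing it aborts the UPL and retries
 * with UPL_FORCE_DATA_SYNC set, and after the third failure it gives up
 * and falls through to wait_for_dreads
 */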
4917 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
4919 upl_size
= (upl_size_t
)upl_needed_size
;
4920 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
4922 upl_flags
|= UPL_NOZEROFILL
;
4924 if (force_data_sync
) {
4925 upl_flags
|= UPL_FORCE_DATA_SYNC
;
4928 kret
= vm_map_create_upl(map
,
4929 (vm_map_offset_t
)(iov_base
& ~((user_addr_t
)PAGE_MASK
)),
4930 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, VM_KERN_MEMORY_FILE
);
4932 if (kret
!= KERN_SUCCESS
) {
4933 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4934 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4936 * failed to get pagelist
4938 * we may have already spun some portion of this request
4939 * off as async requests... we need to wait for the I/O
4940 * to complete before returning
4942 goto wait_for_dreads
;
4944 pages_in_pl
= upl_size
/ PAGE_SIZE
;
4945 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
4947 for (i
= 0; i
< pages_in_pl
; i
++) {
4948 if (!upl_page_present(pl
, i
)) {
4952 if (i
== pages_in_pl
) {
4956 ubc_upl_abort(upl
, 0);
4958 if (force_data_sync
>= 3) {
4959 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4960 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4962 goto wait_for_dreads
;
4965 * Consider the possibility that upl_size wasn't satisfied.
4967 if (upl_size
< upl_needed_size
) {
4968 if (upl_size
&& upl_offset
== 0) {
4975 ubc_upl_abort(upl
, 0);
4976 goto wait_for_dreads
;
4978 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4979 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4982 vm_offset_t end_off
= ((iov_base
+ io_size
) & PAGE_MASK
);
4987 * After this point, if we are using a vector UPL, then
4988 * either all the UPL elements end on a page boundary OR
4989 * this UPL is the last element because it does not end
4990 * on a page boundary.
4995 * request asynchronously so that we can overlap
4996 * the preparation of the next I/O
4997 * if there are already too many outstanding reads
4998 * wait until some have completed before issuing the next read
5000 cluster_iostate_wait(&iostate
, max_rd_ahead
, "cluster_read_direct");
5002 if (iostate
.io_error
) {
5004 * one of the earlier reads we issued ran into a hard error
5005 * don't issue any more reads, cleanup the UPL
5006 * that was just created but not used, then
5007 * go wait for any other reads to complete before
5008 * returning the error to the caller
5010 ubc_upl_abort(upl
, 0);
5012 goto wait_for_dreads
;
5014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_START
,
5015 upl
, (int)upl_offset
, (int)uio
->uio_offset
, io_size
, 0);
5017 if (!useVectorUPL
) {
5019 io_flag
&= ~CL_PRESERVE
;
5021 io_flag
|= CL_PRESERVE
;
5024 retval
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, io_size
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5026 if (!vector_upl_index
) {
5027 vector_upl
= vector_upl_create(upl_offset
);
5028 v_upl_uio_offset
= uio
->uio_offset
;
5029 vector_upl_offset
= upl_offset
;
5032 vector_upl_set_subupl(vector_upl
, upl
, upl_size
);
5033 vector_upl_set_iostate(vector_upl
, upl
, vector_upl_size
, upl_size
);
5035 vector_upl_size
+= upl_size
;
5036 vector_upl_iosize
+= io_size
;
5038 if (issueVectorUPL
|| vector_upl_index
== MAX_VECTOR_UPL_ELEMENTS
|| vector_upl_size
>= max_vector_size
) {
5039 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5040 reset_vector_run_state();
5043 last_iov_base
= iov_base
+ io_size
;
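/*
 * remember how far into the user buffer this pass reached; the range
 * [orig_iov_base, last_iov_base) is pre-faulted once the I/O is done
 * (see the vm_pre_fault() loop below) for pmap accounting purposes
 */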
5046 // We don't need to wait for the I/O to complete
5047 cluster_unlock_direct_read(lock
);
5052 * update the uio structure
5054 if ((flags
& IO_ENCRYPTED
) && (max_io_size
< io_size
)) {
5055 uio_update(uio
, (user_size_t
)max_io_size
);
5057 uio_update(uio
, (user_size_t
)io_size
);
5060 io_req_size
-= io_size
;
5062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_END
,
5063 upl
, (int)uio
->uio_offset
, io_req_size
, retval
, 0);
5066 if (retval
== 0 && iostate
.io_error
== 0 && io_req_size
== 0 && uio
->uio_offset
< filesize
) {
5067 retval
= cluster_io_type(uio
, read_type
, read_length
, 0);
5069 if (retval
== 0 && *read_type
== IO_DIRECT
) {
5070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_NONE
,
5071 (int)uio
->uio_offset
, (int)filesize
, *read_type
, *read_length
, 0);
5079 if (retval
== 0 && iostate
.io_error
== 0 && useVectorUPL
&& vector_upl_index
) {
5080 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5081 reset_vector_run_state();
5084 // We don't need to wait for the I/O to complete
5086 cluster_unlock_direct_read(lock
);
5090 * make sure all async reads that are part of this stream
5091 * have completed before we return
5093 cluster_iostate_wait(&iostate
, 0, "cluster_read_direct");
5095 if (iostate
.io_error
) {
5096 retval
= iostate
.io_error
;
5099 lck_mtx_destroy(&iostate
.io_mtxp
, &cl_mtx_grp
);
5101 if (io_throttled
== TRUE
&& retval
== 0) {
5105 vm_map_offset_t current_page_size
, current_page_mask
;
5106 current_page_size
= vm_map_page_size(current_map());
5107 current_page_mask
= vm_map_page_mask(current_map());
5108 for (next_iov_base
= orig_iov_base
;
5109 next_iov_base
< last_iov_base
;
5110 next_iov_base
+= current_page_size
) {
5112 * This is specifically done for pmap accounting purposes.
5113 * vm_pre_fault() will call vm_fault() to enter the page into
5114 * the pmap if there isn't _a_ physical page for that VA already.
5116 vm_pre_fault(vm_map_trunc_page(next_iov_base
, current_page_mask
), VM_PROT_READ
);
5119 if (io_req_size
&& retval
== 0) {
5121 * we couldn't handle the tail of this request in DIRECT mode
5122 * so fire it through the copy path
5124 if (flags
& IO_ENCRYPTED
) {
5126 * We cannot fall back to the copy path for encrypted I/O. If this
5127 * happens, there is something wrong with the user buffer passed
5132 retval
= cluster_read_copy(vp
, uio
, io_req_size
, filesize
, flags
, callback
, callback_arg
);
5135 *read_type
= IO_UNKNOWN
;
5137 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_END
,
5138 (int)uio
->uio_offset
, (int)uio_resid(uio
), io_req_size
, retval
, 0);
5145 cluster_read_contig(vnode_t vp
, struct uio
*uio
, off_t filesize
, int *read_type
, u_int32_t
*read_length
,
5146 int (*callback
)(buf_t
, void *), void *callback_arg
, int flags
)
5148 upl_page_info_t
*pl
;
5149 upl_t upl
[MAX_VECTS
];
5150 vm_offset_t upl_offset
;
5151 addr64_t dst_paddr
= 0;
5152 user_addr_t iov_base
;
5154 upl_size_t upl_size
;
5155 vm_size_t upl_needed_size
;
5156 mach_msg_type_number_t pages_in_pl
;
5157 upl_control_flags_t upl_flags
;
5159 struct clios iostate
;
5166 u_int32_t devblocksize
;
5167 u_int32_t mem_alignment_mask
;
5168 u_int32_t tail_size
= 0;
5171 if (flags
& IO_PASSIVE
) {
5177 if (flags
& IO_NOCACHE
) {
5178 bflag
|= CL_NOCACHE
;
5182 * When we enter this routine, we know
5183 * -- the read_length will not exceed the current iov_len
5184 * -- the target address is physically contiguous for read_length
5186 cluster_syncup(vp
, filesize
, callback
, callback_arg
, PUSH_SYNC
);
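/*
 * note: the cluster_syncup() call above synchronously pushes any pending
 * write-behind data for this vnode, so the device-level read that follows
 * observes the current contents of the file
 */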
5188 devblocksize
= (u_int32_t
)vp
->v_mount
->mnt_devblocksize
;
5189 mem_alignment_mask
= (u_int32_t
)vp
->v_mount
->mnt_alignmentmask
;
5191 iostate
.io_completed
= 0;
5192 iostate
.io_issued
= 0;
5193 iostate
.io_error
= 0;
5194 iostate
.io_wanted
= 0;
5196 lck_mtx_init(&iostate
.io_mtxp
, &cl_mtx_grp
, LCK_ATTR_NULL
);
5199 io_size
= *read_length
;
5201 max_size
= filesize
- uio
->uio_offset
;
5203 if (io_size
> max_size
) {
5204 io_size
= (u_int32_t
)max_size
;
5207 iov_base
= uio_curriovbase(uio
);
5209 upl_offset
= (vm_offset_t
)((u_int32_t
)iov_base
& PAGE_MASK
);
5210 upl_needed_size
= upl_offset
+ io_size
;
5213 upl_size
= (upl_size_t
)upl_needed_size
;
5214 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
5217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_START
,
5218 (int)upl_offset
, (int)upl_size
, (int)iov_base
, io_size
, 0);
5220 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
5221 kret
= vm_map_get_upl(map
,
5222 vm_map_trunc_page(iov_base
, vm_map_page_mask(map
)),
5223 &upl_size
, &upl
[cur_upl
], NULL
, &pages_in_pl
, &upl_flags
, VM_KERN_MEMORY_FILE
, 0);
5225 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_END
,
5226 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
5228 if (kret
!= KERN_SUCCESS
) {
5230 * failed to get pagelist
5233 goto wait_for_creads
;
5237 if (upl_size
< upl_needed_size
) {
5239 * The upl_size wasn't satisfied.
5242 goto wait_for_creads
;
5244 pl
= ubc_upl_pageinfo(upl
[cur_upl
]);
5246 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << PAGE_SHIFT
) + (addr64_t
)upl_offset
;
5248 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
5249 u_int32_t head_size
;
5251 head_size
= devblocksize
- (u_int32_t
)(uio
->uio_offset
& (devblocksize
- 1));
5253 if (head_size
> io_size
) {
5254 head_size
= io_size
;
5257 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, CL_READ
, callback
, callback_arg
);
5260 goto wait_for_creads
;
5263 upl_offset
+= head_size
;
5264 dst_paddr
+= head_size
;
5265 io_size
-= head_size
;
5267 iov_base
+= head_size
;
5269 if ((u_int32_t
)iov_base
& mem_alignment_mask
) {
5271 * request isn't set up on a memory boundary
5272 * that the underlying DMA engine can handle...
5273 * return an error instead of going through
5274 * the slow copy path since the intent of this
5275 * path is direct I/O to device memory
5278 goto wait_for_creads;
5281 tail_size
= io_size
& (devblocksize
- 1);
5283 io_size
-= tail_size
;
5285 while (io_size
&& error
== 0) {
5286 if (io_size
> MAX_IO_CONTIG_SIZE
) {
5287 xsize
= MAX_IO_CONTIG_SIZE
;
5292 * request asynchronously so that we can overlap
5293 * the preparation of the next I/O... we'll do
5294 * the commit after all the I/O has completed
5295 * since it's all issued against the same UPL
5296 * if there are already too many outstanding reads
5297 * wait until some have completed before issuing the next
5299 cluster_iostate_wait(&iostate
, MAX_IO_CONTIG_SIZE
* IO_SCALE(vp
, 2), "cluster_read_contig");
5301 if (iostate
.io_error
) {
5303 * one of the earlier reads we issued ran into a hard error
5304 * don't issue any more reads...
5305 * go wait for any other reads to complete before
5306 * returning the error to the caller
5308 goto wait_for_creads
;
5310 error
= cluster_io(vp
, upl
[cur_upl
], upl_offset
, uio
->uio_offset
, xsize
,
5311 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
| bflag
,
5312 (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5314 * The cluster_io read was issued successfully,
5315 * update the uio structure
5318 uio_update(uio
, (user_size_t
)xsize
);
5321 upl_offset
+= xsize
;
5325 if (error
== 0 && iostate
.io_error
== 0 && tail_size
== 0 && num_upl
< MAX_VECTS
&& uio
->uio_offset
< filesize
) {
5326 error
= cluster_io_type(uio
, read_type
, read_length
, 0);
5328 if (error
== 0 && *read_type
== IO_CONTIG
) {
5333 *read_type
= IO_UNKNOWN
;
5338 * make sure all async reads that are part of this stream
5339 * have completed before we proceed
5341 cluster_iostate_wait(&iostate
, 0, "cluster_read_contig");
5343 if (iostate
.io_error
) {
5344 error
= iostate
.io_error
;
5347 lck_mtx_destroy(&iostate
.io_mtxp
, &cl_mtx_grp
);
5349 if (error
== 0 && tail_size
) {
5350 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, CL_READ
, callback
, callback_arg
);
5353 for (n
= 0; n
< num_upl
; n
++) {
5355 * just release our hold on each physically contiguous
5356 * region without changing any state
5358 ubc_upl_abort(upl
[n
], 0);
5366 cluster_io_type(struct uio
*uio
, int *io_type
, u_int32_t
*io_length
, u_int32_t min_length
)
5368 user_size_t iov_len
;
5369 user_addr_t iov_base
= 0;
5371 upl_size_t upl_size
;
5372 upl_control_flags_t upl_flags
;
5376 * skip over any empty vectors
5378 uio_update(uio, (user_size_t)0);
5380 iov_len
= uio_curriovlen(uio
);
5382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 94)) | DBG_FUNC_START
, uio
, (int)iov_len
, 0, 0, 0);
5385 iov_base = uio_curriovbase(uio);
5387 * make sure the size of the vector isn't too big...
5388 * internally, we want to handle all of the I/O in
5389 * chunk sizes that fit in a 32 bit int
5391 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5392 upl_size = MAX_IO_REQUEST_SIZE;
5394 upl_size = (u_int32_t)iov_len;
5397 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
5399 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
5400 if ((vm_map_get_upl(map
,
5401 vm_map_trunc_page(iov_base
, vm_map_page_mask(map
)),
5402 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, VM_KERN_MEMORY_FILE
, 0)) != KERN_SUCCESS
) {
5404 * the user app must have passed in an invalid address
5408 if (upl_size == 0) {
5412 *io_length = upl_size;
5414 if (upl_flags & UPL_PHYS_CONTIG) {
5415 *io_type = IO_CONTIG;
5416 } else if (iov_len >= min_length) {
5417 *io_type = IO_DIRECT;
5423 * nothing left to do for this uio
5426 *io_type = IO_UNKNOWN;
5428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5430 if (*io_type == IO_DIRECT &&
5431 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5432 /* no direct I/O for sub-page-size address spaces */
5433 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
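/*
 * illustrative sketch of how the result is consumed (mirroring the dispatch
 * pattern used by cluster_read_ext earlier in this file):
 *
 *	retval = cluster_io_type(uio, &read_type, &read_length, 0);
 *	switch (read_type) {
 *	case IO_DIRECT:
 *		retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
 *		break;
 *	case IO_CONTIG:
 *		retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
 *		break;
 *	default:
 *		retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
 *		break;
 *	}
 */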
5442 * generate advisory I/O's in the largest chunks possible
5443 * the completed pages will be released into the VM cache
5446 advisory_read(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
)
5448 return advisory_read_ext(vp
, filesize
, f_offset
, resid
, NULL
, NULL
, CL_PASSIVE
);
5452 advisory_read_ext(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
, int (*callback
)(buf_t
, void *), void *callback_arg
, int bflag
)
5454 upl_page_info_t
*pl
;
5456 vm_offset_t upl_offset
;
5469 uint32_t max_io_size
;
5472 if (!UBCINFOEXISTS(vp
)) {
5476 if (f_offset
< 0 || resid
< 0) {
5480 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
5482 if (disk_conditioner_mount_is_ssd(vp
->v_mount
)) {
5483 if (max_io_size
> speculative_prefetch_max_iosize
) {
5484 max_io_size
= speculative_prefetch_max_iosize
;
5488 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
5489 (int)f_offset
, resid
, (int)filesize
, 0, 0);
5491 while (resid
&& f_offset
< filesize
&& retval
== 0) {
5493 * compute the size of the upl needed to encompass
5494 * the requested read... limit each call to cluster_io
5495 * to the maximum UPL size... cluster_io will clip if
5496 * this exceeds the maximum io_size for the device,
5497 * make sure to account for
5498 * a starting offset that's not page aligned
5500 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
5501 upl_f_offset
= f_offset
- (off_t
)start_offset
;
5502 max_size
= filesize
- f_offset
;
5504 if (resid
< max_size
) {
5507 io_size
= (int)max_size
;
5510 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
5511 if ((uint32_t)upl_size
> max_io_size
) {
5512 upl_size
= max_io_size
;
5517 * return the number of contiguously present pages in the cache
5518 * starting at upl_f_offset within the file
5520 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
5524 * skip over pages already present in the cache
5526 io_size
= skip_range
- start_offset
;
5528 f_offset
+= io_size
;
5531 if (skip_range
== upl_size
) {
5535 * have to issue some real I/O
5536 * at this point, we know it's starting on a page boundary
5537 * because we've skipped over at least the first page in the request
5540 upl_f_offset
+= skip_range
;
5541 upl_size
-= skip_range
;
5543 pages_in_upl
= upl_size
/ PAGE_SIZE
;
5545 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
5546 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
5548 kret
= ubc_create_upl_kernel(vp
,
5553 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
,
5554 VM_KERN_MEMORY_FILE
);
5555 if (kret
!= KERN_SUCCESS
) {
5561 * before we start marching forward, we must make sure we end on
5562 * a present page, otherwise we will be working with a freed
5565 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
5566 if (upl_page_present(pl
, last_pg
)) {
5570 pages_in_upl
= last_pg
+ 1;
5573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
5574 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
5577 for (last_pg
= 0; last_pg
< pages_in_upl
;) {
5579 * scan from the beginning of the upl looking for the first
5580 * page that is present.... this will become the first page in
5581 * the request we're going to make to 'cluster_io'... if all
5582 * of the pages are absent, we won't call through to 'cluster_io'
5584 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
5585 if (upl_page_present(pl
, start_pg
)) {
5591 * scan from the starting present page looking for an absent
5592 * page before the end of the upl is reached, if we
5593 * find one, then it will terminate the range of pages being
5594 * presented to 'cluster_io'
5596 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
5597 if (!upl_page_present(pl
, last_pg
)) {
5602 if (last_pg
> start_pg
) {
5604 * we found a range of pages that must be filled
5605 * if the last page in this range is the last page of the file
5606 * we may have to clip the size of it to keep from reading past
5607 * the end of the last physical block associated with the file
5609 upl_offset
= start_pg
* PAGE_SIZE
;
5610 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
5612 if ((off_t
)(upl_f_offset
+ upl_offset
+ io_size
) > filesize
) {
5613 io_size
= (int)(filesize
- (upl_f_offset
+ upl_offset
));
5617 * issue an asynchronous read to cluster_io
5619 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
5620 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
| bflag
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
5625 if (issued_io
== 0) {
5626 ubc_upl_abort(upl
, 0);
5629 io_size
= upl_size
- start_offset
;
5631 if (io_size
> resid
) {
5634 f_offset
+= io_size
;
5638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
5639 (int)f_offset
, resid
, retval
, 0, 0);
5646 cluster_push(vnode_t vp
, int flags
)
5648 return cluster_push_ext(vp
, flags
, NULL
, NULL
);
5653 cluster_push_ext(vnode_t vp
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5655 return cluster_push_err(vp
, flags
, callback
, callback_arg
, NULL
);
5658 /* write errors via err, but return the number of clusters written */
5660 cluster_push_err(vnode_t vp
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
, int *err
)
5663 int my_sparse_wait
= 0;
5664 struct cl_writebehind
*wbp
;
5671 if (!UBCINFOEXISTS(vp
)) {
5672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -1, 0);
5675 /* return if deferred write is set */
5676 if (((unsigned int)vfs_flags(vp
->v_mount
) & MNT_DEFWRITE
) && (flags
& IO_DEFWRITE
)) {
5679 if ((wbp
= cluster_get_wbp(vp
, CLW_RETURNLOCKED
)) == NULL
) {
5680 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -2, 0);
5683 if (!ISSET(flags
, IO_SYNC
) && wbp
->cl_number
== 0 && wbp
->cl_scmap
== NULL
) {
5684 lck_mtx_unlock(&wbp
->cl_lockw
);
5686 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -3, 0);
5689 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_START
,
5690 wbp
->cl_scmap
, wbp
->cl_number
, flags
, 0, 0);
5693 * if we have an fsync in progress, we don't want to allow any additional
5694 * sync/fsync/close(s) to occur until it finishes.
5695 * note that it's possible for writes to continue to occur to this file
5696 * while we're waiting and also once the fsync starts to clean if we're
5697 * in the sparse map case
5699 while (wbp
->cl_sparse_wait
) {
5700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_START
, kdebug_vnode(vp
), 0, 0, 0, 0);
5702 msleep((caddr_t
)&wbp
->cl_sparse_wait
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_END
, kdebug_vnode(vp
), 0, 0, 0, 0);
5706 if (flags
& IO_SYNC
) {
5708 wbp
->cl_sparse_wait
= 1;
5711 * this is an fsync (or equivalent)... we must wait for any existing async
5712 * cleaning operations to complete before we evaluate the current state
5713 * and finish cleaning... this ensures that all writes issued before this
5714 * fsync actually get cleaned to the disk before this fsync returns
5716 while (wbp
->cl_sparse_pushes
) {
5717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_START
, kdebug_vnode(vp
), 0, 0, 0, 0);
5719 msleep((caddr_t
)&wbp
->cl_sparse_pushes
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5721 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_END
, kdebug_vnode(vp
), 0, 0, 0, 0);
5724 if (wbp
->cl_scmap
) {
5727 if (wbp
->cl_sparse_pushes
< SPARSE_PUSH_LIMIT
) {
5728 scmap
= wbp
->cl_scmap
;
5729 wbp
->cl_scmap
= NULL
;
5731 wbp
->cl_sparse_pushes
++;
5733 lck_mtx_unlock(&wbp
->cl_lockw
);
5735 retval
= sparse_cluster_push(wbp
, &scmap
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, FALSE
);
5737 lck_mtx_lock(&wbp
->cl_lockw
);
5739 wbp
->cl_sparse_pushes
--;
5742 if (wbp
->cl_scmap
!= NULL
) {
5743 panic("cluster_push_err: Expected NULL cl_scmap\n");
5746 wbp
->cl_scmap
= scmap
;
5749 if (wbp
->cl_sparse_wait
&& wbp
->cl_sparse_pushes
== 0) {
5750 wakeup((caddr_t
)&wbp
->cl_sparse_pushes
);
5753 retval
= sparse_cluster_push(wbp
, &(wbp
->cl_scmap
), vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, FALSE
);
5763 retval
= cluster_try_push(wbp
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, &local_err
, FALSE
);
5768 lck_mtx_unlock(&wbp
->cl_lockw
);
5770 if (flags
& IO_SYNC
) {
5771 (void)vnode_waitforwrites(vp
, 0, 0, 0, "cluster_push");
5774 if (my_sparse_wait
) {
5776 * I'm the owner of the serialization token
5777 * clear it and wakeup anyone that is waiting
5780 lck_mtx_lock(&wbp
->cl_lockw
);
5782 wbp
->cl_sparse_wait
= 0;
5783 wakeup((caddr_t
)&wbp
->cl_sparse_wait
);
5785 lck_mtx_unlock(&wbp
->cl_lockw
);
5787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_END
,
5788 wbp
->cl_scmap
, wbp
->cl_number
, retval
, local_err
, 0);
5794 __private_extern__ void
5795 cluster_release(struct ubc_info *ubc)
5797 struct cl_writebehind *wbp;
5798 struct cl_readahead *rap;
5800 if ((wbp = ubc->cl_wbehind)) {
5801 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5803 if (wbp->cl_scmap) {
5804 vfs_drt_control(&(wbp->cl_scmap), 0);
5806 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
5807 zfree(cl_wr_zone, wbp);
5808 ubc->cl_wbehind = NULL;
5810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5813 if ((rap = ubc->cl_rahead)) {
5814 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
5815 zfree(cl_rd_zone, rap);
5816 ubc->cl_rahead = NULL;
5819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5824 cluster_try_push(struct cl_writebehind
*wbp
, vnode_t vp
, off_t EOF
, int push_flag
, int io_flags
, int (*callback
)(buf_t
, void *), void *callback_arg
, int *err
, boolean_t vm_initiated
)
5831 struct cl_wextent l_clusters
[MAX_CLUSTERS
];
5832 u_int max_cluster_pgcount
;
5835 max_cluster_pgcount
= MAX_CLUSTER_SIZE(vp
) / PAGE_SIZE
;
5837 * the write behind context exists and has
5838 * already been locked...
5840 if (wbp
->cl_number
== 0) {
5842 * no clusters to push
5843 * return number of empty slots
5845 return MAX_CLUSTERS
;
5849 * make a local 'sorted' copy of the clusters
5850 * and clear wbp->cl_number so that new clusters can
5853 for (cl_index
= 0; cl_index
< wbp
->cl_number
; cl_index
++) {
5854 for (min_index
= -1, cl_index1
= 0; cl_index1
< wbp
->cl_number
; cl_index1
++) {
5855 if (wbp
->cl_clusters
[cl_index1
].b_addr
== wbp
->cl_clusters
[cl_index1
].e_addr
) {
5858 if (min_index
== -1) {
5859 min_index
= cl_index1
;
5860 } else if (wbp
->cl_clusters
[cl_index1
].b_addr
< wbp
->cl_clusters
[min_index
].b_addr
) {
5861 min_index
= cl_index1
;
5864 if (min_index
== -1) {
5868 l_clusters
[cl_index
].b_addr
= wbp
->cl_clusters
[min_index
].b_addr
;
5869 l_clusters
[cl_index
].e_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5870 l_clusters
[cl_index
].io_flags
= wbp
->cl_clusters
[min_index
].io_flags
;
5872 wbp
->cl_clusters
[min_index
].b_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5878 /* skip switching to the sparse cluster mechanism if on diskimage */
5879 if (((push_flag
& PUSH_DELAY
) && cl_len
== MAX_CLUSTERS
) &&
5880 !(vp
->v_mount
->mnt_kern_flag
& MNTK_VIRTUALDEV
)) {
5884 * determine if we appear to be writing the file sequentially
5885 * if not, by returning without having pushed any clusters
5886 * we will cause this vnode to be pushed into the sparse cluster mechanism
5887 * used for managing more random I/O patterns
5889 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5890 * that's why we're in try_push with PUSH_DELAY...
5892 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5893 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5894 * so we can just make a simple pass through, up to, but not including the last one...
5895 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5898 * we let the last one be partial as long as it was adjacent to the previous one...
5899 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5900 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5902 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
5903 if ((l_clusters
[i
].e_addr
- l_clusters
[i
].b_addr
) != max_cluster_pgcount
) {
5906 if (l_clusters
[i
].e_addr
!= l_clusters
[i
+ 1].b_addr
) {
5911 if (vm_initiated
== TRUE
) {
5912 lck_mtx_unlock(&wbp
->cl_lockw
);
5915 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
5917 struct cl_extent cl
;
5920 flags
= io_flags
& (IO_PASSIVE
| IO_CLOSE
);
5923 * try to push each cluster in turn...
5925 if (l_clusters
[cl_index
].io_flags
& CLW_IONOCACHE
) {
5926 flags
|= IO_NOCACHE
;
5929 if (l_clusters
[cl_index
].io_flags
& CLW_IOPASSIVE
) {
5930 flags
|= IO_PASSIVE
;
5933 if (push_flag
& PUSH_SYNC
) {
5937 cl
.b_addr
= l_clusters
[cl_index
].b_addr
;
5938 cl
.e_addr
= l_clusters
[cl_index
].e_addr
;
5940 retval
= cluster_push_now(vp
, &cl
, EOF
, flags
, callback
, callback_arg
, vm_initiated
);
5945 l_clusters
[cl_index
].b_addr
= 0;
5946 l_clusters
[cl_index
].e_addr
= 0;
5947 } else if (error
== 0) {
5951 if (!(push_flag
& PUSH_ALL
)) {
5955 if (vm_initiated
== TRUE
) {
5956 lck_mtx_lock(&wbp
->cl_lockw
);
5964 if (cl_len
> cl_pushed
) {
5966 * we didn't push all of the clusters, so
5967 * let's try to merge them back into the vnode
5969 if ((MAX_CLUSTERS
- wbp
->cl_number
) < (cl_len
- cl_pushed
)) {
5971 * we picked up some new clusters while we were trying to
5972 * push the old ones... this can happen because I've dropped
5973 * the vnode lock... the sum of the
5974 * leftovers plus the new cluster count exceeds our ability
5975 * to represent them, so switch to the sparse cluster mechanism
5977 * collect the active public clusters...
5979 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
, vm_initiated
);
5981 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
5982 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
) {
5985 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
5986 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
5987 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
5992 * update the cluster count
5994 wbp
->cl_number
= cl_index1
;
5997 * and collect the original clusters that were moved into the
5998 * local storage for sorting purposes
6000 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
, vm_initiated
);
6003 * we've got room to merge the leftovers back in
6004 * just append them starting at the next 'hole'
6005 * represented by wbp->cl_number
6007 for (cl_index
= 0, cl_index1
= wbp
->cl_number
; cl_index
< cl_len
; cl_index
++) {
6008 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
) {
6012 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
6013 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
6014 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
6019 * update the cluster count
6021 wbp
->cl_number
= cl_index1
;
6024 return MAX_CLUSTERS
- wbp
->cl_number
;
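/*
 * the value returned above is the number of cluster slots now free in the
 * write-behind context (MAX_CLUSTERS minus the clusters still outstanding),
 * matching the early return of MAX_CLUSTERS when there was nothing to push
 */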
6030 cluster_push_now(vnode_t vp
, struct cl_extent
*cl
, off_t EOF
, int flags
,
6031 int (*callback
)(buf_t
, void *), void *callback_arg
, boolean_t vm_initiated
)
6033 upl_page_info_t
*pl
;
6035 vm_offset_t upl_offset
;
6050 if (flags
& IO_PASSIVE
) {
6056 if (flags
& IO_SKIP_ENCRYPTION
) {
6057 bflag
|= CL_ENCRYPTED
;
6060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
6061 (int)cl
->b_addr
, (int)cl
->e_addr
, (int)EOF
, flags
, 0);
6063 if ((pages_in_upl
= (int)(cl
->e_addr
- cl
->b_addr
)) == 0) {
6064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
6068 upl_size
= pages_in_upl
* PAGE_SIZE
;
6069 upl_f_offset
= (off_t
)(cl
->b_addr
* PAGE_SIZE_64
);
6071 if (upl_f_offset
+ upl_size
>= EOF
) {
6072 if (upl_f_offset
>= EOF
) {
6074 * must have truncated the file and missed
6075 * clearing a dangling cluster (i.e. it's completely
6076 * beyond the new EOF
6078 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
6082 size
= (int)(EOF
- upl_f_offset
);
6084 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
6085 pages_in_upl
= upl_size
/ PAGE_SIZE
;
6092 vnode_pageout(vp
, NULL
, (upl_offset_t
)0, upl_f_offset
, (upl_size_t
)upl_size
,
6093 UPL_MSYNC
| UPL_VNODE_PAGER
| UPL_KEEPCACHED
, &error
);
6097 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
6100 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6102 * - only pages that are currently dirty are returned... these are the ones we need to clean
6103 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6104 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6105 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6106 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6108 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6111 if ((vp
->v_flag
& VNOCACHE_DATA
) || (flags
& IO_NOCACHE
)) {
6112 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
6114 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
6117 kret
= ubc_create_upl_kernel(vp
,
6123 VM_KERN_MEMORY_FILE
);
6124 if (kret
!= KERN_SUCCESS
) {
6125 panic("cluster_push: failed to get pagelist");
6128 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, upl
, upl_f_offset
, 0, 0, 0);
6131 * since we only asked for the dirty pages back
6132 * it's possible that we may only get a few or even none, so...
6133 * before we start marching forward, we must make sure we know
6134 * where the last present page is in the UPL, otherwise we could
6135 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6136 * employed by commit_range and abort_range.
6138 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
6139 if (upl_page_present(pl
, last_pg
)) {
6143 pages_in_upl
= last_pg
+ 1;
6145 if (pages_in_upl
== 0) {
6146 ubc_upl_abort(upl
, 0);
6148 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
6152 for (last_pg
= 0; last_pg
< pages_in_upl
;) {
6154 * find the next dirty page in the UPL
6155 * this will become the first page in the
6156 * next I/O to generate
6158 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
6159 if (upl_dirty_page(pl
, start_pg
)) {
6162 if (upl_page_present(pl
, start_pg
)) {
6164 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6165 * just release these unchanged since we're not going
6166 * to steal them or change their state
6168 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
6171 if (start_pg
>= pages_in_upl
) {
6173 * done... no more dirty pages to push
6177 if (start_pg
> last_pg
) {
6179 * skipped over some non-dirty pages
6181 size
-= ((start_pg
- last_pg
) * PAGE_SIZE
);
6185 * find a range of dirty pages to write
6187 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
6188 if (!upl_dirty_page(pl
, last_pg
)) {
6192 upl_offset
= start_pg
* PAGE_SIZE
;
6194 io_size
= min(size
, (last_pg
- start_pg
) * PAGE_SIZE
);
6196 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
| bflag
;
6198 if (!(flags
& IO_SYNC
)) {
6199 io_flags
|= CL_ASYNC
;
6202 if (flags
& IO_CLOSE
) {
6203 io_flags
|= CL_CLOSE
;
6206 if (flags
& IO_NOCACHE
) {
6207 io_flags
|= CL_NOCACHE
;
6210 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
6211 io_flags
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
6213 if (error
== 0 && retval
) {
6219 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 3, error
, 0, 0);
6226 * sparse_cluster_switch is called with the write behind lock held
6229 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6236 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6238 struct cl_extent cl;
6240 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6241 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6242 if (flags & UPL_POP_DIRTY) {
6243 cl.e_addr = cl.b_addr + 1;
6245 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6254 wbp->cl_number -= cl_index;
6256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
/*
 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
 * still associated with the write-behind context... however, if the scmap has been disassociated
 * from the write-behind context (the cluster_push case), the wb lock is not held
 */
static int
sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_extent cl;
	off_t   offset;
	u_int   length;
	void    *l_scmap;
	int     error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL) {
		vfs_drt_control(scmap, 1);
	}

	l_scmap = *scmap;

	for (;;) {
		int retval;

		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
			break;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_unlock(&wbp->cl_lockw);
		}

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
		if (error == 0 && retval) {
			error = retval;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_lock(&wbp->cl_lockw);

			if (*scmap != l_scmap) {
				break;
			}
		}

		if (error) {
			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
				panic("Failed to restore dirty state on failure\n");
			}
			break;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static int
sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	u_int   new_dirty;
	u_int   length;
	off_t   offset;
	int     error = 0;
	int     push_flag = 0; /* Is this a valid value? */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */

		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
			push_flag = 0;
		}

		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);

		if (error) {
			break;
		}

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t         ubc_paddr;
	kern_return_t    kret;
	int              error = 0;
	int              did_read = 0;
	int              abort_flags;
	int              upl_flags;
	int              bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	upl_flags = UPL_SET_LITE;

	if (!(flags & CL_READ)) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	kret = ubc_create_upl_kernel(vp,
	    uio->uio_offset & ~PAGE_MASK_64,
	    PAGE_SIZE,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS) {
		return EINVAL;
	}

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return error;
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 *	way to do so without exporting them to kexts as well.
	 */
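	/*
	 * Editor's note (derived from the commented-out calls kept below): the
	 * literal flag values used here correspond to cppvPsnk == 1,
	 * cppvPsrc == 2, cppvFsnk == 4 and cppvFsrc == 8.
	 */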
	if (flags & CL_READ) {
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);         /* Copy physical to physical and flush the destination */
	} else {
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);         /* Copy physical to physical and flush the source */
	}

	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0) {
		uio_update(uio, (user_size_t)xsize);
	}

	if (did_read) {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	} else {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
	}

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return error;
}
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
{
	int     pg_offset;
	int     pg_index;
	int     csize;
	int     segflg;
	int     retval = 0;
	int     xsize;
	int     dirty_count = 0;
	upl_page_info_t *pl;

	xsize = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, upl_offset, xsize, 0, 0);

	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
			dirty_count++;
		}

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	*io_resid = xsize;

	uio->uio_segflg = segflg;

	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, xsize, retval, segflg, 0);

	return retval;
}
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
}


static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
	int     segflg;
	int     io_size;
	int     xsize;
	int     start_offset;
	int     retval = 0;
	memory_object_control_t control;

	io_size = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);

	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_size, retval, 3, 0);

		return 0;
	}
	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ((io_size = *io_resid)) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = (int)uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
		    start_offset, io_size, mark_dirty, take_reference);
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

	return retval;
}
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t   f_offset;
	int     flags;
	int     total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty) {
		return EINVAL;
	}

	return 0;
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
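/*
 * Illustrative note (not part of the original source): with a 4 KiB page size
 * DRT_BITVECTOR_PAGES is (1024 * 256) / 4096 = 64, and with 16 KiB pages it is
 * 16, so in either case a hashtable entry covers a 256 KiB aligned window of
 * the file.  DRT_ALIGN_ADDRESS() rounds a byte offset down to the start of its
 * window, e.g. DRT_ALIGN_ADDRESS(0x48000) == 0x40000.
 */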
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space and improve the efficiency of the hash table.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                   \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)                                                                          \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = 0;                                              \
	} while (0)
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
	} while (0)
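/*
 * Illustrative note (not part of the original source): dhe_control packs both
 * fields into a single u_int64_t.  Entry addresses are aligned to
 * DRT_BITVECTOR_PAGES * PAGE_SIZE (256 KiB), so the low-order bits of the
 * address are always zero; the low 9 bits (DRT_HASH_COUNT_MASK == 0x1ff) are
 * therefore free to hold the page count 0..DRT_BITVECTOR_PAGES, with the
 * all-ones value 0x1ff reserved as the "vacant bucket" sentinel used by
 * DRT_HASH_VACATE()/DRT_HASH_VACANT().
 */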
#if !defined(XNU_TARGET_OS_OSX)
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 *
 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
 */

#define DRT_HASH_SMALL_MODULUS  251
#define DRT_HASH_LARGE_MODULUS  2039
#define DRT_HASH_XLARGE_MODULUS 8179

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL)  /* 8GiB */

#define DRT_SMALL_ALLOCATION    4096    /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    32768   /* 144 bytes spare */
#define DRT_XLARGE_ALLOCATION   131072  /* 208 bytes spare */

#else /* XNU_TARGET_OS_OSX */
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 *
 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
 */

#define DRT_HASH_SMALL_MODULUS  1019
#define DRT_HASH_LARGE_MODULUS  8179
#define DRT_HASH_XLARGE_MODULUS 32749

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */

#define DRT_SMALL_ALLOCATION    16384   /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    131072  /* 208 bytes spare */
#define DRT_XLARGE_ALLOCATION   524288  /* 304 bytes spare */

#endif /* ! XNU_TARGET_OS_OSX */
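/*
 * Illustrative check (not part of the original source): each hashtable entry
 * is 16 bytes (an 8-byte dhe_control plus an 8-byte bitvector), so the moduli
 * above are the largest primes whose table still fits in the allocation with
 * room left for the struct vfs_drt_clustermap header.  For the macOS values:
 *
 *	 1019 * 16 =  16304,  16384 -  16304 =  80 bytes spare
 *	 8179 * 16 = 130864, 131072 - 130864 = 208 bytes spare
 *	32749 * 16 = 523984, 524288 - 523984 = 304 bytes spare
 */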
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable entry.
 */
struct vfs_drt_hashentry {
	u_int64_t       dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
 * Since PAGE_SIZE is only known at boot time,
 *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
 *	-declare dhe_bitvector array for largest possible length
 */
#define MAX_DRT_BITVECTOR_PAGES (1024 * 256) / (4 * 1024)
	u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)                           \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)                          \
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)                             \
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
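/*
 * Illustrative note (not part of the original source): page 'bit' of an entry
 * lives in 32-bit word (bit / 32) at bit position (bit % 32); for example,
 * page 37 is bit 5 of dhe_bitvector[1].
 */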
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t               scm_magic;      /* sanity/detection */
#define DRT_SCM_MAGIC           0x12020003
	u_int32_t               scm_modulus;    /* current ring size */
	u_int32_t               scm_buckets;    /* number of occupied buckets */
	u_int32_t               scm_lastclean;  /* last entry we cleaned */
	u_int32_t               scm_iskips;     /* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
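/*
 * Illustrative note (not part of the original source): lookups hash the
 * aligned byte offset directly, i.e. index = DRT_HASH(scm, DRT_ALIGN_ADDRESS(addr)),
 * and resolve collisions by stepping around the ring with DRT_HASH_NEXT()
 * until a matching or vacant bucket is found (see vfs_drt_search_index()).
 */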
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
                                                            * 0, setcount */
                                                           /* 1 (clean, no map) */
                                                           /* 2 (map alloc fail) */
                                                           /* 3, resid (partial) */
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
                                                            * lastclean, iskips */


static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
    u_int64_t offset, int *indexp);
static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
    u_int64_t offset, int *indexp, int recursed);
static kern_return_t vfs_drt_do_mark_pages(
	void            **cmapp,
	u_int64_t       offset,
	u_int           length,
	u_int           *setcountp,
	int             dirty);
static void vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
	kern_return_t   kret = KERN_SUCCESS;
	u_int64_t       offset = 0;
	u_int32_t       i = 0;
	int             modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

	if (cmapp != NULL) {
		ocmap = *cmapp;
	}

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		modulus_size = DRT_HASH_SMALL_MODULUS;
		map_size = DRT_SMALL_ALLOCATION;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
				active_buckets++;
			}
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_LARGE_MODULUS;
				map_size = DRT_LARGE_ALLOCATION;
			} else {
				modulus_size = DRT_HASH_SMALL_MODULUS;
				map_size = DRT_SMALL_ALLOCATION;
			}
		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_XLARGE_MODULUS;
				map_size = DRT_XLARGE_ALLOCATION;
			} else {
				/*
				 * If the ring is completely full and we can't
				 * expand, there's nothing useful for us to do.
				 * Behave as though we had compacted into the new
				 * array and return.
				 */
				return KERN_SUCCESS;
			}
		} else {
			/* already using the xlarge modulus */
			modulus_size = DRT_HASH_XLARGE_MODULUS;
			map_size = DRT_XLARGE_ALLOCATION;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
				return KERN_SUCCESS;
			}
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		return kret;
	}
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = modulus_size;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
				continue;
			}
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;

	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_modulus,
		    ocmap->scm_buckets,
		    ocmap->scm_lastclean,
		    ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return KERN_SUCCESS;
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	vm_size_t map_size = 0;

	if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
		map_size = DRT_SMALL_ALLOCATION;
	} else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
		map_size = DRT_LARGE_ALLOCATION;
	} else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
		map_size = DRT_XLARGE_ALLOCATION;
	} else {
		panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
	}

	kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
	return KERN_SUCCESS;
}


/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int             index;
	u_int32_t       i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index)) {
			break;
		}

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return KERN_SUCCESS;
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return KERN_FAILURE;
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t   kret;
	int             index;
	u_int32_t       i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS) {
		return kret;
	}

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean) {
				cmap->scm_lastclean = index;
			}
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return KERN_SUCCESS;
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed) {
		return KERN_FAILURE;
	}
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return kret;
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void            **private,
	u_int64_t       offset,
	u_int           length,
	u_int           *setcountp,
	int             dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t   kret;
	int             i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL) {
		*setcountp = 0;
	}

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return KERN_SUCCESS;
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return kret;
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;  /* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL) {
				*setcountp = setcount;
			}
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return kret;
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount >= DRT_BITVECTOR_PAGES) {
						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount <= 0) {
						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL) {
		*setcountp = setcount;
	}

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return KERN_SUCCESS;
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
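/*
 * Usage sketch (illustrative, not part of the original source): callers such
 * as sparse_cluster_add() hand in page-aligned byte ranges and let the map be
 * allocated and grown on demand, e.g.
 *
 *	void  *scmap = NULL;
 *	u_int  new_dirty;
 *
 *	if (vfs_drt_mark_pages(&scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
 *		// map is full: push some clusters out, then retry the remainder
 *	}
 */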
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
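/*
 * Usage sketch (illustrative, not part of the original source): callers such
 * as sparse_cluster_push() drain the map by looping until this routine fails,
 * at which point the map has been freed and the caller's pointer cleared, e.g.
 *
 *	off_t  offset;
 *	u_int  length;
 *
 *	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
 *		// write out [offset, offset + length) via cluster_push_now()
 *	}
 */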
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t       offset;
	u_int           length;
	u_int32_t       j;
	int             index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
			continue;
		}

		/* scan the bitfield for a string of bits */
		fs = 0;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (i == DRT_BITVECTOR_PAGES) {
			/*  didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
				break;
			}
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return KERN_SUCCESS;
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return KERN_FAILURE;
}
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		    cmap->scm_modulus,
		    cmap->scm_buckets,
		    cmap->scm_lastclean,
		    cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return KERN_SUCCESS;
}


/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index)) {
			continue;
		}

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				bits_on++;
			}
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
			panic("bits_on = %d,  index = %d\n", bits_on, index);
		}
	}
}


/*
 * Internal interface only.
 */
static kern_return_t
vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
		/*
		 * If we have a full xlarge sparse cluster,
		 * we push it out all at once so the cluster
		 * map can be available to absorb more I/Os.
		 * This is done on large memory configs so
		 * the small I/Os don't interfere with the
		 * pro workloads.
		 */
		*push_flag = PUSH_ALL;
	}
	return KERN_SUCCESS;
}