1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <sys/malloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93
94 #include <sys/kdebug.h>
95 #include <libkern/OSAtomic.h>
96
97 #include <sys/sdt.h>
98
99 #include <stdbool.h>
100
101 #include <vfs/vfs_disk_conditioner.h>
102
103 #if 0
104 #undef KERNEL_DEBUG
105 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
106 #endif
107
108
109 #define CL_READ 0x01
110 #define CL_WRITE 0x02
111 #define CL_ASYNC 0x04
112 #define CL_COMMIT 0x08
113 #define CL_PAGEOUT 0x10
114 #define CL_AGE 0x20
115 #define CL_NOZERO 0x40
116 #define CL_PAGEIN 0x80
117 #define CL_DEV_MEMORY 0x100
118 #define CL_PRESERVE 0x200
119 #define CL_THROTTLE 0x400
120 #define CL_KEEPCACHED 0x800
121 #define CL_DIRECT_IO 0x1000
122 #define CL_PASSIVE 0x2000
123 #define CL_IOSTREAMING 0x4000
124 #define CL_CLOSE 0x8000
125 #define CL_ENCRYPTED 0x10000
126 #define CL_RAW_ENCRYPTED 0x20000
127 #define CL_NOCACHE 0x40000
128
129 #define MAX_VECTOR_UPL_ELEMENTS 8
130 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132 #define CLUSTER_IO_WAITING ((buf_t)1)
133
134 extern upl_t vector_upl_create(vm_offset_t);
135 extern boolean_t vector_upl_is_valid(upl_t);
136 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
137 extern void vector_upl_set_pagelist(upl_t);
138 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
139
140 struct clios {
141 lck_mtx_t io_mtxp;
142 u_int io_completed; /* amount of io that has currently completed */
143 u_int io_issued; /* amount of io that was successfully issued */
144 int io_error; /* error code of first error encountered */
145 int io_wanted; /* someone is sleeping waiting for a change in state */
146 };
147
148 struct cl_direct_read_lock {
149 LIST_ENTRY(cl_direct_read_lock) chain;
150 int32_t ref_count;
151 vnode_t vp;
152 lck_rw_t rw_lock;
153 };
154
155 #define CL_DIRECT_READ_LOCK_BUCKETS 61
156
157 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
158 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
159
160 static lck_spin_t cl_direct_read_spin_lock;
161
162 static lck_grp_t *cl_mtx_grp;
163 static lck_attr_t *cl_mtx_attr;
164 static lck_grp_attr_t *cl_mtx_grp_attr;
165 static lck_mtx_t *cl_transaction_mtxp;
166
167 #define IO_UNKNOWN 0
168 #define IO_DIRECT 1
169 #define IO_CONTIG 2
170 #define IO_COPY 3
171
172 #define PUSH_DELAY 0x01
173 #define PUSH_ALL 0x02
174 #define PUSH_SYNC 0x04
175
176
177 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
178 static void cluster_wait_IO(buf_t cbp_head, int async);
179 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
180
181 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
182
183 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
184 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
185 static int cluster_iodone(buf_t bp, void *callback_arg);
186 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
187 static int cluster_is_throttled(vnode_t vp);
188
189 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
190
191 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
192
193 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
194 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
195
196 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
197 int (*)(buf_t, void *), void *callback_arg);
198 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
199 int flags, int (*)(buf_t, void *), void *callback_arg);
200 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
201 int (*)(buf_t, void *), void *callback_arg, int flags);
202
203 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
204 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
205 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
206 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
207 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
208 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
209
210 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
211 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
212
213 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
214
215 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
216 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
217 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
218
219 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
220
221 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
222 void *callback_arg, int *err, boolean_t vm_initiated);
223
224 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
225 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
226 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
227 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
228 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229
230 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
231 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
232 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
233 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
234
235
236 /*
237 * For throttled IO to check whether
238 * a block is cached by the boot cache
239 * and thus it can avoid delaying the IO.
240 *
241 * bootcache_contains_block is initially
242 * NULL. The BootCache will set it while
243 * the cache is active and clear it when
244 * the cache is jettisoned.
245 *
246 * Returns 0 if the block is not
247 * contained in the cache, 1 if it is
248 * contained.
249 *
250 * The function pointer remains valid
251 * after the cache has been evicted even
252 * if bootcache_contains_block has been
253 * cleared.
254 *
255 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
256 */
257 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
258
259
260 /*
261 * limit the internal I/O size so that we
262 * can represent it in a 32 bit int
263 */
264 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
265 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
266 #define MAX_VECTS 16
267 /*
268 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
269 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
270 * we have not historically allowed the write to bypass the UBC.
271 */
272 #define MIN_DIRECT_WRITE_SIZE (16384)
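
/*
 * Illustrative sketch only (not part of the driver); the helper name is
 * hypothetical.  It shows the kind of cutoff the comment above describes:
 * an uncached write smaller than MIN_DIRECT_WRITE_SIZE is still routed
 * through the buffer cache rather than the direct path.
 */
#if 0
static int
example_choose_write_path(u_int32_t io_req_size, int ioflag)
{
	if ((ioflag & IO_NOCACHE) && io_req_size >= MIN_DIRECT_WRITE_SIZE) {
		return IO_DIRECT;	/* large enough to bypass the UBC */
	}
	return IO_COPY;			/* small writes stay in the cache */
}
#endif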
273
274 #define WRITE_THROTTLE 6
275 #define WRITE_THROTTLE_SSD 2
276 #define WRITE_BEHIND 1
277 #define WRITE_BEHIND_SSD 1
278
279 #if CONFIG_EMBEDDED
280 #define PREFETCH 1
281 #define PREFETCH_SSD 1
282 uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a speculative read-ahead */
283 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead */
284 #else
285 #define PREFETCH 3
286 #define PREFETCH_SSD 2
287 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a speculative read-ahead */
288 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead on SSDs */
289 #endif
290
291
292 #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
293 #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
294 #define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
295
296 int speculative_reads_disabled = 0;
297
298 /*
299 * throttle the number of async writes that
300 * can be outstanding on a single vnode
301 * before we issue a synchronous write
302 */
303 #define THROTTLE_MAXCNT 0
304
305 uint32_t throttle_max_iosize = (128 * 1024);
306
307 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
308
309 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
310
311
312 void
313 cluster_init(void)
314 {
315 /*
316 * allocate lock group attribute and group
317 */
318 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
319 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
320
321 /*
322 * allocate the lock attribute
323 */
324 cl_mtx_attr = lck_attr_alloc_init();
325
326 cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
327
328 if (cl_transaction_mtxp == NULL) {
329 panic("cluster_init: failed to allocate cl_transaction_mtxp");
330 }
331
332 lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
333
334 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
335 LIST_INIT(&cl_direct_read_locks[i]);
336 }
337 }
338
339
340 uint32_t
341 cluster_max_io_size(mount_t mp, int type)
342 {
343 uint32_t max_io_size;
344 uint32_t segcnt;
345 uint32_t maxcnt;
346
347 switch (type) {
348 case CL_READ:
349 segcnt = mp->mnt_segreadcnt;
350 maxcnt = mp->mnt_maxreadcnt;
351 break;
352 case CL_WRITE:
353 segcnt = mp->mnt_segwritecnt;
354 maxcnt = mp->mnt_maxwritecnt;
355 break;
356 default:
357 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
358 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
359 break;
360 }
361 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
362 /*
363 * don't allow a size beyond the max UPL size we can create
364 */
365 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
366 }
367 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
368
369 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
370 /*
371 * don't allow a size smaller than the old fixed limit
372 */
373 max_io_size = MAX_UPL_TRANSFER_BYTES;
374 } else {
375 /*
376 * make sure the size specified is a multiple of PAGE_SIZE
377 */
378 max_io_size &= ~PAGE_MASK;
379 }
380 return max_io_size;
381 }
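
/*
 * A minimal usage sketch (hypothetical caller, not part of this file).
 * The invariants asserted below follow directly from the clamping done in
 * cluster_max_io_size(): the result is never smaller than the old fixed
 * limit and is always a multiple of PAGE_SIZE.
 */
#if 0
static void
example_query_io_limits(mount_t mp)
{
	uint32_t max_rd = cluster_max_io_size(mp, CL_READ);
	uint32_t max_wr = cluster_max_io_size(mp, CL_WRITE);

	assert(max_rd >= MAX_UPL_TRANSFER_BYTES && (max_rd & PAGE_MASK) == 0);
	assert(max_wr >= MAX_UPL_TRANSFER_BYTES && (max_wr & PAGE_MASK) == 0);
}
#endif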
382
383
384
385
386 #define CLW_ALLOCATE 0x01
387 #define CLW_RETURNLOCKED 0x02
388 #define CLW_IONOCACHE 0x04
389 #define CLW_IOPASSIVE 0x08
390
391 /*
392 * if the read ahead context doesn't yet exist,
393 * allocate and initialize it...
394 * the vnode lock serializes multiple callers
395 * during the actual assignment... first one
396 * to grab the lock wins... the other callers
397 * will release the now unnecessary storage
398 *
399 * once the context is present, try to grab (but don't block on)
400 * the lock associated with it... if someone
401 * else currently owns it, then the read
402 * will run without read-ahead. this allows
403 * multiple readers to run in parallel and
404 * since there's only 1 read ahead context,
405 * there's no real loss in only allowing 1
406 * reader to have read-ahead enabled.
407 */
408 static struct cl_readahead *
409 cluster_get_rap(vnode_t vp)
410 {
411 struct ubc_info *ubc;
412 struct cl_readahead *rap;
413
414 ubc = vp->v_ubcinfo;
415
416 if ((rap = ubc->cl_rahead) == NULL) {
417 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
418
419 bzero(rap, sizeof *rap);
420 rap->cl_lastr = -1;
421 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
422
423 vnode_lock(vp);
424
425 if (ubc->cl_rahead == NULL) {
426 ubc->cl_rahead = rap;
427 } else {
428 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
429 FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
430 rap = ubc->cl_rahead;
431 }
432 vnode_unlock(vp);
433 }
434 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
435 return rap;
436 }
437
438 return (struct cl_readahead *)NULL;
439 }
440
441
442 /*
443 * if the write behind context doesn't yet exist,
444 * and CLW_ALLOCATE is specified, allocate and initialize it...
445 * the vnode lock serializes multiple callers
446 * during the actual assignment... first one
447 * to grab the lock wins... the other callers
448 * will release the now unnecessary storage
449 *
450 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
451 * the lock associated with the write behind context before
452 * returning
453 */
454
455 static struct cl_writebehind *
456 cluster_get_wbp(vnode_t vp, int flags)
457 {
458 struct ubc_info *ubc;
459 struct cl_writebehind *wbp;
460
461 ubc = vp->v_ubcinfo;
462
463 if ((wbp = ubc->cl_wbehind) == NULL) {
464 if (!(flags & CLW_ALLOCATE)) {
465 return (struct cl_writebehind *)NULL;
466 }
467
468 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
469
470 bzero(wbp, sizeof *wbp);
471 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
472
473 vnode_lock(vp);
474
475 if (ubc->cl_wbehind == NULL) {
476 ubc->cl_wbehind = wbp;
477 } else {
478 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
479 FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
480 wbp = ubc->cl_wbehind;
481 }
482 vnode_unlock(vp);
483 }
484 if (flags & CLW_RETURNLOCKED) {
485 lck_mtx_lock(&wbp->cl_lockw);
486 }
487
488 return wbp;
489 }
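
/*
 * A usage sketch (illustrative only): callers that intend to update the
 * write-behind state pass CLW_ALLOCATE | CLW_RETURNLOCKED so the context
 * is created on demand and returned with cl_lockw held; callers that only
 * want to peek (e.g. cluster_syncup below) pass 0 and may get back NULL
 * if no context has been allocated yet.
 */
#if 0
static void
example_with_wbp(vnode_t vp)
{
	struct cl_writebehind *wbp;

	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	/* ... examine or update the write-behind state (e.g. wbp->cl_number) ... */

	lck_mtx_unlock(&wbp->cl_lockw);
}
#endif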
490
491
492 static void
493 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
494 {
495 struct cl_writebehind *wbp;
496
497 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
498 if (wbp->cl_number) {
499 lck_mtx_lock(&wbp->cl_lockw);
500
501 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
502
503 lck_mtx_unlock(&wbp->cl_lockw);
504 }
505 }
506 }
507
508
509 static int
510 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
511 {
512 daddr64_t blkno;
513 size_t io_size;
514 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
515
516 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
517 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
518 return 0;
519 }
520
521 if (io_size == 0) {
522 return 0;
523 }
524
525 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
526 return 1;
527 }
528 }
529 return 0;
530 }
531
532
533 static int
534 cluster_is_throttled(vnode_t vp)
535 {
536 return throttle_io_will_be_throttled(-1, vp->v_mount);
537 }
538
539
540 static void
541 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
542 {
543 lck_mtx_lock(&iostate->io_mtxp);
544
545 while ((iostate->io_issued - iostate->io_completed) > target) {
546 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
547 iostate->io_issued, iostate->io_completed, target, 0, 0);
548
549 iostate->io_wanted = 1;
550 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
551
552 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
553 iostate->io_issued, iostate->io_completed, target, 0, 0);
554 }
555 lck_mtx_unlock(&iostate->io_mtxp);
556 }
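
/*
 * A sketch of the streaming pattern the direct-I/O paths build on (the
 * function below is illustrative, not a real caller): cluster_io() bumps
 * io_issued for each buffer it fires off, cluster_iodone() adds to
 * io_completed and wakes any waiter, and the issuer drains the stream by
 * waiting for the difference to drop to the target before consulting
 * io_error.
 */
#if 0
static int
example_issue_and_drain(vnode_t vp, upl_t upl, off_t f_offset, int io_size)
{
	struct clios iostate;
	int error;

	bzero(&iostate, sizeof(iostate));
	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	error = cluster_io(vp, upl, 0, f_offset, io_size,
	    CL_READ | CL_ASYNC | CL_COMMIT, NULL, &iostate, NULL, NULL);

	/* wait for everything issued against this stream to complete */
	cluster_iostate_wait(&iostate, 0, "example_issue_and_drain");

	if (error == 0) {
		error = iostate.io_error;
	}
	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	return error;
}
#endif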
557
558 static void
559 cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
560 upl_offset_t upl_offset, upl_size_t size)
561 {
562 if (!size) {
563 return;
564 }
565
566 upl_t associated_upl = upl_associated_upl(upl);
567
568 if (!associated_upl) {
569 return;
570 }
571
572 #if 0
573 printf("1: %d %d\n", upl_offset, upl_offset + size);
574 #endif
575
576 /*
577 * The associated UPL is page aligned to file offsets whereas the
578 * UPL it's attached to has different alignment requirements. The
579 * upl_offset that we have refers to @upl. The code that follows
580 * has to deal with the first and last pages in this transaction
581 * which might straddle pages in the associated UPL. To keep
582 * track of these pages, we use the mark bits: if the mark bit is
583 * set, we know another transaction has completed its part of that
584 * page and so we can unlock that page here.
585 *
586 * The following illustrates what we have to deal with:
587 *
588 * MEM u <------------ 1 PAGE ------------> e
589 * +-------------+----------------------+-----------------
590 * | |######################|#################
591 * +-------------+----------------------+-----------------
592 * FILE | <--- a ---> o <------------ 1 PAGE ------------>
593 *
594 * So here we show a write to offset @o. The data that is to be
595 * written is in a buffer that is not page aligned; it has offset
596 * @a in the page. The upl that carries the data starts in memory
597 * at @u. The associated upl starts in the file at offset @o. A
598 * transaction will always end on a page boundary (like @e above)
599 * except for the very last transaction in the group. We cannot
600 * unlock the page at @o in the associated upl until both the
601 * transaction ending at @e and the following transaction (that
602 * starts at @e) has completed.
603 */
604
605 /*
606 * We record whether or not the two UPLs are aligned as the mark
607 * bit in the first page of @upl.
608 */
609 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
610 bool is_unaligned = upl_page_get_mark(pl, 0);
611
612 if (is_unaligned) {
613 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
614
615 upl_offset_t upl_end = upl_offset + size;
616 assert(upl_end >= PAGE_SIZE);
617
618 upl_size_t assoc_upl_size = upl_get_size(associated_upl);
619
620 /*
621 * In the very first transaction in the group, upl_offset will
622 * not be page aligned, but after that it will be and in that
623 * case we want the preceding page in the associated UPL hence
624 * the minus one.
625 */
626 assert(upl_offset);
627 if (upl_offset) {
628 upl_offset = trunc_page_32(upl_offset - 1);
629 }
630
631 lck_mtx_lock_spin(&iostate->io_mtxp);
632
633 // Look at the first page...
634 if (upl_offset
635 && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
636 /*
637 * The first page isn't marked so let another transaction
638 * completion handle it.
639 */
640 upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
641 upl_offset += PAGE_SIZE;
642 }
643
644 // And now the last page...
645
646 /*
647 * This needs to be > rather than >= because if it's equal, it
648 * means there's another transaction that is sharing the last
649 * page.
650 */
651 if (upl_end > assoc_upl_size) {
652 upl_end = assoc_upl_size;
653 } else {
654 upl_end = trunc_page_32(upl_end);
655 const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
656
657 if (!upl_page_get_mark(assoc_pl, last_pg)) {
658 /*
659 * The last page isn't marked so mark the page and let another
660 * transaction completion handle it.
661 */
662 upl_page_set_mark(assoc_pl, last_pg, true);
663 upl_end -= PAGE_SIZE;
664 }
665 }
666
667 lck_mtx_unlock(&iostate->io_mtxp);
668
669 #if 0
670 printf("2: %d %d\n", upl_offset, upl_end);
671 #endif
672
673 if (upl_end <= upl_offset) {
674 return;
675 }
676
677 size = upl_end - upl_offset;
678 } else {
679 assert(!(upl_offset & PAGE_MASK));
680 assert(!(size & PAGE_MASK));
681 }
682
683 boolean_t empty;
684
685 /*
686 * We can unlock these pages now and as this is for a
687 * direct/uncached write, we want to dump the pages too.
688 */
689 kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
690 UPL_ABORT_DUMP_PAGES, &empty);
691
692 assert(!kr);
693
694 if (!kr && empty) {
695 upl_set_associated_upl(upl, NULL);
696 upl_deallocate(associated_upl);
697 }
698 }
699
700 static int
701 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
702 {
703 int upl_abort_code = 0;
704 int page_in = 0;
705 int page_out = 0;
706
707 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
708 /*
709 * direct write of any flavor, or a direct read that wasn't aligned
710 */
711 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
712 } else {
713 if (io_flags & B_PAGEIO) {
714 if (io_flags & B_READ) {
715 page_in = 1;
716 } else {
717 page_out = 1;
718 }
719 }
720 if (io_flags & B_CACHE) {
721 /*
722 * leave pages in the cache unchanged on error
723 */
724 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
725 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
726 /*
727 * transient error on pageout/write path... leave pages unchanged
728 */
729 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
730 } else if (page_in) {
731 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
732 } else {
733 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
734 }
735
736 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
737 }
738 return upl_abort_code;
739 }
740
741
742 static int
743 cluster_iodone(buf_t bp, void *callback_arg)
744 {
745 int b_flags;
746 int error;
747 int total_size;
748 int total_resid;
749 int upl_offset;
750 int zero_offset;
751 int pg_offset = 0;
752 int commit_size = 0;
753 int upl_flags = 0;
754 int transaction_size = 0;
755 upl_t upl;
756 buf_t cbp;
757 buf_t cbp_head;
758 buf_t cbp_next;
759 buf_t real_bp;
760 vnode_t vp;
761 struct clios *iostate;
762 boolean_t transaction_complete = FALSE;
763
764 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
765
766 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
767 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
768
769 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
770 lck_mtx_lock_spin(cl_transaction_mtxp);
771
772 bp->b_flags |= B_TDONE;
773
774 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
775 /*
776 * all I/O requests that are part of this transaction
777 * have to complete before we can process it
778 */
779 if (!(cbp->b_flags & B_TDONE)) {
780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
781 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
782
783 lck_mtx_unlock(cl_transaction_mtxp);
784
785 return 0;
786 }
787
788 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
790 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
791
792 lck_mtx_unlock(cl_transaction_mtxp);
793 wakeup(cbp);
794
795 return 0;
796 }
797
798 if (cbp->b_flags & B_EOT) {
799 transaction_complete = TRUE;
800 }
801 }
802 lck_mtx_unlock(cl_transaction_mtxp);
803
804 if (transaction_complete == FALSE) {
805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
806 cbp_head, 0, 0, 0, 0);
807 return 0;
808 }
809 }
810 error = 0;
811 total_size = 0;
812 total_resid = 0;
813
814 cbp = cbp_head;
815 vp = cbp->b_vp;
816 upl_offset = cbp->b_uploffset;
817 upl = cbp->b_upl;
818 b_flags = cbp->b_flags;
819 real_bp = cbp->b_real_bp;
820 zero_offset = cbp->b_validend;
821 iostate = (struct clios *)cbp->b_iostate;
822
823 if (real_bp) {
824 real_bp->b_dev = cbp->b_dev;
825 }
826
827 while (cbp) {
828 if ((cbp->b_flags & B_ERROR) && error == 0) {
829 error = cbp->b_error;
830 }
831
832 total_resid += cbp->b_resid;
833 total_size += cbp->b_bcount;
834
835 cbp_next = cbp->b_trans_next;
836
837 if (cbp_next == NULL) {
838 /*
839 * compute the overall size of the transaction
840 * in case we created one that has 'holes' in it
841 * 'total_size' represents the amount of I/O we
842 * did, not the span of the transaction w/r to the UPL
843 */
844 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
845 }
846
847 if (cbp != cbp_head) {
848 free_io_buf(cbp);
849 }
850
851 cbp = cbp_next;
852 }
853
854 if (ISSET(b_flags, B_COMMIT_UPL)) {
855 cluster_handle_associated_upl(iostate,
856 cbp_head->b_upl,
857 upl_offset,
858 transaction_size);
859 }
860
861 if (error == 0 && total_resid) {
862 error = EIO;
863 }
864
865 if (error == 0) {
866 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
867
868 if (cliodone_func != NULL) {
869 cbp_head->b_bcount = transaction_size;
870
871 error = (*cliodone_func)(cbp_head, callback_arg);
872 }
873 }
874 if (zero_offset) {
875 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
876 }
877
878 free_io_buf(cbp_head);
879
880 if (iostate) {
881 int need_wakeup = 0;
882
883 /*
884 * someone has issued multiple I/Os asynchronously
885 * and is waiting for them to complete (streaming)
886 */
887 lck_mtx_lock_spin(&iostate->io_mtxp);
888
889 if (error && iostate->io_error == 0) {
890 iostate->io_error = error;
891 }
892
893 iostate->io_completed += total_size;
894
895 if (iostate->io_wanted) {
896 /*
897 * someone is waiting for the state of
898 * this io stream to change
899 */
900 iostate->io_wanted = 0;
901 need_wakeup = 1;
902 }
903 lck_mtx_unlock(&iostate->io_mtxp);
904
905 if (need_wakeup) {
906 wakeup((caddr_t)&iostate->io_wanted);
907 }
908 }
909
910 if (b_flags & B_COMMIT_UPL) {
911 pg_offset = upl_offset & PAGE_MASK;
912 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
913
914 if (error) {
915 upl_set_iodone_error(upl, error);
916
917 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
918 } else {
919 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
920
921 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
922 upl_flags |= UPL_COMMIT_SET_DIRTY;
923 }
924
925 if (b_flags & B_AGE) {
926 upl_flags |= UPL_COMMIT_INACTIVATE;
927 }
928
929 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
930 }
931 }
932 if (real_bp) {
933 if (error) {
934 real_bp->b_flags |= B_ERROR;
935 real_bp->b_error = error;
936 }
937 real_bp->b_resid = total_resid;
938
939 buf_biodone(real_bp);
940 }
941 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
942 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
943
944 return error;
945 }
946
947
948 uint32_t
949 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
950 {
951 if (cluster_is_throttled(vp)) {
952 *limit = THROTTLE_MAX_IOSIZE;
953 return 1;
954 }
955 return 0;
956 }
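
/*
 * Illustrative sketch (hypothetical caller): a filesystem can consult
 * cluster_throttle_io_limit() when sizing an I/O so that a throttled
 * vnode never issues more than THROTTLE_MAX_IOSIZE at a time.
 */
#if 0
static uint32_t
example_cap_io_size(vnode_t vp, uint32_t desired_size)
{
	uint32_t limit;

	if (cluster_throttle_io_limit(vp, &limit) && desired_size > limit) {
		desired_size = limit;
	}
	return desired_size;
}
#endif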
957
958
959 void
960 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
961 {
962 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
963 upl_offset, size, bp, 0, 0);
964
965 if (bp == NULL || bp->b_datap == 0) {
966 upl_page_info_t *pl;
967 addr64_t zero_addr;
968
969 pl = ubc_upl_pageinfo(upl);
970
971 if (upl_device_page(pl) == TRUE) {
972 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
973
974 bzero_phys_nc(zero_addr, size);
975 } else {
976 while (size) {
977 int page_offset;
978 int page_index;
979 int zero_cnt;
980
981 page_index = upl_offset / PAGE_SIZE;
982 page_offset = upl_offset & PAGE_MASK;
983
984 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
985 zero_cnt = min(PAGE_SIZE - page_offset, size);
986
987 bzero_phys(zero_addr, zero_cnt);
988
989 size -= zero_cnt;
990 upl_offset += zero_cnt;
991 }
992 }
993 } else {
994 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
995 }
996
997 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
998 upl_offset, size, 0, 0, 0);
999 }
1000
1001
1002 static void
1003 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
1004 {
1005 cbp_head->b_validend = zero_offset;
1006 cbp_tail->b_flags |= B_EOT;
1007 }
1008
1009 static void
1010 cluster_wait_IO(buf_t cbp_head, int async)
1011 {
1012 buf_t cbp;
1013
1014 if (async) {
1015 /*
1016 * Async callback completion will not normally generate a
1017 * wakeup upon I/O completion. To get woken up, we set
1018 * b_trans_next (which is safe for us to modify) on the last
1019 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1020 * to wake us up when all the buffers that are part of this transaction
1021 * are completed. This is done under the umbrella of
1022 * cl_transaction_mtxp which is also taken in cluster_iodone.
1023 */
1024 bool done = true;
1025 buf_t last = NULL;
1026
1027 lck_mtx_lock_spin(cl_transaction_mtxp);
1028
1029 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1030 if (!ISSET(cbp->b_flags, B_TDONE)) {
1031 done = false;
1032 }
1033 }
1034
1035 if (!done) {
1036 last->b_trans_next = CLUSTER_IO_WAITING;
1037
1038 DTRACE_IO1(wait__start, buf_t, last);
1039 do {
1040 msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
1041
1042 /*
1043 * We should only have been woken up if all the
1044 * buffers are completed, but just in case...
1045 */
1046 done = true;
1047 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1048 if (!ISSET(cbp->b_flags, B_TDONE)) {
1049 done = false;
1050 break;
1051 }
1052 }
1053 } while (!done);
1054 DTRACE_IO1(wait__done, buf_t, last);
1055
1056 last->b_trans_next = NULL;
1057 }
1058
1059 lck_mtx_unlock(cl_transaction_mtxp);
1060 } else { // !async
1061 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1062 buf_biowait(cbp);
1063 }
1064 }
1065 }
1066
1067 static void
1068 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1069 {
1070 buf_t cbp;
1071 int error;
1072 boolean_t isswapout = FALSE;
1073
1074 /*
1075 * cluster_complete_transaction will
1076 * only be called if we've issued a complete chain in synchronous mode
1077 * or, we've already done a cluster_wait_IO on an incomplete chain
1078 */
1079 if (needwait) {
1080 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1081 buf_biowait(cbp);
1082 }
1083 }
1084 /*
1085 * we've already waited on all of the I/Os in this transaction,
1086 * so mark all of the buf_t's in this transaction as B_TDONE
1087 * so that cluster_iodone sees the transaction as completed
1088 */
1089 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1090 cbp->b_flags |= B_TDONE;
1091 }
1092 cbp = *cbp_head;
1093
1094 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1095 isswapout = TRUE;
1096 }
1097
1098 error = cluster_iodone(cbp, callback_arg);
1099
1100 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1101 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1102 *retval = error;
1103 } else if (isswapout == TRUE) {
1104 *retval = error;
1105 }
1106 }
1107 *cbp_head = (buf_t)NULL;
1108 }
1109
1110
1111 static int
1112 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1113 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1114 {
1115 buf_t cbp;
1116 u_int size;
1117 u_int io_size;
1118 int io_flags;
1119 int bmap_flags;
1120 int error = 0;
1121 int retval = 0;
1122 buf_t cbp_head = NULL;
1123 buf_t cbp_tail = NULL;
1124 int trans_count = 0;
1125 int max_trans_count;
1126 u_int pg_count;
1127 int pg_offset;
1128 u_int max_iosize;
1129 u_int max_vectors;
1130 int priv;
1131 int zero_offset = 0;
1132 int async_throttle = 0;
1133 mount_t mp;
1134 vm_offset_t upl_end_offset;
1135 boolean_t need_EOT = FALSE;
1136
1137 /*
1138 * we currently don't support buffers larger than a page
1139 */
1140 if (real_bp && non_rounded_size > PAGE_SIZE) {
1141 panic("%s(): Called with real buffer of size %d bytes which "
1142 "is greater than the maximum allowed size of "
1143 "%d bytes (the system PAGE_SIZE).\n",
1144 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1145 }
1146
1147 mp = vp->v_mount;
1148
1149 /*
1150 * we don't want to do any funny rounding of the size for IO requests
1151 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1152 * belong to us... we can't extend (nor do we need to) the I/O to fill
1153 * out a page
1154 */
1155 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1156 /*
1157 * round the requested size up so that this I/O ends on a
1158 * page boundary in case this is a 'write'... if the filesystem
1159 * has blocks allocated to back the page beyond the EOF, we want to
1160 * make sure to write out the zero's that are sitting beyond the EOF
1161 * so that in case the filesystem doesn't explicitly zero this area
1162 * if a hole is created via a lseek/write beyond the current EOF,
1163 * it will return zeros when it's read back from the disk. If the
1164 * physical allocation doesn't extend for the whole page, we'll
1165 * only write/read from the disk up to the end of this allocation
1166 * via the extent info returned from the VNOP_BLOCKMAP call.
1167 */
1168 pg_offset = upl_offset & PAGE_MASK;
1169
1170 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1171 } else {
1172 /*
1173 * anyone advertising a blocksize of 1 byte probably
1174 * can't deal with us rounding up the request size
1175 * AFP is one such filesystem/device
1176 */
1177 size = non_rounded_size;
1178 }
1179 upl_end_offset = upl_offset + size;
1180
1181 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1182
1183 /*
1184 * Set the maximum transaction size to the maximum desired number of
1185 * buffers.
1186 */
1187 max_trans_count = 8;
1188 if (flags & CL_DEV_MEMORY) {
1189 max_trans_count = 16;
1190 }
1191
1192 if (flags & CL_READ) {
1193 io_flags = B_READ;
1194 bmap_flags = VNODE_READ;
1195
1196 max_iosize = mp->mnt_maxreadcnt;
1197 max_vectors = mp->mnt_segreadcnt;
1198 } else {
1199 io_flags = B_WRITE;
1200 bmap_flags = VNODE_WRITE;
1201
1202 max_iosize = mp->mnt_maxwritecnt;
1203 max_vectors = mp->mnt_segwritecnt;
1204 }
1205 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1206
1207 /*
1208 * make sure the maximum iosize is a
1209 * multiple of the page size
1210 */
1211 max_iosize &= ~PAGE_MASK;
1212
1213 /*
1214 * Ensure the maximum iosize is sensible.
1215 */
1216 if (!max_iosize) {
1217 max_iosize = PAGE_SIZE;
1218 }
1219
1220 if (flags & CL_THROTTLE) {
1221 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1222 if (max_iosize > THROTTLE_MAX_IOSIZE) {
1223 max_iosize = THROTTLE_MAX_IOSIZE;
1224 }
1225 async_throttle = THROTTLE_MAXCNT;
1226 } else {
1227 if ((flags & CL_DEV_MEMORY)) {
1228 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1229 } else {
1230 u_int max_cluster;
1231 u_int max_cluster_size;
1232 u_int scale;
1233
1234 if (vp->v_mount->mnt_minsaturationbytecount) {
1235 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1236
1237 scale = 1;
1238 } else {
1239 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1240
1241 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1242 scale = WRITE_THROTTLE_SSD;
1243 } else {
1244 scale = WRITE_THROTTLE;
1245 }
1246 }
1247 if (max_iosize > max_cluster_size) {
1248 max_cluster = max_cluster_size;
1249 } else {
1250 max_cluster = max_iosize;
1251 }
1252
1253 if (size < max_cluster) {
1254 max_cluster = size;
1255 }
1256
1257 if (flags & CL_CLOSE) {
1258 scale += MAX_CLUSTERS;
1259 }
1260
1261 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1262 }
1263 }
1264 }
1265 if (flags & CL_AGE) {
1266 io_flags |= B_AGE;
1267 }
1268 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1269 io_flags |= B_PAGEIO;
1270 }
1271 if (flags & (CL_IOSTREAMING)) {
1272 io_flags |= B_IOSTREAMING;
1273 }
1274 if (flags & CL_COMMIT) {
1275 io_flags |= B_COMMIT_UPL;
1276 }
1277 if (flags & CL_DIRECT_IO) {
1278 io_flags |= B_PHYS;
1279 }
1280 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1281 io_flags |= B_CACHE;
1282 }
1283 if (flags & CL_PASSIVE) {
1284 io_flags |= B_PASSIVE;
1285 }
1286 if (flags & CL_ENCRYPTED) {
1287 io_flags |= B_ENCRYPTED_IO;
1288 }
1289
1290 if (vp->v_flag & VSYSTEM) {
1291 io_flags |= B_META;
1292 }
1293
1294 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1295 /*
1296 * then we are going to end up
1297 * with a page that we can't complete (the file size wasn't a multiple
1298 * of PAGE_SIZE and we're trying to read to the end of the file)
1299 * so we'll go ahead and zero out the portion of the page we can't
1300 * read in from the file
1301 */
1302 zero_offset = upl_offset + non_rounded_size;
1303 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1304 assert(ISSET(flags, CL_COMMIT));
1305
1306 // For a direct/uncached write, we need to lock pages...
1307
1308 upl_t cached_upl;
1309
1310 /*
1311 * Create a UPL to lock the pages in the cache whilst the
1312 * write is in progress.
1313 */
1314 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1315 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1316
1317 /*
1318 * Attach this UPL to the other UPL so that we can find it
1319 * later.
1320 */
1321 upl_set_associated_upl(upl, cached_upl);
1322
1323 if (upl_offset & PAGE_MASK) {
1324 /*
1325 * The two UPLs are not aligned, so mark the first page in
1326 * @upl so that cluster_handle_associated_upl can handle
1327 * it accordingly.
1328 */
1329 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1330 upl_page_set_mark(pl, 0, true);
1331 }
1332 }
1333
1334 while (size) {
1335 daddr64_t blkno;
1336 daddr64_t lblkno;
1337 u_int io_size_wanted;
1338 size_t io_size_tmp;
1339
1340 if (size > max_iosize) {
1341 io_size = max_iosize;
1342 } else {
1343 io_size = size;
1344 }
1345
1346 io_size_wanted = io_size;
1347 io_size_tmp = (size_t)io_size;
1348
1349 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1350 break;
1351 }
1352
1353 if (io_size_tmp > io_size_wanted) {
1354 io_size = io_size_wanted;
1355 } else {
1356 io_size = (u_int)io_size_tmp;
1357 }
1358
1359 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1360 real_bp->b_blkno = blkno;
1361 }
1362
1363 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1364 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1365
1366 if (io_size == 0) {
1367 /*
1368 * vnop_blockmap didn't return an error... however, it did
1369 * return an extent size of 0 which means we can't
1370 * make forward progress on this I/O... a hole in the
1371 * file would be returned as a blkno of -1 with a non-zero io_size
1372 * a real extent is returned with a blkno != -1 and a non-zero io_size
1373 */
1374 error = EINVAL;
1375 break;
1376 }
1377 if (!(flags & CL_READ) && blkno == -1) {
1378 off_t e_offset;
1379 int pageout_flags;
1380
1381 if (upl_get_internal_vectorupl(upl)) {
1382 panic("Vector UPLs should not take this code-path\n");
1383 }
1384 /*
1385 * we're writing into a 'hole'
1386 */
1387 if (flags & CL_PAGEOUT) {
1388 /*
1389 * if we got here via cluster_pageout
1390 * then just error the request and return
1391 * the 'hole' should already have been covered
1392 */
1393 error = EINVAL;
1394 break;
1395 }
1396 /*
1397 * we can get here if the cluster code happens to
1398 * pick up a page that was dirtied via mmap vs
1399 * a 'write' and the page targets a 'hole'...
1400 * i.e. the writes to the cluster were sparse
1401 * and the file was being written for the first time
1402 *
1403 * we can also get here if the filesystem supports
1404 * 'holes' that are less than PAGE_SIZE.... because
1405 * we can't know if the range in the page that covers
1406 * the 'hole' has been dirtied via an mmap or not,
1407 * we have to assume the worst and try to push the
1408 * entire page to storage.
1409 *
1410 * Try paging out the page individually before
1411 * giving up entirely and dumping it (the pageout
1412 * path will ensure that the zero extent accounting
1413 * has been taken care of before we get back into cluster_io)
1414 *
1415 * go direct to vnode_pageout so that we don't have to
1416 * unbusy the page from the UPL... we used to do this
1417 * so that we could call ubc_msync, but that results
1418 * in a potential deadlock if someone else races us to acquire
1419 * that page and wins and in addition needs one of the pages
1420 * we're continuing to hold in the UPL
1421 */
1422 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1423
1424 if (!(flags & CL_ASYNC)) {
1425 pageout_flags |= UPL_IOSYNC;
1426 }
1427 if (!(flags & CL_COMMIT)) {
1428 pageout_flags |= UPL_NOCOMMIT;
1429 }
1430
1431 if (cbp_head) {
1432 buf_t prev_cbp;
1433 int bytes_in_last_page;
1434
1435 /*
1436 * first we have to wait for the current outstanding I/Os
1437 * to complete... EOT hasn't been set yet on this transaction
1438 * so the pages won't be released
1439 */
1440 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1441
1442 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1443 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1444 bytes_in_last_page += cbp->b_bcount;
1445 }
1446 bytes_in_last_page &= PAGE_MASK;
1447
1448 while (bytes_in_last_page) {
1449 /*
1450 * we've got a transaction that
1451 * includes the page we're about to push out through vnode_pageout...
1452 * find the bp's in the list which intersect this page and either
1453 * remove them entirely from the transaction (there could be multiple bp's), or
1454 * round its iosize down to the page boundary (there can only be one)...
1455 *
1456 * find the last bp in the list and act on it
1457 */
1458 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1459 prev_cbp = cbp;
1460 }
1461
1462 if (bytes_in_last_page >= cbp->b_bcount) {
1463 /*
1464 * this buf no longer has any I/O associated with it
1465 */
1466 bytes_in_last_page -= cbp->b_bcount;
1467 cbp->b_bcount = 0;
1468
1469 free_io_buf(cbp);
1470
1471 if (cbp == cbp_head) {
1472 assert(bytes_in_last_page == 0);
1473 /*
1474 * the buf we just freed was the only buf in
1475 * this transaction... so there's no I/O to do
1476 */
1477 cbp_head = NULL;
1478 cbp_tail = NULL;
1479 } else {
1480 /*
1481 * remove the buf we just freed from
1482 * the transaction list
1483 */
1484 prev_cbp->b_trans_next = NULL;
1485 cbp_tail = prev_cbp;
1486 }
1487 } else {
1488 /*
1489 * this is the last bp that has I/O
1490 * intersecting the page of interest
1491 * only some of the I/O is in the intersection
1492 * so clip the size but keep it in the transaction list
1493 */
1494 cbp->b_bcount -= bytes_in_last_page;
1495 cbp_tail = cbp;
1496 bytes_in_last_page = 0;
1497 }
1498 }
1499 if (cbp_head) {
1500 /*
1501 * there was more to the current transaction
1502 * than just the page we are pushing out via vnode_pageout...
1503 * mark it as finished and complete it... we've already
1504 * waited for the I/Os to complete above in the call to cluster_wait_IO
1505 */
1506 cluster_EOT(cbp_head, cbp_tail, 0);
1507
1508 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1509
1510 trans_count = 0;
1511 }
1512 }
1513 if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1514 error = EINVAL;
1515 }
1516 e_offset = round_page_64(f_offset + 1);
1517 io_size = e_offset - f_offset;
1518
1519 f_offset += io_size;
1520 upl_offset += io_size;
1521
1522 if (size >= io_size) {
1523 size -= io_size;
1524 } else {
1525 size = 0;
1526 }
1527 /*
1528 * keep track of how much of the original request
1529 * that we've actually completed... non_rounded_size
1530 * may go negative due to us rounding the request
1531 * to a page size multiple (i.e. size > non_rounded_size)
1532 */
1533 non_rounded_size -= io_size;
1534
1535 if (non_rounded_size <= 0) {
1536 /*
1537 * we've transferred all of the data in the original
1538 * request, but we were unable to complete the tail
1539 * of the last page because the file didn't have
1540 * an allocation to back that portion... this is ok.
1541 */
1542 size = 0;
1543 }
1544 if (error) {
1545 if (size == 0) {
1546 flags &= ~CL_COMMIT;
1547 }
1548 break;
1549 }
1550 continue;
1551 }
1552 lblkno = (daddr64_t)(f_offset / 0x1000);
1553 /*
1554 * we have now figured out how much I/O we can do - this is in 'io_size'
1555 * pg_offset is the starting point in the first page for the I/O
1556 * pg_count is the number of full and partial pages that 'io_size' encompasses
1557 */
1558 pg_offset = upl_offset & PAGE_MASK;
1559
1560 if (flags & CL_DEV_MEMORY) {
1561 /*
1562 * treat physical requests as one 'giant' page
1563 */
1564 pg_count = 1;
1565 } else {
1566 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1567 }
1568
1569 if ((flags & CL_READ) && blkno == -1) {
1570 vm_offset_t commit_offset;
1571 int bytes_to_zero;
1572 int complete_transaction_now = 0;
1573
1574 /*
1575 * if we're reading and blkno == -1, then we've got a
1576 * 'hole' in the file that we need to deal with by zeroing
1577 * out the affected area in the upl
1578 */
1579 if (io_size >= (u_int)non_rounded_size) {
1580 /*
1581 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1582 * then 'zero_offset' will be non-zero
1583 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1584 * (indicated by the io_size finishing off the I/O request for this UPL)
1585 * then we're not going to issue an I/O for the
1586 * last page in this upl... we need to zero both the hole and the tail
1587 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1588 */
1589 bytes_to_zero = non_rounded_size;
1590 if (!(flags & CL_NOZERO)) {
1591 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1592 }
1593
1594 zero_offset = 0;
1595 } else {
1596 bytes_to_zero = io_size;
1597 }
1598
1599 pg_count = 0;
1600
1601 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1602
1603 if (cbp_head) {
1604 int pg_resid;
1605
1606 /*
1607 * if there is a current I/O chain pending
1608 * then the first page of the group we just zero'd
1609 * will be handled by the I/O completion if the zero
1610 * fill started in the middle of the page
1611 */
1612 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1613
1614 pg_resid = commit_offset - upl_offset;
1615
1616 if (bytes_to_zero >= pg_resid) {
1617 /*
1618 * the last page of the current I/O
1619 * has been completed...
1620 * compute the number of fully zero'd
1621 * pages that are beyond it
1622 * plus the last page if it's partial
1623 * and we have no more I/O to issue...
1624 * otherwise a partial page is left
1625 * to begin the next I/O
1626 */
1627 if ((int)io_size >= non_rounded_size) {
1628 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1629 } else {
1630 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1631 }
1632
1633 complete_transaction_now = 1;
1634 }
1635 } else {
1636 /*
1637 * no pending I/O to deal with
1638 * so, commit all of the fully zero'd pages
1639 * plus the last page if it's partial
1640 * and we have no more I/O to issue...
1641 * otherwise a partial page is left
1642 * to begin the next I/O
1643 */
1644 if ((int)io_size >= non_rounded_size) {
1645 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1646 } else {
1647 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1648 }
1649
1650 commit_offset = upl_offset & ~PAGE_MASK;
1651 }
1652
1653 // Associated UPL is currently only used in the direct write path
1654 assert(!upl_associated_upl(upl));
1655
1656 if ((flags & CL_COMMIT) && pg_count) {
1657 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1658 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1659 }
1660 upl_offset += io_size;
1661 f_offset += io_size;
1662 size -= io_size;
1663
1664 /*
1665 * keep track of how much of the original request
1666 * that we've actually completed... non_rounded_size
1667 * may go negative due to us rounding the request
1668 * to a page size multiple (i.e. size > non_rounded_size)
1669 */
1670 non_rounded_size -= io_size;
1671
1672 if (non_rounded_size <= 0) {
1673 /*
1674 * we've transferred all of the data in the original
1675 * request, but we were unable to complete the tail
1676 * of the last page because the file didn't have
1677 * an allocation to back that portion... this is ok.
1678 */
1679 size = 0;
1680 }
1681 if (cbp_head && (complete_transaction_now || size == 0)) {
1682 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1683
1684 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1685
1686 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1687
1688 trans_count = 0;
1689 }
1690 continue;
1691 }
1692 if (pg_count > max_vectors) {
1693 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1694 io_size = PAGE_SIZE - pg_offset;
1695 pg_count = 1;
1696 } else {
1697 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1698 pg_count = max_vectors;
1699 }
1700 }
1701 /*
1702 * If the transaction is going to reach the maximum number of
1703 * desired elements, truncate the i/o to the nearest page so
1704 * that the actual i/o is initiated after this buffer is
1705 * created and added to the i/o chain.
1706 *
1707 * I/O directed to physically contiguous memory
1708 * doesn't have a requirement to make sure we 'fill' a page
1709 */
1710 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1711 ((upl_offset + io_size) & PAGE_MASK)) {
1712 vm_offset_t aligned_ofs;
1713
1714 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1715 /*
1716 * If the io_size does not actually finish off even a
1717 * single page we have to keep adding buffers to the
1718 * transaction despite having reached the desired limit.
1719 *
1720 * Eventually we get here with the page being finished
1721 * off (and exceeded) and then we truncate the size of
1722 * this i/o request so that it is page aligned so that
1723 * we can finally issue the i/o on the transaction.
1724 */
1725 if (aligned_ofs > upl_offset) {
1726 io_size = aligned_ofs - upl_offset;
1727 pg_count--;
1728 }
1729 }
1730
1731 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1732 /*
1733 * if we're not targeting a virtual device i.e. a disk image
1734 * it's safe to dip into the reserve pool since real devices
1735 * can complete this I/O request without requiring additional
1736 * bufs from the alloc_io_buf pool
1737 */
1738 priv = 1;
1739 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) {
1740 /*
1741 * Throttle the speculative IO
1742 */
1743 priv = 0;
1744 } else {
1745 priv = 1;
1746 }
1747
1748 cbp = alloc_io_buf(vp, priv);
1749
1750 if (flags & CL_PAGEOUT) {
1751 u_int i;
1752
1753 /*
1754 * since blocks are in offsets of 0x1000, scale
1755 * iteration to (PAGE_SIZE * pg_count) of blks.
1756 */
1757 for (i = 0; i < (PAGE_SIZE * pg_count) / 0x1000; i++) {
1758 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1759 panic("BUSY bp found in cluster_io");
1760 }
1761 }
1762 }
1763 if (flags & CL_ASYNC) {
1764 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1765 panic("buf_setcallback failed\n");
1766 }
1767 }
1768 cbp->b_cliodone = (void *)callback;
1769 cbp->b_flags |= io_flags;
1770 if (flags & CL_NOCACHE) {
1771 cbp->b_attr.ba_flags |= BA_NOCACHE;
1772 }
1773
1774 cbp->b_lblkno = lblkno;
1775 cbp->b_blkno = blkno;
1776 cbp->b_bcount = io_size;
1777
1778 if (buf_setupl(cbp, upl, upl_offset)) {
1779 panic("buf_setupl failed\n");
1780 }
1781 #if CONFIG_IOSCHED
1782 upl_set_blkno(upl, upl_offset, io_size, blkno);
1783 #endif
1784 cbp->b_trans_next = (buf_t)NULL;
1785
1786 if ((cbp->b_iostate = (void *)iostate)) {
1787 /*
1788 * caller wants to track the state of this
1789 * io... bump the amount issued against this stream
1790 */
1791 iostate->io_issued += io_size;
1792 }
1793
1794 if (flags & CL_READ) {
1795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1796 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1797 } else {
1798 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1799 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1800 }
1801
1802 if (cbp_head) {
1803 cbp_tail->b_trans_next = cbp;
1804 cbp_tail = cbp;
1805 } else {
1806 cbp_head = cbp;
1807 cbp_tail = cbp;
1808
1809 if ((cbp_head->b_real_bp = real_bp)) {
1810 real_bp = (buf_t)NULL;
1811 }
1812 }
1813 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1814
1815 trans_count++;
1816
1817 upl_offset += io_size;
1818 f_offset += io_size;
1819 size -= io_size;
1820 /*
1821 * keep track of how much of the original request
1822 * we've actually completed... non_rounded_size
1823 * may go negative due to us rounding the request
1824 * to a page size multiple (i.e. size > non_rounded_size)
1825 */
1826 non_rounded_size -= io_size;
1827
1828 if (non_rounded_size <= 0) {
1829 /*
1830 * we've transferred all of the data in the original
1831 * request, but we were unable to complete the tail
1832 * of the last page because the file didn't have
1833 * an allocation to back that portion... this is ok.
1834 */
1835 size = 0;
1836 }
1837 if (size == 0) {
1838 /*
1839 * we have no more I/O to issue, so go
1840 * finish the final transaction
1841 */
1842 need_EOT = TRUE;
1843 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1844 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1845 /*
1846 * I/O directed to physically contiguous memory...
1847 * which doesn't have a requirement to make sure we 'fill' a page
1848 * or...
1849 * the current I/O we've prepared fully
1850 * completes the last page in this request
1851 * and ...
1852 * it's either an ASYNC request or
1853 * we've already accumulated more than 8 I/O's into
1854 * this transaction so mark it as complete so that
1855 * it can finish asynchronously or via the cluster_complete_transaction
1856 * below if the request is synchronous
1857 */
1858 need_EOT = TRUE;
1859 }
1860 if (need_EOT == TRUE) {
1861 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1862 }
1863
1864 if (flags & CL_THROTTLE) {
1865 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1866 }
1867
1868 if (!(io_flags & B_READ)) {
1869 vnode_startwrite(vp);
1870 }
1871
1872 if (flags & CL_RAW_ENCRYPTED) {
1873 /*
1874 * User requested raw encrypted bytes.
1875 * Twiddle the bit in the ba_flags for the buffer
1876 */
1877 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1878 }
1879
1880 (void) VNOP_STRATEGY(cbp);
1881
1882 if (need_EOT == TRUE) {
1883 if (!(flags & CL_ASYNC)) {
1884 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1885 }
1886
1887 need_EOT = FALSE;
1888 trans_count = 0;
1889 cbp_head = NULL;
1890 }
1891 }
1892 if (error) {
1893 int abort_size;
1894
1895 io_size = 0;
1896
1897 if (cbp_head) {
1898 /*
1899 * Wait until all of the outstanding I/O
1900 * for this partial transaction has completed
1901 */
1902 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1903
1904 /*
1905 * Rewind the upl offset to the beginning of the
1906 * transaction.
1907 */
1908 upl_offset = cbp_head->b_uploffset;
1909 }
1910
1911 if (ISSET(flags, CL_COMMIT)) {
1912 cluster_handle_associated_upl(iostate, upl, upl_offset,
1913 upl_end_offset - upl_offset);
1914 }
1915
1916 // Free all the IO buffers in this transaction
1917 for (cbp = cbp_head; cbp;) {
1918 buf_t cbp_next;
1919
1920 size += cbp->b_bcount;
1921 io_size += cbp->b_bcount;
1922
1923 cbp_next = cbp->b_trans_next;
1924 free_io_buf(cbp);
1925 cbp = cbp_next;
1926 }
1927
1928 if (iostate) {
1929 int need_wakeup = 0;
1930
1931 /*
1932 * update the error condition for this stream
1933 * since we never really issued the io,
1934 * just go ahead and adjust the issued count back
1935 */
1936 lck_mtx_lock_spin(&iostate->io_mtxp);
1937
1938 if (iostate->io_error == 0) {
1939 iostate->io_error = error;
1940 }
1941 iostate->io_issued -= io_size;
1942
1943 if (iostate->io_wanted) {
1944 /*
1945 * someone is waiting for the state of
1946 * this io stream to change
1947 */
1948 iostate->io_wanted = 0;
1949 need_wakeup = 1;
1950 }
1951 lck_mtx_unlock(&iostate->io_mtxp);
1952
1953 if (need_wakeup) {
1954 wakeup((caddr_t)&iostate->io_wanted);
1955 }
1956 }
1957
1958 if (flags & CL_COMMIT) {
1959 int upl_flags;
1960
1961 pg_offset = upl_offset & PAGE_MASK;
1962 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1963
1964 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
1965
1966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1967 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1968 }
1969 if (retval == 0) {
1970 retval = error;
1971 }
1972 } else if (cbp_head) {
1973 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1974 }
1975
1976 if (real_bp) {
1977 /*
1978 * can get here if we either encountered an error
1979 * or we completely zero-filled the request and
1980 * no I/O was issued
1981 */
1982 if (error) {
1983 real_bp->b_flags |= B_ERROR;
1984 real_bp->b_error = error;
1985 }
1986 buf_biodone(real_bp);
1987 }
1988 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1989
1990 return retval;
1991 }
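/*
 * Editorial sketch (not part of the original source): a standalone
 * illustration of the page-truncation arithmetic cluster_io applies above
 * once a transaction has reached max_trans_count. The helper name is
 * hypothetical; it assumes PAGE_MASK == PAGE_SIZE - 1 with PAGE_SIZE a
 * power of two, as elsewhere in this file.
 */
static vm_offset_t
example_truncate_io_to_page(vm_offset_t upl_offset, vm_offset_t io_size)
{
        vm_offset_t aligned_ofs;

        /* round the end of this buffer down to the nearest page boundary */
        aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;

        /*
         * only truncate when at least one full page is finished off...
         * otherwise the caller keeps adding buffers to the transaction
         */
        if (aligned_ofs > upl_offset) {
                io_size = aligned_ofs - upl_offset;
        }
        return io_size;
}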
1992
1993 #define reset_vector_run_state() \
1994 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
1995
1996 static int
1997 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1998 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1999 {
2000 vector_upl_set_pagelist(vector_upl);
2001
2002 if (io_flag & CL_READ) {
2003 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2004 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2005 } else {
2006 io_flag |= CL_PRESERVE; /*zero fill*/
2007 }
2008 }
2009 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2010 }
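/*
 * Editorial sketch (not part of the original source): the zero-fill decision
 * made above, expressed as a predicate. A vectored read can skip CL_PRESERVE
 * (zero filling) only when it starts at offset 0 and its total size is an
 * exact multiple of PAGE_SIZE. The helper name is hypothetical.
 */
static int
example_vector_read_needs_zero_fill(vm_offset_t vector_upl_offset, int vector_upl_iosize)
{
        return (vector_upl_offset != 0) || ((vector_upl_iosize & PAGE_MASK) != 0);
}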
2011
2012 static int
2013 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2014 {
2015 int pages_in_prefetch;
2016
2017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2018 (int)f_offset, size, (int)filesize, 0, 0);
2019
2020 if (f_offset >= filesize) {
2021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2022 (int)f_offset, 0, 0, 0, 0);
2023 return 0;
2024 }
2025 if ((off_t)size > (filesize - f_offset)) {
2026 size = filesize - f_offset;
2027 }
2028 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2029
2030 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2031
2032 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2033 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2034
2035 return pages_in_prefetch;
2036 }
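/*
 * Editorial sketch (not part of the original source): the clipping and
 * page-rounding performed by cluster_read_prefetch. With 4096-byte pages,
 * a 5000-byte prefetch counts as 2 pages, and a request that would cross
 * filesize is clipped first. The helper name is hypothetical.
 */
static int
example_pages_in_prefetch(off_t f_offset, u_int size, off_t filesize)
{
        if (f_offset >= filesize) {
                return 0;
        }
        if ((off_t)size > (filesize - f_offset)) {
                size = (u_int)(filesize - f_offset);
        }
        return (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
}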
2037
2038
2039
2040 static void
2041 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2042 int bflag)
2043 {
2044 daddr64_t r_addr;
2045 off_t f_offset;
2046 int size_of_prefetch;
2047 u_int max_prefetch;
2048
2049
2050 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2051 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2052
2053 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2055 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2056 return;
2057 }
2058 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2059 rap->cl_ralen = 0;
2060 rap->cl_maxra = 0;
2061
2062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2063 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2064
2065 return;
2066 }
2067 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
2068
2069 if (max_prefetch > speculative_prefetch_max) {
2070 max_prefetch = speculative_prefetch_max;
2071 }
2072
2073 if (max_prefetch <= PAGE_SIZE) {
2074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2075 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2076 return;
2077 }
2078 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2079 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2080 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2081 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2082 return;
2083 }
2084 }
2085 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
2086 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2087
2088 size_of_prefetch = 0;
2089
2090 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2091
2092 if (size_of_prefetch) {
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2094 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2095 return;
2096 }
2097 if (f_offset < filesize) {
2098 daddr64_t read_size;
2099
2100 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2101
2102 read_size = (extent->e_addr + 1) - extent->b_addr;
2103
2104 if (read_size > rap->cl_ralen) {
2105 if (read_size > max_prefetch / PAGE_SIZE) {
2106 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2107 } else {
2108 rap->cl_ralen = read_size;
2109 }
2110 }
2111 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2112
2113 if (size_of_prefetch) {
2114 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2115 }
2116 }
2117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2118 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2119 }
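/*
 * Editorial sketch (not part of the original source): the read-ahead window
 * growth used by cluster_read_ahead above. cl_ralen starts at one page and
 * doubles on each sequential access, capped by max_prefetch expressed in
 * pages. The helper name is hypothetical.
 */
static int
example_grow_readahead_window(int cl_ralen, u_int max_prefetch)
{
        int max_pages = max_prefetch / PAGE_SIZE;

        return cl_ralen ? min(max_pages, cl_ralen << 1) : 1;
}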
2120
2121
2122 int
2123 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2124 int size, off_t filesize, int flags)
2125 {
2126 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2127 }
2128
2129
2130 int
2131 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2132 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2133 {
2134 int io_size;
2135 int rounded_size;
2136 off_t max_size;
2137 int local_flags;
2138
2139 local_flags = CL_PAGEOUT | CL_THROTTLE;
2140
2141 if ((flags & UPL_IOSYNC) == 0) {
2142 local_flags |= CL_ASYNC;
2143 }
2144 if ((flags & UPL_NOCOMMIT) == 0) {
2145 local_flags |= CL_COMMIT;
2146 }
2147 if ((flags & UPL_KEEPCACHED)) {
2148 local_flags |= CL_KEEPCACHED;
2149 }
2150 if (flags & UPL_PAGING_ENCRYPTED) {
2151 local_flags |= CL_ENCRYPTED;
2152 }
2153
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2156 (int)f_offset, size, (int)filesize, local_flags, 0);
2157
2158 /*
2159 * If they didn't specify any I/O, then we are done...
2160 * we can't issue an abort because we don't know how
2161 * big the upl really is
2162 */
2163 if (size <= 0) {
2164 return EINVAL;
2165 }
2166
2167 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2168 if (local_flags & CL_COMMIT) {
2169 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2170 }
2171 return EROFS;
2172 }
2173 /*
2174 * can't page-out to a negative offset
2175 * or if we're starting beyond the EOF
2176 * or if the file offset isn't page aligned
2177 * or the size requested isn't a multiple of PAGE_SIZE
2178 */
2179 if (f_offset < 0 || f_offset >= filesize ||
2180 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2181 if (local_flags & CL_COMMIT) {
2182 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2183 }
2184 return EINVAL;
2185 }
2186 max_size = filesize - f_offset;
2187
2188 if (size < max_size) {
2189 io_size = size;
2190 } else {
2191 io_size = max_size;
2192 }
2193
2194 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2195
2196 if (size > rounded_size) {
2197 if (local_flags & CL_COMMIT) {
2198 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2199 UPL_ABORT_FREE_ON_EMPTY);
2200 }
2201 }
2202 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2203 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2204 }
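/*
 * Editorial note (not part of the original source): a worked example of how
 * cluster_pageout_ext sizes the I/O, assuming 4096-byte pages. io_size is
 * clipped to what remains before EOF, then rounded up to a page boundary;
 * any upl pages beyond the rounded size are aborted rather than written
 * (when CL_COMMIT is set). cluster_pagein_ext applies the same rounding on
 * the read side.
 *
 *   size = 16384, filesize - f_offset = 6000
 *     io_size      = 6000
 *     rounded_size = 8192            two full pages
 *     abort range  = [8192, 16384)   UPL_ABORT_FREE_ON_EMPTY
 */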
2205
2206
2207 int
2208 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2209 int size, off_t filesize, int flags)
2210 {
2211 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2212 }
2213
2214
2215 int
2216 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2217 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2218 {
2219 u_int io_size;
2220 int rounded_size;
2221 off_t max_size;
2222 int retval;
2223 int local_flags = 0;
2224
2225 if (upl == NULL || size < 0) {
2226 panic("cluster_pagein: NULL upl passed in");
2227 }
2228
2229 if ((flags & UPL_IOSYNC) == 0) {
2230 local_flags |= CL_ASYNC;
2231 }
2232 if ((flags & UPL_NOCOMMIT) == 0) {
2233 local_flags |= CL_COMMIT;
2234 }
2235 if (flags & UPL_IOSTREAMING) {
2236 local_flags |= CL_IOSTREAMING;
2237 }
2238 if (flags & UPL_PAGING_ENCRYPTED) {
2239 local_flags |= CL_ENCRYPTED;
2240 }
2241
2242
2243 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2244 (int)f_offset, size, (int)filesize, local_flags, 0);
2245
2246 /*
2247 * can't page-in from a negative offset
2248 * or if we're starting beyond the EOF
2249 * or if the file offset isn't page aligned
2250 * or the size requested isn't a multiple of PAGE_SIZE
2251 */
2252 if (f_offset < 0 || f_offset >= filesize ||
2253 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2254 if (local_flags & CL_COMMIT) {
2255 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2256 }
2257 return EINVAL;
2258 }
2259 max_size = filesize - f_offset;
2260
2261 if (size < max_size) {
2262 io_size = size;
2263 } else {
2264 io_size = max_size;
2265 }
2266
2267 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2268
2269 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2270 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2271 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2272 }
2273
2274 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2275 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2276
2277 return retval;
2278 }
2279
2280
2281 int
2282 cluster_bp(buf_t bp)
2283 {
2284 return cluster_bp_ext(bp, NULL, NULL);
2285 }
2286
2287
2288 int
2289 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2290 {
2291 off_t f_offset;
2292 int flags;
2293
2294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2295 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2296
2297 if (bp->b_flags & B_READ) {
2298 flags = CL_ASYNC | CL_READ;
2299 } else {
2300 flags = CL_ASYNC;
2301 }
2302 if (bp->b_flags & B_PASSIVE) {
2303 flags |= CL_PASSIVE;
2304 }
2305
2306 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2307
2308 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2309 }
2310
2311
2312
2313 int
2314 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2315 {
2316 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2317 }
2318
2319
2320 int
2321 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2322 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2323 {
2324 user_ssize_t cur_resid;
2325 int retval = 0;
2326 int flags;
2327 int zflags;
2328 int bflag;
2329 int write_type = IO_COPY;
2330 u_int32_t write_length;
2331
2332 flags = xflags;
2333
2334 if (flags & IO_PASSIVE) {
2335 bflag = CL_PASSIVE;
2336 } else {
2337 bflag = 0;
2338 }
2339
2340 if (vp->v_flag & VNOCACHE_DATA) {
2341 flags |= IO_NOCACHE;
2342 bflag |= CL_NOCACHE;
2343 }
2344 if (uio == NULL) {
2345 /*
2346 * no user data...
2347 * this call is being made to zero-fill some range in the file
2348 */
2349 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2350
2351 return retval;
2352 }
2353 /*
2354 * do a write through the cache if one of the following is true....
2355 * NOCACHE is not true or NODIRECT is true
2356 * the uio request doesn't target USERSPACE
2357 * otherwise, find out if we want the direct or contig variant for
2358 * the first vector in the uio request
2359 */
2360 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2361 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2362 }
2363
2364 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2365 /*
2366 * must go through the cached variant in this case
2367 */
2368 write_type = IO_COPY;
2369 }
2370
2371 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2372 switch (write_type) {
2373 case IO_COPY:
2374 /*
2375 * make sure the uio_resid isn't too big...
2376 * internally, we want to handle all of the I/O in
2377 * chunk sizes that fit in a 32 bit int
2378 */
2379 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2380 /*
2381 * we're going to have to call cluster_write_copy
2382 * more than once...
2383 *
2384 * only want the last call to cluster_write_copy to
2385 * have the IO_TAILZEROFILL flag set and only the
2386 * first call should have IO_HEADZEROFILL
2387 */
2388 zflags = flags & ~IO_TAILZEROFILL;
2389 flags &= ~IO_HEADZEROFILL;
2390
2391 write_length = MAX_IO_REQUEST_SIZE;
2392 } else {
2393 /*
2394 * last call to cluster_write_copy
2395 */
2396 zflags = flags;
2397
2398 write_length = (u_int32_t)cur_resid;
2399 }
2400 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2401 break;
2402
2403 case IO_CONTIG:
2404 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2405
2406 if (flags & IO_HEADZEROFILL) {
2407 /*
2408 * only do this once per request
2409 */
2410 flags &= ~IO_HEADZEROFILL;
2411
2412 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2413 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2414 if (retval) {
2415 break;
2416 }
2417 }
2418 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2419
2420 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2421 /*
2422 * we're done with the data from the user specified buffer(s)
2423 * and we've been requested to zero fill at the tail
2424 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2425 * by rearranging the args and passing in IO_HEADZEROFILL
2426 */
2427 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2428 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2429 }
2430 break;
2431
2432 case IO_DIRECT:
2433 /*
2434 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2435 */
2436 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2437 break;
2438
2439 case IO_UNKNOWN:
2440 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2441 break;
2442 }
2443 /*
2444 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2445 * multiple times to service a multi-vector request that is not aligned properly
2446 * we need to update the oldEOF so that we
2447 * don't zero-fill the head of a page if we've successfully written
2448 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2449 * page that is beyond the oldEOF if the write is unaligned... we only
2450 * want that to happen for the very first page of the cluster_write,
2451 * NOT the first page of each vector making up a multi-vector write.
2452 */
2453 if (uio->uio_offset > oldEOF) {
2454 oldEOF = uio->uio_offset;
2455 }
2456 }
2457 return retval;
2458 }
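/*
 * Editorial sketch (not part of the original source): the zero-fill flag
 * handling cluster_write_ext applies when a cached write is split into
 * MAX_IO_REQUEST_SIZE chunks. Only the first chunk may head-zero-fill and
 * only the last chunk may tail-zero-fill. The helper name is hypothetical.
 */
static int
example_chunk_zflags(int *flags, int is_last_chunk)
{
        int zflags;

        if (is_last_chunk) {
                zflags = *flags;                   /* final chunk keeps IO_TAILZEROFILL */
        } else {
                zflags = *flags & ~IO_TAILZEROFILL;
                *flags &= ~IO_HEADZEROFILL;        /* only the first chunk head-fills */
        }
        return zflags;
}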
2459
2460
2461 static int
2462 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2463 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2464 {
2465 upl_t upl;
2466 upl_page_info_t *pl;
2467 vm_offset_t upl_offset;
2468 vm_offset_t vector_upl_offset = 0;
2469 u_int32_t io_req_size;
2470 u_int32_t offset_in_file;
2471 u_int32_t offset_in_iovbase;
2472 u_int32_t io_size;
2473 int io_flag = 0;
2474 upl_size_t upl_size, vector_upl_size = 0;
2475 vm_size_t upl_needed_size;
2476 mach_msg_type_number_t pages_in_pl;
2477 upl_control_flags_t upl_flags;
2478 kern_return_t kret;
2479 mach_msg_type_number_t i;
2480 int force_data_sync;
2481 int retval = 0;
2482 int first_IO = 1;
2483 struct clios iostate;
2484 user_addr_t iov_base;
2485 u_int32_t mem_alignment_mask;
2486 u_int32_t devblocksize;
2487 u_int32_t max_io_size;
2488 u_int32_t max_upl_size;
2489 u_int32_t max_vector_size;
2490 u_int32_t bytes_outstanding_limit;
2491 boolean_t io_throttled = FALSE;
2492
2493 u_int32_t vector_upl_iosize = 0;
2494 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2495 off_t v_upl_uio_offset = 0;
2496 int vector_upl_index = 0;
2497 upl_t vector_upl = NULL;
2498
2499
2500 /*
2501 * When we enter this routine, we know
2502 * -- the resid will not exceed iov_len
2503 */
2504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2505 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2506
2507 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2508
2509 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2510
2511 if (flags & IO_PASSIVE) {
2512 io_flag |= CL_PASSIVE;
2513 }
2514
2515 if (flags & IO_NOCACHE) {
2516 io_flag |= CL_NOCACHE;
2517 }
2518
2519 if (flags & IO_SKIP_ENCRYPTION) {
2520 io_flag |= CL_ENCRYPTED;
2521 }
2522
2523 iostate.io_completed = 0;
2524 iostate.io_issued = 0;
2525 iostate.io_error = 0;
2526 iostate.io_wanted = 0;
2527
2528 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2529
2530 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2531 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2532
2533 if (devblocksize == 1) {
2534 /*
2535 * the AFP client advertises a devblocksize of 1
2536 * however, its BLOCKMAP routine maps to physical
2537 * blocks that are PAGE_SIZE in size...
2538 * therefore we can't ask for I/Os that aren't page aligned
2539 * or aren't multiples of PAGE_SIZE in size...
2540 * by setting devblocksize to PAGE_SIZE, we re-instate
2541 * the old behavior we had before the mem_alignment_mask
2542 * changes went in...
2543 */
2544 devblocksize = PAGE_SIZE;
2545 }
2546
2547 next_dwrite:
2548 io_req_size = *write_length;
2549 iov_base = uio_curriovbase(uio);
2550
2551 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2552 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2553
2554 if (offset_in_file || offset_in_iovbase) {
2555 /*
2556 * one of the 2 important offsets is misaligned
2557 * so fire an I/O through the cache for this entire vector
2558 */
2559 goto wait_for_dwrites;
2560 }
2561 if (iov_base & (devblocksize - 1)) {
2562 /*
2563 * the offset in memory must be on a device block boundary
2564 * so that we can guarantee that we can generate an
2565 * I/O that ends on a page boundary in cluster_io
2566 */
2567 goto wait_for_dwrites;
2568 }
2569
2570 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2571 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2572 int throttle_type;
2573
2574 if ((throttle_type = cluster_is_throttled(vp))) {
2575 /*
2576 * we're in the throttle window, at the very least
2577 * we want to limit the size of the I/O we're about
2578 * to issue
2579 */
2580 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2581 /*
2582 * we're in the throttle window and at least 1 I/O
2583 * has already been issued by a throttleable thread
2584 * in this window, so return with EAGAIN to indicate
2585 * to the FS issuing the cluster_write call that it
2586 * should now throttle after dropping any locks
2587 */
2588 throttle_info_update_by_mount(vp->v_mount);
2589
2590 io_throttled = TRUE;
2591 goto wait_for_dwrites;
2592 }
2593 max_vector_size = THROTTLE_MAX_IOSIZE;
2594 max_io_size = THROTTLE_MAX_IOSIZE;
2595 } else {
2596 max_vector_size = MAX_VECTOR_UPL_SIZE;
2597 max_io_size = max_upl_size;
2598 }
2599
2600 if (first_IO) {
2601 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2602 first_IO = 0;
2603 }
2604 io_size = io_req_size & ~PAGE_MASK;
2605 iov_base = uio_curriovbase(uio);
2606
2607 if (io_size > max_io_size) {
2608 io_size = max_io_size;
2609 }
2610
2611 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2612 /*
2613 * We have an iov_base that's not page-aligned.
2614 * Issue all I/O's that have been collected within
2615 * this Vectored UPL.
2616 */
2617 if (vector_upl_index) {
2618 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2619 reset_vector_run_state();
2620 }
2621
2622 /*
2623 * After this point, if we are using the Vector UPL path and the base is
2624 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2625 */
2626 }
2627
2628 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2629 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2630
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2632 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2633
2634 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2635 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2636 pages_in_pl = 0;
2637 upl_size = upl_needed_size;
2638 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2639 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2640
2641 kret = vm_map_get_upl(map,
2642 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2643 &upl_size,
2644 &upl,
2645 NULL,
2646 &pages_in_pl,
2647 &upl_flags,
2648 VM_KERN_MEMORY_FILE,
2649 force_data_sync);
2650
2651 if (kret != KERN_SUCCESS) {
2652 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2653 0, 0, 0, kret, 0);
2654 /*
2655 * failed to get pagelist
2656 *
2657 * we may have already spun some portion of this request
2658 * off as async requests... we need to wait for the I/O
2659 * to complete before returning
2660 */
2661 goto wait_for_dwrites;
2662 }
2663 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2664 pages_in_pl = upl_size / PAGE_SIZE;
2665
2666 for (i = 0; i < pages_in_pl; i++) {
2667 if (!upl_valid_page(pl, i)) {
2668 break;
2669 }
2670 }
2671 if (i == pages_in_pl) {
2672 break;
2673 }
2674
2675 /*
2676 * didn't get all the pages back that we
2677 * needed... release this upl and try again
2678 */
2679 ubc_upl_abort(upl, 0);
2680 }
2681 if (force_data_sync >= 3) {
2682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2683 i, pages_in_pl, upl_size, kret, 0);
2684 /*
2685 * for some reason, we couldn't acquire a hold on all
2686 * the pages needed in the user's address space
2687 *
2688 * we may have already spun some portion of this request
2689 * off as async requests... we need to wait for the I/O
2690 * to complete before returning
2691 */
2692 goto wait_for_dwrites;
2693 }
2694
2695 /*
2696 * Consider the possibility that upl_size wasn't satisfied.
2697 */
2698 if (upl_size < upl_needed_size) {
2699 if (upl_size && upl_offset == 0) {
2700 io_size = upl_size;
2701 } else {
2702 io_size = 0;
2703 }
2704 }
2705 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2706 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2707
2708 if (io_size == 0) {
2709 ubc_upl_abort(upl, 0);
2710 /*
2711 * we may have already spun some portion of this request
2712 * off as async requests... we need to wait for the I/O
2713 * to complete before returning
2714 */
2715 goto wait_for_dwrites;
2716 }
2717
2718 if (useVectorUPL) {
2719 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2720 if (end_off) {
2721 issueVectorUPL = 1;
2722 }
2723 /*
2724 * After this point, if we are using a vector UPL, then
2725 * either all the UPL elements end on a page boundary OR
2726 * this UPL is the last element because it does not end
2727 * on a page boundary.
2728 */
2729 }
2730
2731 /*
2732 * we want to push out these writes asynchronously so that we can overlap
2733 * the preparation of the next I/O...
2734 * if there are already too many outstanding writes,
2735 * wait until some complete before issuing the next
2736 */
2737 if (vp->v_mount->mnt_minsaturationbytecount) {
2738 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2739 } else {
2740 bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
2741 }
2742
2743 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2744
2745 if (iostate.io_error) {
2746 /*
2747 * one of the earlier writes we issued ran into a hard error
2748 * don't issue any more writes, cleanup the UPL
2749 * that was just created but not used, then
2750 * go wait for all writes that are part of this stream
2751 * to complete before returning the error to the caller
2752 */
2753 ubc_upl_abort(upl, 0);
2754
2755 goto wait_for_dwrites;
2756 }
2757
2758 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2759 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2760
2761 if (!useVectorUPL) {
2762 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2763 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2764 } else {
2765 if (!vector_upl_index) {
2766 vector_upl = vector_upl_create(upl_offset);
2767 v_upl_uio_offset = uio->uio_offset;
2768 vector_upl_offset = upl_offset;
2769 }
2770
2771 vector_upl_set_subupl(vector_upl, upl, upl_size);
2772 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2773 vector_upl_index++;
2774 vector_upl_iosize += io_size;
2775 vector_upl_size += upl_size;
2776
2777 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2778 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2779 reset_vector_run_state();
2780 }
2781 }
2782
2783 /*
2784 * update the uio structure to
2785 * reflect the I/O that we just issued
2786 */
2787 uio_update(uio, (user_size_t)io_size);
2788
2789 /*
2790 * in case we end up calling through to cluster_write_copy to finish
2791 * the tail of this request, we need to update the oldEOF so that we
2792 * don't zero-fill the head of a page if we've successfully written
2793 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2794 * page that is beyond the oldEOF if the write is unaligned... we only
2795 * want that to happen for the very first page of the cluster_write,
2796 * NOT the first page of each vector making up a multi-vector write.
2797 */
2798 if (uio->uio_offset > oldEOF) {
2799 oldEOF = uio->uio_offset;
2800 }
2801
2802 io_req_size -= io_size;
2803
2804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2805 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2806 } /* end while */
2807
2808 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2809 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2810
2811 if (retval == 0 && *write_type == IO_DIRECT) {
2812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2813 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2814
2815 goto next_dwrite;
2816 }
2817 }
2818
2819 wait_for_dwrites:
2820
2821 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2822 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2823 reset_vector_run_state();
2824 }
2825 /*
2826 * make sure all async writes issued as part of this stream
2827 * have completed before we return
2828 */
2829 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2830
2831 if (iostate.io_error) {
2832 retval = iostate.io_error;
2833 }
2834
2835 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2836
2837 if (io_throttled == TRUE && retval == 0) {
2838 retval = EAGAIN;
2839 }
2840
2841 if (io_req_size && retval == 0) {
2842 /*
2843 * we couldn't handle the tail of this request in DIRECT mode
2844 * so fire it through the copy path
2845 *
2846 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2847 * so we can just pass 0 in for the headOff and tailOff
2848 */
2849 if (uio->uio_offset > oldEOF) {
2850 oldEOF = uio->uio_offset;
2851 }
2852
2853 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2854
2855 *write_type = IO_UNKNOWN;
2856 }
2857 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2858 (int)uio->uio_offset, io_req_size, retval, 4, 0);
2859
2860 return retval;
2861 }
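/*
 * Editorial sketch (not part of the original source): the alignment checks
 * that gate the direct-write path above. If any of them fail, the request
 * falls back to the cached (copy) path. The helper name is hypothetical.
 */
static int
example_dwrite_is_aligned(off_t uio_offset, user_addr_t iov_base,
    u_int32_t mem_alignment_mask, u_int32_t devblocksize)
{
        if ((u_int32_t)uio_offset & PAGE_MASK) {
                return 0;       /* file offset not page aligned */
        }
        if ((u_int32_t)iov_base & mem_alignment_mask) {
                return 0;       /* user buffer misaligned for the device */
        }
        if (iov_base & (devblocksize - 1)) {
                return 0;       /* user buffer not on a device block boundary */
        }
        return 1;
}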
2862
2863
2864 static int
2865 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2866 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2867 {
2868 upl_page_info_t *pl;
2869 addr64_t src_paddr = 0;
2870 upl_t upl[MAX_VECTS];
2871 vm_offset_t upl_offset;
2872 u_int32_t tail_size = 0;
2873 u_int32_t io_size;
2874 u_int32_t xsize;
2875 upl_size_t upl_size;
2876 vm_size_t upl_needed_size;
2877 mach_msg_type_number_t pages_in_pl;
2878 upl_control_flags_t upl_flags;
2879 kern_return_t kret;
2880 struct clios iostate;
2881 int error = 0;
2882 int cur_upl = 0;
2883 int num_upl = 0;
2884 int n;
2885 user_addr_t iov_base;
2886 u_int32_t devblocksize;
2887 u_int32_t mem_alignment_mask;
2888
2889 /*
2890 * When we enter this routine, we know
2891 * -- the io_req_size will not exceed iov_len
2892 * -- the target address is physically contiguous
2893 */
2894 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2895
2896 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2897 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2898
2899 iostate.io_completed = 0;
2900 iostate.io_issued = 0;
2901 iostate.io_error = 0;
2902 iostate.io_wanted = 0;
2903
2904 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2905
2906 next_cwrite:
2907 io_size = *write_length;
2908
2909 iov_base = uio_curriovbase(uio);
2910
2911 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2912 upl_needed_size = upl_offset + io_size;
2913
2914 pages_in_pl = 0;
2915 upl_size = upl_needed_size;
2916 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2917 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2918
2919 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2920 kret = vm_map_get_upl(map,
2921 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2922 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
2923
2924 if (kret != KERN_SUCCESS) {
2925 /*
2926 * failed to get pagelist
2927 */
2928 error = EINVAL;
2929 goto wait_for_cwrites;
2930 }
2931 num_upl++;
2932
2933 /*
2934 * Consider the possibility that upl_size wasn't satisfied.
2935 */
2936 if (upl_size < upl_needed_size) {
2937 /*
2938 * This is a failure in the physical memory case.
2939 */
2940 error = EINVAL;
2941 goto wait_for_cwrites;
2942 }
2943 pl = ubc_upl_pageinfo(upl[cur_upl]);
2944
2945 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
2946
2947 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2948 u_int32_t head_size;
2949
2950 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2951
2952 if (head_size > io_size) {
2953 head_size = io_size;
2954 }
2955
2956 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2957
2958 if (error) {
2959 goto wait_for_cwrites;
2960 }
2961
2962 upl_offset += head_size;
2963 src_paddr += head_size;
2964 io_size -= head_size;
2965
2966 iov_base += head_size;
2967 }
2968 if ((u_int32_t)iov_base & mem_alignment_mask) {
2969 /*
2970 * the request doesn't start on a memory boundary
2971 * that the underlying DMA engine can handle...
2972 * return an error instead of going through
2973 * the slow copy path since the intent of this
2974 * path is direct I/O from device memory
2975 */
2976 error = EINVAL;
2977 goto wait_for_cwrites;
2978 }
2979
2980 tail_size = io_size & (devblocksize - 1);
2981 io_size -= tail_size;
2982
2983 while (io_size && error == 0) {
2984 if (io_size > MAX_IO_CONTIG_SIZE) {
2985 xsize = MAX_IO_CONTIG_SIZE;
2986 } else {
2987 xsize = io_size;
2988 }
2989 /*
2990 * request asynchronously so that we can overlap
2991 * the preparation of the next I/O... we'll do
2992 * the commit after all the I/O has completed
2993 * since it's all issued against the same UPL...
2994 * if there are already too many outstanding writes,
2995 * wait until some have completed before issuing the next
2996 */
2997 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2998
2999 if (iostate.io_error) {
3000 /*
3001 * one of the earlier writes we issued ran into a hard error
3002 * don't issue any more writes...
3003 * go wait for all writes that are part of this stream
3004 * to complete before returning the error to the caller
3005 */
3006 goto wait_for_cwrites;
3007 }
3008 /*
3009 * issue an asynchronous write to cluster_io
3010 */
3011 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3012 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
3013
3014 if (error == 0) {
3015 /*
3016 * The cluster_io write completed successfully,
3017 * update the uio structure
3018 */
3019 uio_update(uio, (user_size_t)xsize);
3020
3021 upl_offset += xsize;
3022 src_paddr += xsize;
3023 io_size -= xsize;
3024 }
3025 }
3026 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3027 error = cluster_io_type(uio, write_type, write_length, 0);
3028
3029 if (error == 0 && *write_type == IO_CONTIG) {
3030 cur_upl++;
3031 goto next_cwrite;
3032 }
3033 } else {
3034 *write_type = IO_UNKNOWN;
3035 }
3036
3037 wait_for_cwrites:
3038 /*
3039 * make sure all async writes that are part of this stream
3040 * have completed before we proceed
3041 */
3042 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
3043
3044 if (iostate.io_error) {
3045 error = iostate.io_error;
3046 }
3047
3048 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
3049
3050 if (error == 0 && tail_size) {
3051 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3052 }
3053
3054 for (n = 0; n < num_upl; n++) {
3055 /*
3056 * just release our hold on each physically contiguous
3057 * region without changing any state
3058 */
3059 ubc_upl_abort(upl[n], 0);
3060 }
3061
3062 return error;
3063 }
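/*
 * Editorial note (not part of the original source): a worked example of how
 * cluster_write_contig splits a request around device block boundaries,
 * assuming a 512-byte devblocksize.
 *
 *   uio_offset = 1000, io_size = 3000
 *     head_size = 512 - (1000 & 511) = 24     handled via cluster_align_phys_io
 *     middle    = (3000 - 24) & ~511 = 2560   issued as CL_DEV_MEMORY I/O
 *     tail_size = (3000 - 24) & 511  = 416    handled via cluster_align_phys_io
 */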
3064
3065
3066 /*
3067 * need to avoid a race between an msync of a range of pages dirtied via mmap
3068 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3069 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3070 *
3071 * we should never force-zero-fill pages that are already valid in the cache...
3072 * the entire page contains valid data (either from disk, zero-filled or dirtied
3073 * via an mmap) so we can only do damage by trying to zero-fill
3074 *
3075 */
3076 static int
3077 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3078 {
3079 int zero_pg_index;
3080 boolean_t need_cluster_zero = TRUE;
3081
3082 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3083 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3084 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3085
3086 if (upl_valid_page(pl, zero_pg_index)) {
3087 /*
3088 * never force zero valid pages - dirty or clean
3089 * we'll leave these in the UPL for cluster_write_copy to deal with
3090 */
3091 need_cluster_zero = FALSE;
3092 }
3093 }
3094 if (need_cluster_zero == TRUE) {
3095 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3096 }
3097
3098 return bytes_to_zero;
3099 }
3100
3101
3102 void
3103 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3104 {
3105 struct cl_extent cl;
3106 boolean_t first_pass = TRUE;
3107
3108 assert(s_offset < e_offset);
3109 assert((s_offset & PAGE_MASK_64) == 0);
3110 assert((e_offset & PAGE_MASK_64) == 0);
3111
3112 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3113 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3114
3115 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3116 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3117 }
3118
3119
3120 static void
3121 cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3122 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3123 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3124 {
3125 struct cl_writebehind *wbp;
3126 int cl_index;
3127 int ret_cluster_try_push;
3128 u_int max_cluster_pgcount;
3129
3130
3131 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3132
3133 /*
3134 * take the lock to protect our accesses
3135 * of the writebehind and sparse cluster state
3136 */
3137 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3138
3139 if (wbp->cl_scmap) {
3140 if (!(flags & IO_NOCACHE)) {
3141 /*
3142 * we've fallen into the sparse
3143 * cluster method of delaying dirty pages
3144 */
3145 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3146
3147 lck_mtx_unlock(&wbp->cl_lockw);
3148 return;
3149 }
3150 /*
3151 * must have done cached writes that fell into
3152 * the sparse cluster mechanism... we've switched
3153 * to uncached writes on the file, so go ahead
3154 * and push whatever's in the sparse map
3155 * and switch back to normal clustering
3156 */
3157 wbp->cl_number = 0;
3158
3159 sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3160 /*
3161 * no clusters of either type present at this point
3162 * so just go directly to start_new_cluster since
3163 * we know we need to delay this I/O since we've
3164 * already released the pages back into the cache
3165 * to avoid the deadlock with sparse_cluster_push
3166 */
3167 goto start_new_cluster;
3168 }
3169 if (*first_pass == TRUE) {
3170 if (write_off == wbp->cl_last_write) {
3171 wbp->cl_seq_written += write_cnt;
3172 } else {
3173 wbp->cl_seq_written = write_cnt;
3174 }
3175
3176 wbp->cl_last_write = write_off + write_cnt;
3177
3178 *first_pass = FALSE;
3179 }
3180 if (wbp->cl_number == 0) {
3181 /*
3182 * no clusters currently present
3183 */
3184 goto start_new_cluster;
3185 }
3186
3187 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3188 /*
3189 * check each cluster that we currently hold
3190 * try to merge some or all of this write into
3191 * one or more of the existing clusters... if
3192 * any portion of the write remains, start a
3193 * new cluster
3194 */
3195 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3196 /*
3197 * the current write starts at or after the current cluster
3198 */
3199 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3200 /*
3201 * we have a write that fits entirely
3202 * within the existing cluster limits
3203 */
3204 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3205 /*
3206 * update our idea of where the cluster ends
3207 */
3208 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3209 }
3210 break;
3211 }
3212 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3213 /*
3214 * we have a write that starts in the middle of the current cluster
3215 * but extends beyond the cluster's limit... we know this because
3216 * of the previous checks
3217 * we'll extend the current cluster to the max
3218 * and update the b_addr for the current write to reflect that
3219 * the head of it was absorbed into this cluster...
3220 * note that we'll always have a leftover tail in this case since
3221 * full absorption would have occurred in the clause above
3222 */
3223 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3224
3225 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3226 }
3227 /*
3228 * we come here for the case where the current write starts
3229 * beyond the limit of the existing cluster or we have a leftover
3230 * tail after a partial absorption
3231 *
3232 * in either case, we'll check the remaining clusters before
3233 * starting a new one
3234 */
3235 } else {
3236 /*
3237 * the current write starts in front of the cluster we're currently considering
3238 */
3239 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3240 /*
3241 * we can just merge the new request into
3242 * this cluster and leave it in the cache
3243 * since the resulting cluster is still
3244 * less than the maximum allowable size
3245 */
3246 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3247
3248 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3249 /*
3250 * the current write completely
3251 * envelops the existing cluster and since
3252 * each write is limited to at most max_cluster_pgcount pages
3253 * we can just use the start and last blocknos of the write
3254 * to generate the cluster limits
3255 */
3256 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3257 }
3258 break;
3259 }
3260 /*
3261 * if we were to combine this write with the current cluster
3262 * we would exceed the cluster size limit.... so,
3263 * let's see if there's any overlap of the new I/O with
3264 * the cluster we're currently considering... in fact, we'll
3265 * stretch the cluster out to its full limit and see if we
3266 * get an intersection with the current write
3267 *
3268 */
3269 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3270 /*
3271 * the current write extends into the proposed cluster
3272 * clip the length of the current write after first combining its
3273 * tail with the newly shaped cluster
3274 */
3275 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3276
3277 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3278 }
3279 /*
3280 * if we get here, there was no way to merge
3281 * any portion of this write with this cluster
3282 * or we could only merge part of it which
3283 * will leave a tail...
3284 * we'll check the remaining clusters before starting a new one
3285 */
3286 }
3287 }
3288 if (cl_index < wbp->cl_number) {
3289 /*
3290 * we found an existing cluster(s) that we
3291 * could entirely merge this I/O into
3292 */
3293 goto delay_io;
3294 }
3295
3296 if (defer_writes == FALSE &&
3297 wbp->cl_number == MAX_CLUSTERS &&
3298 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3299 uint32_t n;
3300
3301 if (vp->v_mount->mnt_minsaturationbytecount) {
3302 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3303
3304 if (n > MAX_CLUSTERS) {
3305 n = MAX_CLUSTERS;
3306 }
3307 } else {
3308 n = 0;
3309 }
3310
3311 if (n == 0) {
3312 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3313 n = WRITE_BEHIND_SSD;
3314 } else {
3315 n = WRITE_BEHIND;
3316 }
3317 }
3318 while (n--) {
3319 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
3320 }
3321 }
3322 if (wbp->cl_number < MAX_CLUSTERS) {
3323 /*
3324 * we didn't find an existing cluster to
3325 * merge into, but there's room to start
3326 * a new one
3327 */
3328 goto start_new_cluster;
3329 }
3330 /*
3331 * no existing cluster to merge with and no
3332 * room to start a new one... we'll try
3333 * pushing one of the existing ones... if none of
3334 * them are able to be pushed, we'll switch
3335 * to the sparse cluster mechanism
3336 * cluster_try_push updates cl_number to the
3337 * number of remaining clusters... and
3338 * returns the number of currently unused clusters
3339 */
3340 ret_cluster_try_push = 0;
3341
3342 /*
3343 * if writes are not deferred, call cluster push immediately
3344 */
3345 if (defer_writes == FALSE) {
3346 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
3347 }
3348 /*
3349 * execute the following regardless of whether writes are deferred
3350 */
3351 if (ret_cluster_try_push == 0) {
3352 /*
3353 * no more room in the normal cluster mechanism
3354 * so let's switch to the more expansive but expensive
3355 * sparse mechanism....
3356 */
3357 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
3358 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3359
3360 lck_mtx_unlock(&wbp->cl_lockw);
3361 return;
3362 }
3363 start_new_cluster:
3364 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3365 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3366
3367 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3368
3369 if (flags & IO_NOCACHE) {
3370 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3371 }
3372
3373 if (flags & IO_PASSIVE) {
3374 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3375 }
3376
3377 wbp->cl_number++;
3378 delay_io:
3379 lck_mtx_unlock(&wbp->cl_lockw);
3380 return;
3381 }
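/*
 * Editorial sketch (not part of the original source): the simplest merge test
 * used by cluster_update_state_internal above. A write [b_addr, e_addr] fits
 * entirely inside an existing cluster when it starts at or after the cluster
 * start and ends within max_cluster_pgcount pages of that start. The helper
 * name is hypothetical.
 */
static boolean_t
example_write_fits_cluster(daddr64_t write_b_addr, daddr64_t write_e_addr,
    daddr64_t cluster_b_addr, u_int max_cluster_pgcount)
{
        return (write_b_addr >= cluster_b_addr) &&
            (write_e_addr <= (cluster_b_addr + max_cluster_pgcount));
}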
3382
3383
3384 static int
3385 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3386 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3387 {
3388 upl_page_info_t *pl;
3389 upl_t upl;
3390 vm_offset_t upl_offset = 0;
3391 vm_size_t upl_size;
3392 off_t upl_f_offset;
3393 int pages_in_upl;
3394 int start_offset;
3395 int xfer_resid;
3396 int io_size;
3397 int io_offset;
3398 int bytes_to_zero;
3399 int bytes_to_move;
3400 kern_return_t kret;
3401 int retval = 0;
3402 int io_resid;
3403 long long total_size;
3404 long long zero_cnt;
3405 off_t zero_off;
3406 long long zero_cnt1;
3407 off_t zero_off1;
3408 off_t write_off = 0;
3409 int write_cnt = 0;
3410 boolean_t first_pass = FALSE;
3411 struct cl_extent cl;
3412 int bflag;
3413 u_int max_io_size;
3414
3415 if (uio) {
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3417 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3418
3419 io_resid = io_req_size;
3420 } else {
3421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3422 0, 0, (int)oldEOF, (int)newEOF, 0);
3423
3424 io_resid = 0;
3425 }
3426 if (flags & IO_PASSIVE) {
3427 bflag = CL_PASSIVE;
3428 } else {
3429 bflag = 0;
3430 }
3431 if (flags & IO_NOCACHE) {
3432 bflag |= CL_NOCACHE;
3433 }
3434
3435 if (flags & IO_SKIP_ENCRYPTION) {
3436 bflag |= CL_ENCRYPTED;
3437 }
3438
3439 zero_cnt = 0;
3440 zero_cnt1 = 0;
3441 zero_off = 0;
3442 zero_off1 = 0;
3443
3444 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3445
3446 if (flags & IO_HEADZEROFILL) {
3447 /*
3448 * some filesystems (HFS is one) don't support unallocated holes within a file...
3449 * so we zero fill the intervening space between the old EOF and the offset
3450 * where the next chunk of real data begins.... ftruncate will also use this
3451 * routine to zero fill to the new EOF when growing a file... in this case, the
3452 * uio structure will not be provided
3453 */
3454 if (uio) {
3455 if (headOff < uio->uio_offset) {
3456 zero_cnt = uio->uio_offset - headOff;
3457 zero_off = headOff;
3458 }
3459 } else if (headOff < newEOF) {
3460 zero_cnt = newEOF - headOff;
3461 zero_off = headOff;
3462 }
3463 } else {
3464 if (uio && uio->uio_offset > oldEOF) {
3465 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3466
3467 if (zero_off >= oldEOF) {
3468 zero_cnt = uio->uio_offset - zero_off;
3469
3470 flags |= IO_HEADZEROFILL;
3471 }
3472 }
3473 }
3474 if (flags & IO_TAILZEROFILL) {
3475 if (uio) {
3476 zero_off1 = uio->uio_offset + io_req_size;
3477
3478 if (zero_off1 < tailOff) {
3479 zero_cnt1 = tailOff - zero_off1;
3480 }
3481 }
3482 } else {
3483 if (uio && newEOF > oldEOF) {
3484 zero_off1 = uio->uio_offset + io_req_size;
3485
3486 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3487 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3488
3489 flags |= IO_TAILZEROFILL;
3490 }
3491 }
3492 }
3493 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3494 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3495 retval, 0, 0, 0, 0);
3496 return 0;
3497 }
3498 if (uio) {
3499 write_off = uio->uio_offset;
3500 write_cnt = uio_resid(uio);
3501 /*
3502 * delay updating the sequential write info
3503 * in the control block until we've obtained
3504 * the lock for it
3505 */
3506 first_pass = TRUE;
3507 }
3508 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3509 /*
3510 * for this iteration of the loop, figure out where our starting point is
3511 */
3512 if (zero_cnt) {
3513 start_offset = (int)(zero_off & PAGE_MASK_64);
3514 upl_f_offset = zero_off - start_offset;
3515 } else if (io_resid) {
3516 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3517 upl_f_offset = uio->uio_offset - start_offset;
3518 } else {
3519 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3520 upl_f_offset = zero_off1 - start_offset;
3521 }
3522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3523 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3524
3525 if (total_size > max_io_size) {
3526 total_size = max_io_size;
3527 }
3528
3529 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3530
3531 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3532 /*
3533 * assumption... total_size <= io_resid
3534 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3535 */
3536 if ((start_offset + total_size) > max_io_size) {
3537 total_size = max_io_size - start_offset;
3538 }
3539 xfer_resid = total_size;
3540
3541 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3542
3543 if (retval) {
3544 break;
3545 }
3546
3547 io_resid -= (total_size - xfer_resid);
3548 total_size = xfer_resid;
3549 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3550 upl_f_offset = uio->uio_offset - start_offset;
3551
3552 if (total_size == 0) {
3553 if (start_offset) {
3554 /*
3555 * the write did not finish on a page boundary
3556 * which will leave upl_f_offset pointing to the
3557 * beginning of the last page written instead of
3558 * the page beyond it... bump it in this case
3559 * so that the cluster code records the last page
3560 * written as dirty
3561 */
3562 upl_f_offset += PAGE_SIZE_64;
3563 }
3564 upl_size = 0;
3565
3566 goto check_cluster;
3567 }
3568 }
3569 /*
3570 * compute the size of the upl needed to encompass
3571 * the requested write... limit each call to cluster_io
3572 * to the maximum UPL size... cluster_io will clip if
3573 * this exceeds the maximum io_size for the device...
3574 * make sure to account for
3575 * a starting offset that's not page aligned
3576 */
3577 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3578
3579 if (upl_size > max_io_size) {
3580 upl_size = max_io_size;
3581 }
3582
3583 pages_in_upl = upl_size / PAGE_SIZE;
3584 io_size = upl_size - start_offset;
3585
3586 if ((long long)io_size > total_size) {
3587 io_size = total_size;
3588 }
3589
3590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3591
3592
3593 /*
3594 * Gather the pages from the buffer cache.
3595 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3596 * that we intend to modify these pages.
3597 */
3598 kret = ubc_create_upl_kernel(vp,
3599 upl_f_offset,
3600 upl_size,
3601 &upl,
3602 &pl,
3603 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3604 VM_KERN_MEMORY_FILE);
3605 if (kret != KERN_SUCCESS) {
3606 panic("cluster_write_copy: failed to get pagelist");
3607 }
3608
3609 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3610 upl, (int)upl_f_offset, start_offset, 0, 0);
3611
3612 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3613 int read_size;
3614
3615 /*
3616 * we're starting in the middle of the first page of the upl
3617 * and the page isn't currently valid, so we're going to have
3618 * to read it in first... this is a synchronous operation
3619 */
3620 read_size = PAGE_SIZE;
3621
3622 if ((upl_f_offset + read_size) > oldEOF) {
3623 read_size = oldEOF - upl_f_offset;
3624 }
3625
3626 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3627 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3628 if (retval) {
3629 /*
3630 * we had an error during the read which causes us to abort
3631 * the current cluster_write request... before we do, we need
3632 * to release the rest of the pages in the upl without modifying
3633 * their state and mark the failed page in error
3634 */
3635 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3636
3637 if (upl_size > PAGE_SIZE) {
3638 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3639 }
3640
3641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3642 upl, 0, 0, retval, 0);
3643 break;
3644 }
3645 }
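/*
 * illustrative sketch (assuming a 4KB PAGE_SIZE): a write of 100 bytes at
 * file offset 4196 has start_offset == 100, so the first page of the upl
 * (file offsets 4096..8191) is only partially overwritten... if that page
 * isn't already valid and 4096 is below oldEOF, the synchronous CL_READ
 * above pulls it in first so the untouched leading 100 bytes keep their
 * on-disk contents
 */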
3646 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3647 /*
3648 * the last offset we're writing to in this upl does not end on a page
3649 * boundary... if it's not beyond the old EOF, then we'll also need to
3650 * pre-read this page in if it isn't already valid
3651 */
3652 upl_offset = upl_size - PAGE_SIZE;
3653
3654 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3655 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
3656 int read_size;
3657
3658 read_size = PAGE_SIZE;
3659
3660 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3661 read_size = oldEOF - (upl_f_offset + upl_offset);
3662 }
3663
3664 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3665 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3666 if (retval) {
3667 /*
3668 * we had an error during the read which causes us to abort
3669 * the current cluster_write request... before we do, we
3670 * need to release the rest of the pages in the upl without
3671 * modifying their state and mark the failed page in error
3672 */
3673 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3674
3675 if (upl_size > PAGE_SIZE) {
3676 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3677 }
3678
3679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3680 upl, 0, 0, retval, 0);
3681 break;
3682 }
3683 }
3684 }
3685 xfer_resid = io_size;
3686 io_offset = start_offset;
3687
3688 while (zero_cnt && xfer_resid) {
3689 if (zero_cnt < (long long)xfer_resid) {
3690 bytes_to_zero = zero_cnt;
3691 } else {
3692 bytes_to_zero = xfer_resid;
3693 }
3694
3695 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3696
3697 xfer_resid -= bytes_to_zero;
3698 zero_cnt -= bytes_to_zero;
3699 zero_off += bytes_to_zero;
3700 io_offset += bytes_to_zero;
3701 }
3702 if (xfer_resid && io_resid) {
3703 u_int32_t io_requested;
3704
3705 bytes_to_move = min(io_resid, xfer_resid);
3706 io_requested = bytes_to_move;
3707
3708 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3709
3710 if (retval) {
3711 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3712
3713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3714 upl, 0, 0, retval, 0);
3715 } else {
3716 io_resid -= bytes_to_move;
3717 xfer_resid -= bytes_to_move;
3718 io_offset += bytes_to_move;
3719 }
3720 }
3721 while (xfer_resid && zero_cnt1 && retval == 0) {
3722 if (zero_cnt1 < (long long)xfer_resid) {
3723 bytes_to_zero = zero_cnt1;
3724 } else {
3725 bytes_to_zero = xfer_resid;
3726 }
3727
3728 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3729
3730 xfer_resid -= bytes_to_zero;
3731 zero_cnt1 -= bytes_to_zero;
3732 zero_off1 += bytes_to_zero;
3733 io_offset += bytes_to_zero;
3734 }
3735 if (retval == 0) {
3736 int do_zeroing = 1;
3737
3738 io_size += start_offset;
3739
3740 /* Force more restrictive zeroing behavior only on APFS */
3741 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3742 do_zeroing = 0;
3743 }
3744
3745 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3746 /*
3747 * if we're extending the file with this write
3748 * we'll zero fill the rest of the page so that
3749 * if the file gets extended again in such a way as to leave a
3750 * hole starting at this EOF, we'll have zeros in the correct spot
3751 */
3752 cluster_zero(upl, io_size, upl_size - io_size, NULL);
3753 }
3754 /*
3755 * release the upl now if we hold one since...
3756 * 1) pages in it may be present in the sparse cluster map
3757 * and may span 2 separate buckets there... if they do and
3758 * we happen to have to flush a bucket to make room and it intersects
3759 * this upl, a deadlock may result on page BUSY
3760 * 2) we're delaying the I/O... from this point forward we're just updating
3761 * the cluster state... no need to hold the pages, so commit them
3762 * 3) IO_SYNC is set...
3763 * because we had to ask for a UPL that provides currently non-present pages, the
3764 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3765 * upon committing it... this is not the behavior we want since it's possible for
3766 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3767 * we'll pick these pages back up later with the correct behavior specified.
3768 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3769 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3770 * we hold since the flushing context is holding the cluster lock.
3771 */
3772 ubc_upl_commit_range(upl, 0, upl_size,
3773 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3774 check_cluster:
3775 /*
3776 * calculate the last logical block number
3777 * that this delayed I/O encompassed
3778 */
3779 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3780
3781 if (flags & IO_SYNC) {
3782 /*
3783 * if the IO_SYNC flag is set then we need to bypass
3784 * any clustering and immediately issue the I/O
3785 *
3786 * we don't hold the lock at this point
3787 *
3788 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3789 * so that we correctly deal with a change in state of the hardware modify bit...
3790 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3791 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3792 * responsible for generating the correct sized I/O(s)
3793 */
3794 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3795 } else {
3796 boolean_t defer_writes = FALSE;
3797
3798 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3799 defer_writes = TRUE;
3800 }
3801
3802 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3803 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3804 }
3805 }
3806 }
3807 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3808
3809 return retval;
3810 }
3811
3812
3813
3814 int
3815 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3816 {
3817 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3818 }
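/*
 * illustrative sketch: a filesystem's read vnop typically just forwards to
 * cluster_read, passing its own notion of the current file size (so the
 * cluster layer can clip the request) and its IO_* flags... the helper
 * below is hypothetical and only shows the calling convention
 *
 *	static int
 *	example_vnop_read(vnode_t vp, struct uio *uio, int io_flags, off_t filesize)
 *	{
 *		return cluster_read(vp, uio, filesize, io_flags);
 *	}
 */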
3819
3820
3821 int
3822 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3823 {
3824 int retval = 0;
3825 int flags;
3826 user_ssize_t cur_resid;
3827 u_int32_t io_size;
3828 u_int32_t read_length = 0;
3829 int read_type = IO_COPY;
3830
3831 flags = xflags;
3832
3833 if (vp->v_flag & VNOCACHE_DATA) {
3834 flags |= IO_NOCACHE;
3835 }
3836 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3837 flags |= IO_RAOFF;
3838 }
3839
3840 if (flags & IO_SKIP_ENCRYPTION) {
3841 flags |= IO_ENCRYPTED;
3842 }
3843
3844 /*
3845 * do a read through the cache if one of the following is true....
3846 * NOCACHE is not true
3847 * the uio request doesn't target USERSPACE
3848 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3849 * Reading encrypted data from a CP filesystem should never result in the data touching
3850 * the UBC.
3851 *
3852 * otherwise, find out if we want the direct or contig variant for
3853 * the first vector in the uio request
3854 */
3855 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
3856 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3857 }
3858
3859 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3860 switch (read_type) {
3861 case IO_COPY:
3862 /*
3863 * make sure the uio_resid isn't too big...
3864 * internally, we want to handle all of the I/O in
3865 * chunk sizes that fit in a 32 bit int
3866 */
3867 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3868 io_size = MAX_IO_REQUEST_SIZE;
3869 } else {
3870 io_size = (u_int32_t)cur_resid;
3871 }
3872
3873 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3874 break;
3875
3876 case IO_DIRECT:
3877 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3878 break;
3879
3880 case IO_CONTIG:
3881 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3882 break;
3883
3884 case IO_UNKNOWN:
3885 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3886 break;
3887 }
3888 }
3889 return retval;
3890 }
3891
3892
3893
3894 static void
3895 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
3896 {
3897 int range;
3898 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3899
3900 if ((range = last_pg - start_pg)) {
3901 if (take_reference) {
3902 abort_flags |= UPL_ABORT_REFERENCE;
3903 }
3904
3905 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3906 }
3907 }
3908
3909
3910 static int
3911 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3912 {
3913 upl_page_info_t *pl;
3914 upl_t upl;
3915 vm_offset_t upl_offset;
3916 u_int32_t upl_size;
3917 off_t upl_f_offset;
3918 int start_offset;
3919 int start_pg;
3920 int last_pg;
3921 int uio_last = 0;
3922 int pages_in_upl;
3923 off_t max_size;
3924 off_t last_ioread_offset;
3925 off_t last_request_offset;
3926 kern_return_t kret;
3927 int error = 0;
3928 int retval = 0;
3929 u_int32_t size_of_prefetch;
3930 u_int32_t xsize;
3931 u_int32_t io_size;
3932 u_int32_t max_rd_size;
3933 u_int32_t max_io_size;
3934 u_int32_t max_prefetch;
3935 u_int rd_ahead_enabled = 1;
3936 u_int prefetch_enabled = 1;
3937 struct cl_readahead * rap;
3938 struct clios iostate;
3939 struct cl_extent extent;
3940 int bflag;
3941 int take_reference = 1;
3942 int policy = IOPOL_DEFAULT;
3943 boolean_t iolock_inited = FALSE;
3944
3945 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3946 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3947
3948 if (flags & IO_ENCRYPTED) {
3949 panic("encrypted blocks will hit UBC!");
3950 }
3951
3952 policy = throttle_get_io_policy(NULL);
3953
3954 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
3955 take_reference = 0;
3956 }
3957
3958 if (flags & IO_PASSIVE) {
3959 bflag = CL_PASSIVE;
3960 } else {
3961 bflag = 0;
3962 }
3963
3964 if (flags & IO_NOCACHE) {
3965 bflag |= CL_NOCACHE;
3966 }
3967
3968 if (flags & IO_SKIP_ENCRYPTION) {
3969 bflag |= CL_ENCRYPTED;
3970 }
3971
3972 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3973 max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
3974 max_rd_size = max_prefetch;
3975
3976 last_request_offset = uio->uio_offset + io_req_size;
3977
3978 if (last_request_offset > filesize) {
3979 last_request_offset = filesize;
3980 }
3981
3982 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3983 rd_ahead_enabled = 0;
3984 rap = NULL;
3985 } else {
3986 if (cluster_is_throttled(vp)) {
3987 /*
3988 * we're in the throttle window, at the very least
3989 * we want to limit the size of the I/O we're about
3990 * to issue
3991 */
3992 rd_ahead_enabled = 0;
3993 prefetch_enabled = 0;
3994
3995 max_rd_size = THROTTLE_MAX_IOSIZE;
3996 }
3997 if ((rap = cluster_get_rap(vp)) == NULL) {
3998 rd_ahead_enabled = 0;
3999 } else {
4000 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4001 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4002 }
4003 }
4004 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4005 /*
4006 * determine if we already have a read-ahead in the pipe courtesy of the
4007 * last read system call that was issued...
4008 * if so, pick up its extent to determine where we should start
4009 * with respect to any read-ahead that might be necessary to
4010 * garner all the data needed to complete this read system call
4011 */
4012 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4013
4014 if (last_ioread_offset < uio->uio_offset) {
4015 last_ioread_offset = (off_t)0;
4016 } else if (last_ioread_offset > last_request_offset) {
4017 last_ioread_offset = last_request_offset;
4018 }
4019 } else {
4020 last_ioread_offset = (off_t)0;
4021 }
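/*
 * illustrative sketch (assuming 4KB pages): cl_maxra holds the page number
 * of the last block the read-ahead engine has already issued I/O for... if
 * cl_maxra == 9, data through file offset (9 * 4096) + 4096 == 40960 is
 * already in flight, so last_ioread_offset starts at 40960 and only bytes
 * beyond that point may still need a prefetch for this request
 */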
4022
4023 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4024 max_size = filesize - uio->uio_offset;
4025
4026 if ((off_t)(io_req_size) < max_size) {
4027 io_size = io_req_size;
4028 } else {
4029 io_size = max_size;
4030 }
4031
4032 if (!(flags & IO_NOCACHE)) {
4033 while (io_size) {
4034 u_int32_t io_resid;
4035 u_int32_t io_requested;
4036
4037 /*
4038 * if we keep finding the pages we need already in the cache, then
4039 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4040 * to determine that we have all the pages we need... once we miss in
4041 * the cache and have issued an I/O, then we'll assume that we're likely
4042 * to continue to miss in the cache and it's to our advantage to try and prefetch
4043 */
4044 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
4045 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4046 /*
4047 * we've already issued I/O for this request and
4048 * there's still work to do and
4049 * our prefetch stream is running dry, so issue a
4050 * pre-fetch I/O... the I/O latency will overlap
4051 * with the copying of the data
4052 */
4053 if (size_of_prefetch > max_rd_size) {
4054 size_of_prefetch = max_rd_size;
4055 }
4056
4057 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4058
4059 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4060
4061 if (last_ioread_offset > last_request_offset) {
4062 last_ioread_offset = last_request_offset;
4063 }
4064 }
4065 }
4066 /*
4067 * limit the size of the copy we're about to do so that
4068 * we can notice that our I/O pipe is running dry and
4069 * get the next I/O issued before it does go dry
4070 */
4071 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4072 io_resid = (max_io_size / 4);
4073 } else {
4074 io_resid = io_size;
4075 }
4076
4077 io_requested = io_resid;
4078
4079 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4080
4081 xsize = io_requested - io_resid;
4082
4083 io_size -= xsize;
4084 io_req_size -= xsize;
4085
4086 if (retval || io_resid) {
4087 /*
4088 * if we run into a real error or
4089 * a page that is not in the cache
4090 * we need to leave streaming mode
4091 */
4092 break;
4093 }
4094
4095 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4096 /*
4097 * we've already finished the I/O for this read request
4098 * let's see if we should do a read-ahead
4099 */
4100 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4101 }
4102 }
4103 if (retval) {
4104 break;
4105 }
4106 if (io_size == 0) {
4107 if (rap != NULL) {
4108 if (extent.e_addr < rap->cl_lastr) {
4109 rap->cl_maxra = 0;
4110 }
4111 rap->cl_lastr = extent.e_addr;
4112 }
4113 break;
4114 }
4115 /*
4116 * recompute max_size since cluster_copy_ubc_data_internal
4117 * may have advanced uio->uio_offset
4118 */
4119 max_size = filesize - uio->uio_offset;
4120 }
4121
4122 iostate.io_completed = 0;
4123 iostate.io_issued = 0;
4124 iostate.io_error = 0;
4125 iostate.io_wanted = 0;
4126
4127 if ((flags & IO_RETURN_ON_THROTTLE)) {
4128 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4129 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4130 /*
4131 * we're in the throttle window and at least 1 I/O
4132 * has already been issued by a throttleable thread
4133 * in this window, so return with EAGAIN to indicate
4134 * to the FS issuing the cluster_read call that it
4135 * should now throttle after dropping any locks
4136 */
4137 throttle_info_update_by_mount(vp->v_mount);
4138
4139 retval = EAGAIN;
4140 break;
4141 }
4142 }
4143 }
4144
4145 /*
4146 * compute the size of the upl needed to encompass
4147 * the requested read... limit each call to cluster_io
4148 * to the maximum UPL size... cluster_io will clip if
4149 * this exceeds the maximum io_size for the device,
4150 * make sure to account for
4151 * a starting offset that's not page aligned
4152 */
4153 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4154 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4155
4156 if (io_size > max_rd_size) {
4157 io_size = max_rd_size;
4158 }
4159
4160 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4161
4162 if (flags & IO_NOCACHE) {
4163 if (upl_size > max_io_size) {
4164 upl_size = max_io_size;
4165 }
4166 } else {
4167 if (upl_size > max_io_size / 4) {
4168 upl_size = max_io_size / 4;
4169 upl_size &= ~PAGE_MASK;
4170
4171 if (upl_size == 0) {
4172 upl_size = PAGE_SIZE;
4173 }
4174 }
4175 }
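/*
 * illustrative sketch: for a cached read the upl is deliberately capped at
 * a quarter of max_io_size so the copy loop comes back around often enough
 * to keep the prefetch pipeline primed... e.g. with a 1MB max_io_size and
 * 4KB pages, a 600KB cached request is handled in 256KB (64 page) upls,
 * while an IO_NOCACHE read of the same size could be covered by a single
 * upl of up to the full 1MB
 */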
4176 pages_in_upl = upl_size / PAGE_SIZE;
4177
4178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4179 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4180
4181 kret = ubc_create_upl_kernel(vp,
4182 upl_f_offset,
4183 upl_size,
4184 &upl,
4185 &pl,
4186 UPL_FILE_IO | UPL_SET_LITE,
4187 VM_KERN_MEMORY_FILE);
4188 if (kret != KERN_SUCCESS) {
4189 panic("cluster_read_copy: failed to get pagelist");
4190 }
4191
4192 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4193 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4194
4195 /*
4196 * scan from the beginning of the upl looking for the first
4197 * non-valid page.... this will become the first page in
4198 * the request we're going to make to 'cluster_io'... if all
4199 * of the pages are valid, we won't call through to 'cluster_io'
4200 */
4201 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4202 if (!upl_valid_page(pl, start_pg)) {
4203 break;
4204 }
4205 }
4206
4207 /*
4208 * scan from the starting invalid page looking for a valid
4209 * page before the end of the upl is reached, if we
4210 * find one, then it will be the last page of the request to
4211 * 'cluster_io'
4212 */
4213 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4214 if (upl_valid_page(pl, last_pg)) {
4215 break;
4216 }
4217 }
4218
4219 if (start_pg < last_pg) {
4220 /*
4221 * we found a range of 'invalid' pages that must be filled
4222 * if the last page in this range is the last page of the file
4223 * we may have to clip the size of it to keep from reading past
4224 * the end of the last physical block associated with the file
4225 */
4226 if (iolock_inited == FALSE) {
4227 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4228
4229 iolock_inited = TRUE;
4230 }
4231 upl_offset = start_pg * PAGE_SIZE;
4232 io_size = (last_pg - start_pg) * PAGE_SIZE;
4233
4234 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4235 io_size = filesize - (upl_f_offset + upl_offset);
4236 }
4237
4238 /*
4239 * issue an asynchronous read to cluster_io
4240 */
4241
4242 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4243 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4244
4245 if (rap) {
4246 if (extent.e_addr < rap->cl_maxra) {
4247 /*
4248 * we've just issued a read for a block that should have been
4249 * in the cache courtesy of the read-ahead engine... something
4250 * has gone wrong with the pipeline, so reset the read-ahead
4251 * logic which will cause us to restart from scratch
4252 */
4253 rap->cl_maxra = 0;
4254 }
4255 }
4256 }
4257 if (error == 0) {
4258 /*
4259 * if the read completed successfully, or there was no I/O request
4260 * issued, then copy the data into user land via 'cluster_copy_upl_data'
4261 * we'll first add on any 'valid'
4262 * pages that were present in the upl when we acquired it.
4263 */
4264 u_int val_size;
4265
4266 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4267 if (!upl_valid_page(pl, uio_last)) {
4268 break;
4269 }
4270 }
4271 if (uio_last < pages_in_upl) {
4272 /*
4273 * there were some invalid pages beyond the valid pages
4274 * that we didn't issue an I/O for, just release them
4275 * unchanged now, so that any prefetch/readahead can
4276 * include them
4277 */
4278 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4279 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4280 }
4281
4282 /*
4283 * compute size to transfer this round, if io_req_size is
4284 * still non-zero after this attempt, we'll loop around and
4285 * set up for another I/O.
4286 */
4287 val_size = (uio_last * PAGE_SIZE) - start_offset;
4288
4289 if (val_size > max_size) {
4290 val_size = max_size;
4291 }
4292
4293 if (val_size > io_req_size) {
4294 val_size = io_req_size;
4295 }
4296
4297 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4298 last_ioread_offset = uio->uio_offset + val_size;
4299 }
4300
4301 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4302 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4303 /*
4304 * if there's still I/O left to do for this request, and...
4305 * we're not in hard throttle mode, and...
4306 * we're close to using up the previous prefetch, then issue a
4307 * new pre-fetch I/O... the I/O latency will overlap
4308 * with the copying of the data
4309 */
4310 if (size_of_prefetch > max_rd_size) {
4311 size_of_prefetch = max_rd_size;
4312 }
4313
4314 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4315
4316 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4317
4318 if (last_ioread_offset > last_request_offset) {
4319 last_ioread_offset = last_request_offset;
4320 }
4321 }
4322 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4323 /*
4324 * this transfer will finish this request, so...
4325 * let's try to read ahead if we're in
4326 * a sequential access pattern and we haven't
4327 * explicitly disabled it
4328 */
4329 if (rd_ahead_enabled) {
4330 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4331 }
4332
4333 if (rap != NULL) {
4334 if (extent.e_addr < rap->cl_lastr) {
4335 rap->cl_maxra = 0;
4336 }
4337 rap->cl_lastr = extent.e_addr;
4338 }
4339 }
4340 if (iolock_inited == TRUE) {
4341 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4342 }
4343
4344 if (iostate.io_error) {
4345 error = iostate.io_error;
4346 } else {
4347 u_int32_t io_requested;
4348
4349 io_requested = val_size;
4350
4351 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4352
4353 io_req_size -= (val_size - io_requested);
4354 }
4355 } else {
4356 if (iolock_inited == TRUE) {
4357 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4358 }
4359 }
4360 if (start_pg < last_pg) {
4361 /*
4362 * compute the range of pages that we actually issued an I/O for
4363 * and either commit them as valid if the I/O succeeded
4364 * or abort them if the I/O failed or we're not supposed to
4365 * keep them in the cache
4366 */
4367 io_size = (last_pg - start_pg) * PAGE_SIZE;
4368
4369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4370
4371 if (error || (flags & IO_NOCACHE)) {
4372 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4373 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4374 } else {
4375 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4376
4377 if (take_reference) {
4378 commit_flags |= UPL_COMMIT_INACTIVATE;
4379 } else {
4380 commit_flags |= UPL_COMMIT_SPECULATE;
4381 }
4382
4383 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4384 }
4385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4386 }
4387 if ((last_pg - start_pg) < pages_in_upl) {
4388 /*
4389 * the set of pages that we issued an I/O for did not encompass
4390 * the entire upl... so just release these without modifying
4391 * their state
4392 */
4393 if (error) {
4394 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4395 } else {
4396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4397 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4398
4399 /*
4400 * handle any valid pages at the beginning of
4401 * the upl... release these appropriately
4402 */
4403 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4404
4405 /*
4406 * handle any valid pages immediately after the
4407 * pages we issued I/O for... release these appropriately
4408 */
4409 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4410
4411 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4412 }
4413 }
4414 if (retval == 0) {
4415 retval = error;
4416 }
4417
4418 if (io_req_size) {
4419 if (cluster_is_throttled(vp)) {
4420 /*
4421 * we're in the throttle window, at the very least
4422 * we want to limit the size of the I/O we're about
4423 * to issue
4424 */
4425 rd_ahead_enabled = 0;
4426 prefetch_enabled = 0;
4427 max_rd_size = THROTTLE_MAX_IOSIZE;
4428 } else {
4429 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4430 /*
4431 * coming out of throttled state
4432 */
4433 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4434 if (rap != NULL) {
4435 rd_ahead_enabled = 1;
4436 }
4437 prefetch_enabled = 1;
4438 }
4439 max_rd_size = max_prefetch;
4440 last_ioread_offset = 0;
4441 }
4442 }
4443 }
4444 }
4445 if (iolock_inited == TRUE) {
4446 /*
4447 * cluster_io returned an error after it
4448 * had already issued some I/O. we need
4449 * to wait for that I/O to complete before
4450 * we can destroy the iostate mutex...
4451 * 'retval' already contains the early error
4452 * so no need to pick it up from iostate.io_error
4453 */
4454 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4455
4456 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4457 }
4458 if (rap != NULL) {
4459 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4460 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4461
4462 lck_mtx_unlock(&rap->cl_lockr);
4463 } else {
4464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4465 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4466 }
4467
4468 return retval;
4469 }
4470
4471 /*
4472 * We don't want another read/write lock for every vnode in the system
4473 * so we keep a hash of them here. There should never be very many of
4474 * these around at any point in time.
4475 */
4476 cl_direct_read_lock_t *
4477 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4478 {
4479 struct cl_direct_read_locks *head
4480 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4481 % CL_DIRECT_READ_LOCK_BUCKETS];
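/*
 * illustrative sketch: the bucket index is just the vnode pointer scaled
 * down by the object size and folded into the fixed bucket count, roughly
 *
 *	index = ((uintptr_t)vp / sizeof(struct vnode)) % CL_DIRECT_READ_LOCK_BUCKETS;
 *
 * scaling by sizeof(*vp) means vnodes allocated back to back tend to land
 * in different buckets
 */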
4482
4483 struct cl_direct_read_lock *lck, *new_lck = NULL;
4484
4485 for (;;) {
4486 lck_spin_lock(&cl_direct_read_spin_lock);
4487
4488 LIST_FOREACH(lck, head, chain) {
4489 if (lck->vp == vp) {
4490 ++lck->ref_count;
4491 lck_spin_unlock(&cl_direct_read_spin_lock);
4492 if (new_lck) {
4493 // Someone beat us to it, ditch the allocation
4494 lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4495 FREE(new_lck, M_TEMP);
4496 }
4497 lck_rw_lock(&lck->rw_lock, type);
4498 return lck;
4499 }
4500 }
4501
4502 if (new_lck) {
4503 // Use the lock we allocated
4504 LIST_INSERT_HEAD(head, new_lck, chain);
4505 lck_spin_unlock(&cl_direct_read_spin_lock);
4506 lck_rw_lock(&new_lck->rw_lock, type);
4507 return new_lck;
4508 }
4509
4510 lck_spin_unlock(&cl_direct_read_spin_lock);
4511
4512 // Allocate a new lock
4513 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
4514 M_TEMP, M_WAITOK);
4515 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4516 new_lck->vp = vp;
4517 new_lck->ref_count = 1;
4518
4519 // Got to go round again
4520 }
4521 }
4522
4523 void
4524 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4525 {
4526 lck_rw_done(&lck->rw_lock);
4527
4528 lck_spin_lock(&cl_direct_read_spin_lock);
4529 if (lck->ref_count == 1) {
4530 LIST_REMOVE(lck, chain);
4531 lck_spin_unlock(&cl_direct_read_spin_lock);
4532 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4533 FREE(lck, M_TEMP);
4534 } else {
4535 --lck->ref_count;
4536 lck_spin_unlock(&cl_direct_read_spin_lock);
4537 }
4538 }
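/*
 * illustrative sketch: the pair above is used bracket-style around the
 * cache check in cluster_read_direct... schematically, a caller that needs
 * the pages to stay stable until its I/O is issued would do
 *
 *	cl_direct_read_lock_t *lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
 *	... check the UBC / build the upl / hand off to cluster_io ...
 *	cluster_unlock_direct_read(lock);
 *
 * a client that wants exclusivity between the check and the issue would
 * pass LCK_RW_TYPE_EXCLUSIVE instead
 */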
4539
4540 static int
4541 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4542 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4543 {
4544 upl_t upl;
4545 upl_page_info_t *pl;
4546 off_t max_io_size;
4547 vm_offset_t upl_offset, vector_upl_offset = 0;
4548 upl_size_t upl_size, vector_upl_size = 0;
4549 vm_size_t upl_needed_size;
4550 unsigned int pages_in_pl;
4551 upl_control_flags_t upl_flags;
4552 kern_return_t kret;
4553 unsigned int i;
4554 int force_data_sync;
4555 int retval = 0;
4556 int no_zero_fill = 0;
4557 int io_flag = 0;
4558 int misaligned = 0;
4559 struct clios iostate;
4560 user_addr_t iov_base;
4561 u_int32_t io_req_size;
4562 u_int32_t offset_in_file;
4563 u_int32_t offset_in_iovbase;
4564 u_int32_t io_size;
4565 u_int32_t io_min;
4566 u_int32_t xsize;
4567 u_int32_t devblocksize;
4568 u_int32_t mem_alignment_mask;
4569 u_int32_t max_upl_size;
4570 u_int32_t max_rd_size;
4571 u_int32_t max_rd_ahead;
4572 u_int32_t max_vector_size;
4573 boolean_t io_throttled = FALSE;
4574
4575 u_int32_t vector_upl_iosize = 0;
4576 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4577 off_t v_upl_uio_offset = 0;
4578 int vector_upl_index = 0;
4579 upl_t vector_upl = NULL;
4580 cl_direct_read_lock_t *lock = NULL;
4581
4582 user_addr_t orig_iov_base = 0;
4583 user_addr_t last_iov_base = 0;
4584 user_addr_t next_iov_base = 0;
4585
4586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4587 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4588
4589 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4590
4591 max_rd_size = max_upl_size;
4592 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4593
4594 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4595
4596 if (flags & IO_PASSIVE) {
4597 io_flag |= CL_PASSIVE;
4598 }
4599
4600 if (flags & IO_ENCRYPTED) {
4601 io_flag |= CL_RAW_ENCRYPTED;
4602 }
4603
4604 if (flags & IO_NOCACHE) {
4605 io_flag |= CL_NOCACHE;
4606 }
4607
4608 if (flags & IO_SKIP_ENCRYPTION) {
4609 io_flag |= CL_ENCRYPTED;
4610 }
4611
4612 iostate.io_completed = 0;
4613 iostate.io_issued = 0;
4614 iostate.io_error = 0;
4615 iostate.io_wanted = 0;
4616
4617 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4618
4619 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4620 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4621
4622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4623 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4624
4625 if (devblocksize == 1) {
4626 /*
4627 * the AFP client advertises a devblocksize of 1
4628 * however, its BLOCKMAP routine maps to physical
4629 * blocks that are PAGE_SIZE in size...
4630 * therefore we can't ask for I/Os that aren't page aligned
4631 * or aren't multiples of PAGE_SIZE in size
4632 * by setting devblocksize to PAGE_SIZE, we re-instate
4633 * the old behavior we had before the mem_alignment_mask
4634 * changes went in...
4635 */
4636 devblocksize = PAGE_SIZE;
4637 }
4638
4639 orig_iov_base = uio_curriovbase(uio);
4640 last_iov_base = orig_iov_base;
4641
4642 next_dread:
4643 io_req_size = *read_length;
4644 iov_base = uio_curriovbase(uio);
4645
4646 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4647 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4648
4649 if (offset_in_file || offset_in_iovbase) {
4650 /*
4651 * one of the 2 important offsets is misaligned
4652 * so fire an I/O through the cache for this entire vector
4653 */
4654 misaligned = 1;
4655 }
4656 if (iov_base & (devblocksize - 1)) {
4657 /*
4658 * the offset in memory must be on a device block boundary
4659 * so that we can guarantee that we can generate an
4660 * I/O that ends on a page boundary in cluster_io
4661 */
4662 misaligned = 1;
4663 }
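/*
 * illustrative sketch: e.g. with devblocksize == 512 and
 * mem_alignment_mask == 3, a direct read at file offset 4097
 * (offset_in_file == 1), or into a user buffer whose address ends in
 * ...1002 (offset_in_iovbase == 2), or into a buffer address that isn't a
 * multiple of 512, is flagged as misaligned and pushed through the cached
 * copy path instead
 */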
4664
4665 max_io_size = filesize - uio->uio_offset;
4666
4667 /*
4668 * The user must request IO in aligned chunks. If the
4669 * offset into the file is bad, or the userland pointer
4670 * is non-aligned, then we cannot service the encrypted IO request.
4671 */
4672 if (flags & IO_ENCRYPTED) {
4673 if (misaligned || (io_req_size & (devblocksize - 1))) {
4674 retval = EINVAL;
4675 }
4676
4677 max_io_size = roundup(max_io_size, devblocksize);
4678 }
4679
4680 if ((off_t)io_req_size > max_io_size) {
4681 io_req_size = max_io_size;
4682 }
4683
4684 /*
4685 * When we get to this point, we know...
4686 * -- the offset into the file is on a devblocksize boundary
4687 */
4688
4689 while (io_req_size && retval == 0) {
4690 u_int32_t io_start;
4691
4692 if (cluster_is_throttled(vp)) {
4693 /*
4694 * we're in the throttle window, at the very least
4695 * we want to limit the size of the I/O we're about
4696 * to issue
4697 */
4698 max_rd_size = THROTTLE_MAX_IOSIZE;
4699 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4700 max_vector_size = THROTTLE_MAX_IOSIZE;
4701 } else {
4702 max_rd_size = max_upl_size;
4703 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4704 max_vector_size = MAX_VECTOR_UPL_SIZE;
4705 }
4706 io_start = io_size = io_req_size;
4707
4708 /*
4709 * First look for pages already in the cache
4710 * and move them to user space. But only do this
4711 * check if we are not retrieving encrypted data directly
4712 * from the filesystem; those blocks should never
4713 * be in the UBC.
4714 *
4715 * cluster_copy_ubc_data returns the resid
4716 * in io_size
4717 */
4718 if ((flags & IO_ENCRYPTED) == 0) {
4719 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4720 }
4721 /*
4722 * calculate the number of bytes actually copied
4723 * starting size - residual
4724 */
4725 xsize = io_start - io_size;
4726
4727 io_req_size -= xsize;
4728
4729 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4730 /*
4731 * We found something in the cache or we have an iov_base that's not
4732 * page-aligned.
4733 *
4734 * Issue all I/O's that have been collected within this Vectored UPL.
4735 */
4736 if (vector_upl_index) {
4737 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4738 reset_vector_run_state();
4739 }
4740
4741 if (xsize) {
4742 useVectorUPL = 0;
4743 }
4744
4745 /*
4746 * After this point, if we are using the Vector UPL path and the base is
4747 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4748 */
4749 }
4750
4751 /*
4752 * check to see if we are finished with this request.
4753 *
4754 * If we satisfied this IO already, then io_req_size will be 0.
4755 * Otherwise, see if the IO was mis-aligned and needs to go through
4756 * the UBC to deal with the 'tail'.
4757 *
4758 */
4759 if (io_req_size == 0 || (misaligned)) {
4760 /*
4761 * see if there's another uio vector to
4762 * process that's of type IO_DIRECT
4763 *
4764 * break out of while loop to get there
4765 */
4766 break;
4767 }
4768 /*
4769 * assume the request ends on a device block boundary
4770 */
4771 io_min = devblocksize;
4772
4773 /*
4774 * we can handle I/O's in multiples of the device block size
4775 * however, if io_size isn't a multiple of devblocksize we
4776 * want to clip it back to the nearest page boundary since
4777 * we are going to have to go through cluster_read_copy to
4778 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4779 * multiple, we avoid asking the drive for the same physical
4780 * blocks twice.. once for the partial page at the end of the
4781 * request and a 2nd time for the page we read into the cache
4782 * (which overlaps the end of the direct read) in order to
4783 * get at the overhang bytes
4784 */
4785 if (io_size & (devblocksize - 1)) {
4786 assert(!(flags & IO_ENCRYPTED));
4787 /*
4788 * Clip the request to the previous page size boundary
4789 * since request does NOT end on a device block boundary
4790 */
4791 io_size &= ~PAGE_MASK;
4792 io_min = PAGE_SIZE;
4793 }
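/*
 * illustrative sketch (assuming 4KB pages and devblocksize == 512): a
 * remaining direct request of 10100 bytes is not a multiple of 512, so it
 * is clipped to 8192 bytes here and the final 1908 bytes become the 'tail'
 * that gets completed through cluster_read_copy once the direct I/Os have
 * drained
 */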
4794 if (retval || io_size < io_min) {
4795 /*
4796 * either an error or we only have the tail left to
4797 * complete via the copy path...
4798 * we may have already spun some portion of this request
4799 * off as async requests... we need to wait for the I/O
4800 * to complete before returning
4801 */
4802 goto wait_for_dreads;
4803 }
4804
4805 /*
4806 * Don't re-check the UBC data if we are looking for uncached IO
4807 * or asking for encrypted blocks.
4808 */
4809 if ((flags & IO_ENCRYPTED) == 0) {
4810 if ((xsize = io_size) > max_rd_size) {
4811 xsize = max_rd_size;
4812 }
4813
4814 io_size = 0;
4815
4816 if (!lock) {
4817 /*
4818 * We hold a lock here between the time we check the
4819 * cache and the time we issue I/O. This saves us
4820 * from having to lock the pages in the cache. Not
4821 * all clients will care about this lock but some
4822 * clients may want to guarantee stability between
4823 * here and when the I/O is issued in which case they
4824 * will take the lock exclusively.
4825 */
4826 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4827 }
4828
4829 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4830
4831 if (io_size == 0) {
4832 /*
4833 * a page must have just come into the cache
4834 * since the first page in this range is no
4835 * longer absent, go back and re-evaluate
4836 */
4837 continue;
4838 }
4839 }
4840 if ((flags & IO_RETURN_ON_THROTTLE)) {
4841 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4842 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4843 /*
4844 * we're in the throttle window and at least 1 I/O
4845 * has already been issued by a throttleable thread
4846 * in this window, so return with EAGAIN to indicate
4847 * to the FS issuing the cluster_read call that it
4848 * should now throttle after dropping any locks
4849 */
4850 throttle_info_update_by_mount(vp->v_mount);
4851
4852 io_throttled = TRUE;
4853 goto wait_for_dreads;
4854 }
4855 }
4856 }
4857 if (io_size > max_rd_size) {
4858 io_size = max_rd_size;
4859 }
4860
4861 iov_base = uio_curriovbase(uio);
4862
4863 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4864 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4865
4866 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4867 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4868
4869 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
4870 no_zero_fill = 1;
4871 } else {
4872 no_zero_fill = 0;
4873 }
4874
4875 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4876 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4877 pages_in_pl = 0;
4878 upl_size = upl_needed_size;
4879 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4880 if (no_zero_fill) {
4881 upl_flags |= UPL_NOZEROFILL;
4882 }
4883 if (force_data_sync) {
4884 upl_flags |= UPL_FORCE_DATA_SYNC;
4885 }
4886
4887 kret = vm_map_create_upl(map,
4888 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4889 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
4890
4891 if (kret != KERN_SUCCESS) {
4892 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4893 (int)upl_offset, upl_size, io_size, kret, 0);
4894 /*
4895 * failed to get pagelist
4896 *
4897 * we may have already spun some portion of this request
4898 * off as async requests... we need to wait for the I/O
4899 * to complete before returning
4900 */
4901 goto wait_for_dreads;
4902 }
4903 pages_in_pl = upl_size / PAGE_SIZE;
4904 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4905
4906 for (i = 0; i < pages_in_pl; i++) {
4907 if (!upl_page_present(pl, i)) {
4908 break;
4909 }
4910 }
4911 if (i == pages_in_pl) {
4912 break;
4913 }
4914
4915 ubc_upl_abort(upl, 0);
4916 }
4917 if (force_data_sync >= 3) {
4918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4919 (int)upl_offset, upl_size, io_size, kret, 0);
4920
4921 goto wait_for_dreads;
4922 }
4923 /*
4924 * Consider the possibility that upl_size wasn't satisfied.
4925 */
4926 if (upl_size < upl_needed_size) {
4927 if (upl_size && upl_offset == 0) {
4928 io_size = upl_size;
4929 } else {
4930 io_size = 0;
4931 }
4932 }
4933 if (io_size == 0) {
4934 ubc_upl_abort(upl, 0);
4935 goto wait_for_dreads;
4936 }
4937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4938 (int)upl_offset, upl_size, io_size, kret, 0);
4939
4940 if (useVectorUPL) {
4941 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4942 if (end_off) {
4943 issueVectorUPL = 1;
4944 }
4945 /*
4946 * After this point, if we are using a vector UPL, then
4947 * either all the UPL elements end on a page boundary OR
4948 * this UPL is the last element because it does not end
4949 * on a page boundary.
4950 */
4951 }
4952
4953 /*
4954 * request asynchronously so that we can overlap
4955 * the preparation of the next I/O
4956 * if there are already too many outstanding reads
4957 * wait until some have completed before issuing the next read
4958 */
4959 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4960
4961 if (iostate.io_error) {
4962 /*
4963 * one of the earlier reads we issued ran into a hard error
4964 * don't issue any more reads, cleanup the UPL
4965 * that was just created but not used, then
4966 * go wait for any other reads to complete before
4967 * returning the error to the caller
4968 */
4969 ubc_upl_abort(upl, 0);
4970
4971 goto wait_for_dreads;
4972 }
4973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4974 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4975
4976 if (!useVectorUPL) {
4977 if (no_zero_fill) {
4978 io_flag &= ~CL_PRESERVE;
4979 } else {
4980 io_flag |= CL_PRESERVE;
4981 }
4982
4983 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4984 } else {
4985 if (!vector_upl_index) {
4986 vector_upl = vector_upl_create(upl_offset);
4987 v_upl_uio_offset = uio->uio_offset;
4988 vector_upl_offset = upl_offset;
4989 }
4990
4991 vector_upl_set_subupl(vector_upl, upl, upl_size);
4992 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4993 vector_upl_index++;
4994 vector_upl_size += upl_size;
4995 vector_upl_iosize += io_size;
4996
4997 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4998 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4999 reset_vector_run_state();
5000 }
5001 }
5002 last_iov_base = iov_base + io_size;
5003
5004 if (lock) {
5005 // We don't need to wait for the I/O to complete
5006 cluster_unlock_direct_read(lock);
5007 lock = NULL;
5008 }
5009
5010 /*
5011 * update the uio structure
5012 */
5013 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5014 uio_update(uio, (user_size_t)max_io_size);
5015 } else {
5016 uio_update(uio, (user_size_t)io_size);
5017 }
5018
5019 io_req_size -= io_size;
5020
5021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5022 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5023 } /* end while */
5024
5025 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5026 retval = cluster_io_type(uio, read_type, read_length, 0);
5027
5028 if (retval == 0 && *read_type == IO_DIRECT) {
5029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5030 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5031
5032 goto next_dread;
5033 }
5034 }
5035
5036 wait_for_dreads:
5037
5038 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5039 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5040 reset_vector_run_state();
5041 }
5042
5043 // We don't need to wait for the I/O to complete
5044 if (lock) {
5045 cluster_unlock_direct_read(lock);
5046 }
5047
5048 /*
5049 * make sure all async reads that are part of this stream
5050 * have completed before we return
5051 */
5052 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5053
5054 if (iostate.io_error) {
5055 retval = iostate.io_error;
5056 }
5057
5058 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5059
5060 if (io_throttled == TRUE && retval == 0) {
5061 retval = EAGAIN;
5062 }
5063
5064 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
5065 /*
5066 * This is specifically done for pmap accounting purposes.
5067 * vm_pre_fault() will call vm_fault() to enter the page into
5068 * the pmap if there isn't _a_ physical page for that VA already.
5069 */
5070 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
5071 }
5072
5073 if (io_req_size && retval == 0) {
5074 /*
5075 * we couldn't handle the tail of this request in DIRECT mode
5076 * so fire it through the copy path
5077 */
5078 if (flags & IO_ENCRYPTED) {
5079 /*
5080 * We cannot fall back to the copy path for encrypted I/O. If this
5081 * happens, there is something wrong with the user buffer passed
5082 * down.
5083 */
5084 retval = EFAULT;
5085 } else {
5086 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5087 }
5088
5089 *read_type = IO_UNKNOWN;
5090 }
5091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5092 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5093
5094 return retval;
5095 }
5096
5097
5098 static int
5099 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5100 int (*callback)(buf_t, void *), void *callback_arg, int flags)
5101 {
5102 upl_page_info_t *pl;
5103 upl_t upl[MAX_VECTS];
5104 vm_offset_t upl_offset;
5105 addr64_t dst_paddr = 0;
5106 user_addr_t iov_base;
5107 off_t max_size;
5108 upl_size_t upl_size;
5109 vm_size_t upl_needed_size;
5110 mach_msg_type_number_t pages_in_pl;
5111 upl_control_flags_t upl_flags;
5112 kern_return_t kret;
5113 struct clios iostate;
5114 int error = 0;
5115 int cur_upl = 0;
5116 int num_upl = 0;
5117 int n;
5118 u_int32_t xsize;
5119 u_int32_t io_size;
5120 u_int32_t devblocksize;
5121 u_int32_t mem_alignment_mask;
5122 u_int32_t tail_size = 0;
5123 int bflag;
5124
5125 if (flags & IO_PASSIVE) {
5126 bflag = CL_PASSIVE;
5127 } else {
5128 bflag = 0;
5129 }
5130
5131 if (flags & IO_NOCACHE) {
5132 bflag |= CL_NOCACHE;
5133 }
5134
5135 /*
5136 * When we enter this routine, we know
5137 * -- the read_length will not exceed the current iov_len
5138 * -- the target address is physically contiguous for read_length
5139 */
5140 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
5141
5142 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5143 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5144
5145 iostate.io_completed = 0;
5146 iostate.io_issued = 0;
5147 iostate.io_error = 0;
5148 iostate.io_wanted = 0;
5149
5150 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
5151
5152 next_cread:
5153 io_size = *read_length;
5154
5155 max_size = filesize - uio->uio_offset;
5156
5157 if (io_size > max_size) {
5158 io_size = max_size;
5159 }
5160
5161 iov_base = uio_curriovbase(uio);
5162
5163 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5164 upl_needed_size = upl_offset + io_size;
5165
5166 pages_in_pl = 0;
5167 upl_size = upl_needed_size;
5168 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5169
5170
5171 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5172 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5173
5174 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5175 kret = vm_map_get_upl(map,
5176 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5177 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
5178
5179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5180 (int)upl_offset, upl_size, io_size, kret, 0);
5181
5182 if (kret != KERN_SUCCESS) {
5183 /*
5184 * failed to get pagelist
5185 */
5186 error = EINVAL;
5187 goto wait_for_creads;
5188 }
5189 num_upl++;
5190
5191 if (upl_size < upl_needed_size) {
5192 /*
5193 * The upl_size wasn't satisfied.
5194 */
5195 error = EINVAL;
5196 goto wait_for_creads;
5197 }
5198 pl = ubc_upl_pageinfo(upl[cur_upl]);
5199
5200 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
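/*
 * illustrative sketch: because the target buffer is known to be physically
 * contiguous, a single physical address covers the whole transfer...
 * assuming PAGE_SHIFT == 12, a first physical page number of 0x12345 and
 * an upl_offset of 0x200 give
 *
 *	dst_paddr = ((addr64_t)0x12345 << 12) + 0x200 == 0x12345200
 *
 * and the loop below simply advances dst_paddr by each chunk it issues
 */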
5201
5202 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5203 u_int32_t head_size;
5204
5205 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5206
5207 if (head_size > io_size) {
5208 head_size = io_size;
5209 }
5210
5211 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
5212
5213 if (error) {
5214 goto wait_for_creads;
5215 }
5216
5217 upl_offset += head_size;
5218 dst_paddr += head_size;
5219 io_size -= head_size;
5220
5221 iov_base += head_size;
5222 }
5223 if ((u_int32_t)iov_base & mem_alignment_mask) {
5224 /*
5225 * request isn't set up on a memory boundary
5226 * that the underlying DMA engine can handle...
5227 * return an error instead of going through
5228 * the slow copy path since the intent of this
5229 * path is direct I/O to device memory
5230 */
5231 error = EINVAL;
5232 goto wait_for_creads;
5233 }
5234
5235 tail_size = io_size & (devblocksize - 1);
5236
5237 io_size -= tail_size;
5238
5239 while (io_size && error == 0) {
5240 if (io_size > MAX_IO_CONTIG_SIZE) {
5241 xsize = MAX_IO_CONTIG_SIZE;
5242 } else {
5243 xsize = io_size;
5244 }
5245 /*
5246 * request asynchronously so that we can overlap
5247 * the preparation of the next I/O... we'll do
5248 * the commit after all the I/O has completed
5249 * since it's all issued against the same UPL
5250 * if there are already too many outstanding reads
5251 * wait until some have completed before issuing the next
5252 */
5253 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5254
5255 if (iostate.io_error) {
5256 /*
5257 * one of the earlier reads we issued ran into a hard error
5258 * don't issue any more reads...
5259 * go wait for any other reads to complete before
5260 * returning the error to the caller
5261 */
5262 goto wait_for_creads;
5263 }
5264 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5265 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5266 (buf_t)NULL, &iostate, callback, callback_arg);
5267 /*
5268 * The cluster_io read was issued successfully,
5269 * update the uio structure
5270 */
5271 if (error == 0) {
5272 uio_update(uio, (user_size_t)xsize);
5273
5274 dst_paddr += xsize;
5275 upl_offset += xsize;
5276 io_size -= xsize;
5277 }
5278 }
5279 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5280 error = cluster_io_type(uio, read_type, read_length, 0);
5281
5282 if (error == 0 && *read_type == IO_CONTIG) {
5283 cur_upl++;
5284 goto next_cread;
5285 }
5286 } else {
5287 *read_type = IO_UNKNOWN;
5288 }
5289
5290 wait_for_creads:
5291 /*
5292 * make sure all async reads that are part of this stream
5293 * have completed before we proceed
5294 */
5295 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5296
5297 if (iostate.io_error) {
5298 error = iostate.io_error;
5299 }
5300
5301 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5302
5303 if (error == 0 && tail_size) {
5304 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5305 }
5306
5307 for (n = 0; n < num_upl; n++) {
5308 /*
5309 * just release our hold on each physically contiguous
5310 * region without changing any state
5311 */
5312 ubc_upl_abort(upl[n], 0);
5313 }
5314
5315 return error;
5316 }
5317
5318
5319 static int
5320 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5321 {
5322 user_size_t iov_len;
5323 user_addr_t iov_base = 0;
5324 upl_t upl;
5325 upl_size_t upl_size;
5326 upl_control_flags_t upl_flags;
5327 int retval = 0;
5328
5329 /*
5330 * skip over any empty vectors
5331 */
5332 uio_update(uio, (user_size_t)0);
5333
5334 iov_len = uio_curriovlen(uio);
5335
5336 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5337
5338 if (iov_len) {
5339 iov_base = uio_curriovbase(uio);
5340 /*
5341 * make sure the size of the vector isn't too big...
5342 * internally, we want to handle all of the I/O in
5343 * chunk sizes that fit in a 32 bit int
5344 */
5345 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5346 upl_size = MAX_IO_REQUEST_SIZE;
5347 } else {
5348 upl_size = (u_int32_t)iov_len;
5349 }
5350
5351 upl_flags = UPL_QUERY_OBJECT_TYPE;
5352
5353 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5354 if ((vm_map_get_upl(map,
5355 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5356 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5357 /*
5358 * the user app must have passed in an invalid address
5359 */
5360 retval = EFAULT;
5361 }
5362 if (upl_size == 0) {
5363 retval = EFAULT;
5364 }
5365
5366 *io_length = upl_size;
5367
5368 if (upl_flags & UPL_PHYS_CONTIG) {
5369 *io_type = IO_CONTIG;
5370 } else if (iov_len >= min_length) {
5371 *io_type = IO_DIRECT;
5372 } else {
5373 *io_type = IO_COPY;
5374 }
5375 } else {
5376 /*
5377 * nothing left to do for this uio
5378 */
5379 *io_length = 0;
5380 *io_type = IO_UNKNOWN;
5381 }
5382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5383
5384 return retval;
5385 }
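/*
 * illustrative sketch: the classification above reduces to, roughly,
 *
 *	if the current iovec is empty          -> IO_UNKNOWN, length 0
 *	else if the buffer is physically
 *	  contiguous (UPL_PHYS_CONTIG)         -> IO_CONTIG
 *	else if iov_len >= min_length          -> IO_DIRECT
 *	else                                   -> IO_COPY
 *
 * the direct and contig paths call back into this routine between iovecs,
 * so a mixed uio can switch strategies mid-request
 */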
5386
5387
5388 /*
5389 * generate advisory I/O's in the largest chunks possible
5390 * the completed pages will be released into the VM cache
5391 */
5392 int
5393 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5394 {
5395 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5396 }
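/*
 * illustrative sketch: a filesystem that expects a sequential scan might
 * warm the cache with something like the call below (file_size and
 * next_offset are placeholders and the 1MB window is arbitrary)
 *
 *	(void) advisory_read(vp, file_size, next_offset,
 *	    (int)MIN(1 << 20, file_size - next_offset));
 *
 * pages already resident are skipped and the rest are brought in with low
 * priority (CL_PASSIVE) I/O and released into the VM cache on completion
 */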
5397
5398 int
5399 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5400 {
5401 upl_page_info_t *pl;
5402 upl_t upl;
5403 vm_offset_t upl_offset;
5404 int upl_size;
5405 off_t upl_f_offset;
5406 int start_offset;
5407 int start_pg;
5408 int last_pg;
5409 int pages_in_upl;
5410 off_t max_size;
5411 int io_size;
5412 kern_return_t kret;
5413 int retval = 0;
5414 int issued_io;
5415 int skip_range;
5416 uint32_t max_io_size;
5417
5418
5419 if (!UBCINFOEXISTS(vp)) {
5420 return EINVAL;
5421 }
5422
5423 if (resid < 0) {
5424 return EINVAL;
5425 }
5426
5427 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5428
5429 #if CONFIG_EMBEDDED
5430 if (max_io_size > speculative_prefetch_max_iosize) {
5431 max_io_size = speculative_prefetch_max_iosize;
5432 }
5433 #else
5434 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5435 if (max_io_size > speculative_prefetch_max_iosize) {
5436 max_io_size = speculative_prefetch_max_iosize;
5437 }
5438 }
5439 #endif
5440
5441 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5442 (int)f_offset, resid, (int)filesize, 0, 0);
5443
5444 while (resid && f_offset < filesize && retval == 0) {
5445 /*
5446 * compute the size of the upl needed to encompass
5447 * the requested read... limit each call to cluster_io
5448 * to the maximum UPL size... cluster_io will clip if
5449 * this exceeds the maximum io_size for the device...
5450 * make sure to account for
5451 * a starting offset that's not page aligned
5452 */
5453 start_offset = (int)(f_offset & PAGE_MASK_64);
5454 upl_f_offset = f_offset - (off_t)start_offset;
5455 max_size = filesize - f_offset;
5456
5457 if (resid < max_size) {
5458 io_size = resid;
5459 } else {
5460 io_size = max_size;
5461 }
5462
5463 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5464 if ((uint32_t)upl_size > max_io_size) {
5465 upl_size = max_io_size;
5466 }
5467
5468 skip_range = 0;
5469 /*
5470 * return the number of contiguously present pages in the cache
5471 * starting at upl_f_offset within the file
5472 */
5473 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5474
5475 if (skip_range) {
5476 /*
5477 * skip over pages already present in the cache
5478 */
5479 io_size = skip_range - start_offset;
5480
5481 f_offset += io_size;
5482 resid -= io_size;
5483
5484 if (skip_range == upl_size) {
5485 continue;
5486 }
5487 /*
5488 * have to issue some real I/O
5489 * at this point, we know it's starting on a page boundary
5490 * because we've skipped over at least the first page in the request
5491 */
5492 start_offset = 0;
5493 upl_f_offset += skip_range;
5494 upl_size -= skip_range;
5495 }
5496 pages_in_upl = upl_size / PAGE_SIZE;
5497
5498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5499 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5500
5501 kret = ubc_create_upl_kernel(vp,
5502 upl_f_offset,
5503 upl_size,
5504 &upl,
5505 &pl,
5506 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5507 VM_KERN_MEMORY_FILE);
5508 if (kret != KERN_SUCCESS) {
5509 return retval;
5510 }
5511 issued_io = 0;
5512
5513 /*
5514 * before we start marching forward, we must make sure we end on
5515 * a present page, otherwise we will be working with a freed
5516 * upl
5517 */
5518 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5519 if (upl_page_present(pl, last_pg)) {
5520 break;
5521 }
5522 }
5523 pages_in_upl = last_pg + 1;
5524
5525
5526 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5527 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5528
5529
5530 for (last_pg = 0; last_pg < pages_in_upl;) {
5531 /*
5532 * scan from the beginning of the upl looking for the first
5533 * page that is present.... this will become the first page in
5534 * the request we're going to make to 'cluster_io'... if all
5535 * of the pages are absent, we won't call through to 'cluster_io'
5536 */
5537 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5538 if (upl_page_present(pl, start_pg)) {
5539 break;
5540 }
5541 }
5542
5543 /*
5544 * scan from the starting present page looking for an absent
5545 * page before the end of the upl is reached, if we
5546 * find one, then it will terminate the range of pages being
5547 * presented to 'cluster_io'
5548 */
5549 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5550 if (!upl_page_present(pl, last_pg)) {
5551 break;
5552 }
5553 }
5554
5555 if (last_pg > start_pg) {
5556 /*
5557 * we found a range of pages that must be filled
5558 * if the last page in this range is the last page of the file
5559 * we may have to clip the size of it to keep from reading past
5560 * the end of the last physical block associated with the file
5561 */
5562 upl_offset = start_pg * PAGE_SIZE;
5563 io_size = (last_pg - start_pg) * PAGE_SIZE;
5564
5565 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5566 io_size = filesize - (upl_f_offset + upl_offset);
5567 }
5568
5569 /*
5570 * issue an asynchronous read to cluster_io
5571 */
5572 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5573 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5574
5575 issued_io = 1;
5576 }
5577 }
5578 if (issued_io == 0) {
5579 ubc_upl_abort(upl, 0);
5580 }
5581
5582 io_size = upl_size - start_offset;
5583
5584 if (io_size > resid) {
5585 io_size = resid;
5586 }
5587 f_offset += io_size;
5588 resid -= io_size;
5589 }
5590
5591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5592 (int)f_offset, resid, retval, 0, 0);
5593
5594 return retval;
5595 }
5596
5597
5598 int
5599 cluster_push(vnode_t vp, int flags)
5600 {
5601 return cluster_push_ext(vp, flags, NULL, NULL);
5602 }
5603
5604
5605 int
5606 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5607 {
5608 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5609 }
5610
5611 /* write errors via err, but return the number of clusters written */
5612 int
5613 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5614 {
5615 int retval;
5616 int my_sparse_wait = 0;
5617 struct cl_writebehind *wbp;
5618 int local_err = 0;
5619
5620 if (err) {
5621 *err = 0;
5622 }
5623
5624 if (!UBCINFOEXISTS(vp)) {
5625 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5626 return 0;
5627 }
5628 /* return if deferred write is set */
5629 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5630 return 0;
5631 }
5632 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5633 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5634 return 0;
5635 }
5636 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5637 lck_mtx_unlock(&wbp->cl_lockw);
5638
5639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5640 return 0;
5641 }
5642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5643 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5644
5645 /*
5646 * if we have an fsync in progress, we don't want to allow any additional
5647 * sync/fsync/close(s) to occur until it finishes.
5648 * note that it's possible for writes to continue to occur to this file
5649 * while we're waiting and also once the fsync starts to clean if we're
5650 * in the sparse map case
5651 */
5652 while (wbp->cl_sparse_wait) {
5653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5654
5655 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5656
5657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5658 }
5659 if (flags & IO_SYNC) {
5660 my_sparse_wait = 1;
5661 wbp->cl_sparse_wait = 1;
5662
5663 /*
5664 * this is an fsync (or equivalent)... we must wait for any existing async
5665 * cleaning operations to complete before we evaluate the current state
5666 * and finish cleaning... this ensures that all writes issued before this
5667 * fsync actually get cleaned to the disk before this fsync returns
5668 */
5669 while (wbp->cl_sparse_pushes) {
5670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5671
5672 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5673
5674 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5675 }
5676 }
5677 if (wbp->cl_scmap) {
5678 void *scmap;
5679
5680 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5681 scmap = wbp->cl_scmap;
5682 wbp->cl_scmap = NULL;
5683
5684 wbp->cl_sparse_pushes++;
5685
5686 lck_mtx_unlock(&wbp->cl_lockw);
5687
5688 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5689
5690 lck_mtx_lock(&wbp->cl_lockw);
5691
5692 wbp->cl_sparse_pushes--;
5693
5694 if (retval) {
5695 if (wbp->cl_scmap != NULL) {
5696 panic("cluster_push_err: Expected NULL cl_scmap\n");
5697 }
5698
5699 wbp->cl_scmap = scmap;
5700 }
5701
5702 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5703 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5704 }
5705 } else {
5706 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5707 }
5708
5709 local_err = retval;
5710
5711 if (err) {
5712 *err = retval;
5713 }
5714 retval = 1;
5715 } else {
5716 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5717 if (err) {
5718 *err = local_err;
5719 }
5720 }
5721 lck_mtx_unlock(&wbp->cl_lockw);
5722
5723 if (flags & IO_SYNC) {
5724 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5725 }
5726
5727 if (my_sparse_wait) {
5728 /*
5729 * I'm the owner of the serialization token
5730 * clear it and wakeup anyone that is waiting
5731 * for me to finish
5732 */
5733 lck_mtx_lock(&wbp->cl_lockw);
5734
5735 wbp->cl_sparse_wait = 0;
5736 wakeup((caddr_t)&wbp->cl_sparse_wait);
5737
5738 lck_mtx_unlock(&wbp->cl_lockw);
5739 }
5740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5741 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5742
5743 return retval;
5744 }
5745
5746
5747 __private_extern__ void
5748 cluster_release(struct ubc_info *ubc)
5749 {
5750 struct cl_writebehind *wbp;
5751 struct cl_readahead *rap;
5752
5753 if ((wbp = ubc->cl_wbehind)) {
5754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5755
5756 if (wbp->cl_scmap) {
5757 vfs_drt_control(&(wbp->cl_scmap), 0);
5758 }
5759 } else {
5760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5761 }
5762
5763 rap = ubc->cl_rahead;
5764
5765 if (wbp != NULL) {
5766 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5767 FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
5768 }
5769 if ((rap = ubc->cl_rahead)) {
5770 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5771 FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
5772 }
5773 ubc->cl_rahead = NULL;
5774 ubc->cl_wbehind = NULL;
5775
5776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5777 }
5778
5779
5780 static int
5781 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
5782 {
5783 int cl_index;
5784 int cl_index1;
5785 int min_index;
5786 int cl_len;
5787 int cl_pushed = 0;
5788 struct cl_wextent l_clusters[MAX_CLUSTERS];
5789 u_int max_cluster_pgcount;
5790 int error = 0;
5791
5792 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5793 /*
5794 * the write behind context exists and has
5795 * already been locked...
5796 */
5797 if (wbp->cl_number == 0) {
5798 /*
5799 * no clusters to push
5800 * return number of empty slots
5801 */
5802 return MAX_CLUSTERS;
5803 }
5804
5805 /*
5806 * make a local 'sorted' copy of the clusters
5807 * and clear wbp->cl_number so that new clusters can
5808 * be developed
5809 */
5810 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5811 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5812 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
5813 continue;
5814 }
5815 if (min_index == -1) {
5816 min_index = cl_index1;
5817 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
5818 min_index = cl_index1;
5819 }
5820 }
5821 if (min_index == -1) {
5822 break;
5823 }
5824
5825 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5826 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5827 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5828
5829 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5830 }
5831 wbp->cl_number = 0;
5832
5833 cl_len = cl_index;
5834
5835 /* skip switching to the sparse cluster mechanism if on diskimage */
5836 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
5837 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
5838 int i;
5839
5840 /*
5841 * determine if we appear to be writing the file sequentially
5842 * if not, by returning without having pushed any clusters
5843 * we will cause this vnode to be pushed into the sparse cluster mechanism
5844 * used for managing more random I/O patterns
5845 *
5846 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5847 * that's why we're in try_push with PUSH_DELAY...
5848 *
5849 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5850 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5851 * so we can just make a simple pass through, up to, but not including the last one...
5852 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5853 * are sequential
5854 *
5855 * we let the last one be partial as long as it was adjacent to the previous one...
5856 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5857 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5858 */
5859 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5860 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
5861 goto dont_try;
5862 }
5863 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
5864 goto dont_try;
5865 }
5866 }
5867 }
5868 if (vm_initiated == TRUE) {
5869 lck_mtx_unlock(&wbp->cl_lockw);
5870 }
5871
5872 for (cl_index = 0; cl_index < cl_len; cl_index++) {
5873 int flags;
5874 struct cl_extent cl;
5875 int retval;
5876
5877 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
5878
5879 /*
5880 * try to push each cluster in turn...
5881 */
5882 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
5883 flags |= IO_NOCACHE;
5884 }
5885
5886 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
5887 flags |= IO_PASSIVE;
5888 }
5889
5890 if (push_flag & PUSH_SYNC) {
5891 flags |= IO_SYNC;
5892 }
5893
5894 cl.b_addr = l_clusters[cl_index].b_addr;
5895 cl.e_addr = l_clusters[cl_index].e_addr;
5896
5897 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
5898
5899 if (retval == 0) {
5900 cl_pushed++;
5901
5902 l_clusters[cl_index].b_addr = 0;
5903 l_clusters[cl_index].e_addr = 0;
5904 } else if (error == 0) {
5905 error = retval;
5906 }
5907
5908 if (!(push_flag & PUSH_ALL)) {
5909 break;
5910 }
5911 }
5912 if (vm_initiated == TRUE) {
5913 lck_mtx_lock(&wbp->cl_lockw);
5914 }
5915
5916 if (err) {
5917 *err = error;
5918 }
5919
5920 dont_try:
5921 if (cl_len > cl_pushed) {
5922 /*
5923 * we didn't push all of the clusters, so
5924 * lets try to merge them back in to the vnode
5925 */
5926 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5927 /*
5928 * we picked up some new clusters while we were trying to
5929 * push the old ones... this can happen because I've dropped
5930 * the vnode lock... the sum of the
5931 * leftovers plus the new cluster count exceeds our ability
5932 * to represent them, so switch to the sparse cluster mechanism
5933 *
5934 * collect the active public clusters...
5935 */
5936 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5937
5938 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5939 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5940 continue;
5941 }
5942 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5943 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5944 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5945
5946 cl_index1++;
5947 }
5948 /*
5949 * update the cluster count
5950 */
5951 wbp->cl_number = cl_index1;
5952
5953 /*
5954 * and collect the original clusters that were moved into the
5955 * local storage for sorting purposes
5956 */
5957 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5958 } else {
5959 /*
5960 * we've got room to merge the leftovers back in
5961 * just append them starting at the next 'hole'
5962 * represented by wbp->cl_number
5963 */
5964 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5965 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5966 continue;
5967 }
5968
5969 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5970 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5971 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5972
5973 cl_index1++;
5974 }
5975 /*
5976 * update the cluster count
5977 */
5978 wbp->cl_number = cl_index1;
5979 }
5980 }
5981 return MAX_CLUSTERS - wbp->cl_number;
5982 }
5983
5984
5985
5986 static int
5987 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
5988 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
5989 {
5990 upl_page_info_t *pl;
5991 upl_t upl;
5992 vm_offset_t upl_offset;
5993 int upl_size;
5994 off_t upl_f_offset;
5995 int pages_in_upl;
5996 int start_pg;
5997 int last_pg;
5998 int io_size;
5999 int io_flags;
6000 int upl_flags;
6001 int bflag;
6002 int size;
6003 int error = 0;
6004 int retval;
6005 kern_return_t kret;
6006
6007 if (flags & IO_PASSIVE) {
6008 bflag = CL_PASSIVE;
6009 } else {
6010 bflag = 0;
6011 }
6012
6013 if (flags & IO_SKIP_ENCRYPTION) {
6014 bflag |= CL_ENCRYPTED;
6015 }
6016
6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6018 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6019
6020 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6022
6023 return 0;
6024 }
6025 upl_size = pages_in_upl * PAGE_SIZE;
6026 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6027
6028 if (upl_f_offset + upl_size >= EOF) {
6029 if (upl_f_offset >= EOF) {
6030 /*
6031 * must have truncated the file and missed
6032 * clearing a dangling cluster (i.e. it's completely
6033 * beyond the new EOF)
6034 */
6035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6036
6037 return 0;
6038 }
6039 size = EOF - upl_f_offset;
6040
6041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6042 pages_in_upl = upl_size / PAGE_SIZE;
6043 } else {
6044 size = upl_size;
6045 }
6046
6047
6048 if (vm_initiated) {
6049 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6050 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6051
6052 return error;
6053 }
6054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6055
6056 /*
6057 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6058 *
6059 * - only pages that are currently dirty are returned... these are the ones we need to clean
6060 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6061 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6062 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6063 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6064 *
6065 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6066 */
6067
6068 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6069 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6070 } else {
6071 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6072 }
6073
6074 kret = ubc_create_upl_kernel(vp,
6075 upl_f_offset,
6076 upl_size,
6077 &upl,
6078 &pl,
6079 upl_flags,
6080 VM_KERN_MEMORY_FILE);
6081 if (kret != KERN_SUCCESS) {
6082 panic("cluster_push: failed to get pagelist");
6083 }
6084
6085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6086
6087 /*
6088 * since we only asked for the dirty pages back
6089 * it's possible that we may only get a few or even none, so...
6090 * before we start marching forward, we must make sure we know
6091 * where the last present page is in the UPL, otherwise we could
6092 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6093 * employed by commit_range and abort_range.
6094 */
6095 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6096 if (upl_page_present(pl, last_pg)) {
6097 break;
6098 }
6099 }
6100 pages_in_upl = last_pg + 1;
6101
6102 if (pages_in_upl == 0) {
6103 ubc_upl_abort(upl, 0);
6104
6105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6106 return 0;
6107 }
6108
6109 for (last_pg = 0; last_pg < pages_in_upl;) {
6110 /*
6111 * find the next dirty page in the UPL
6112 * this will become the first page in the
6113 * next I/O to generate
6114 */
6115 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6116 if (upl_dirty_page(pl, start_pg)) {
6117 break;
6118 }
6119 if (upl_page_present(pl, start_pg)) {
6120 /*
6121 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6122 * just release these unchanged since we're not going
6123 * to steal them or change their state
6124 */
6125 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6126 }
6127 }
6128 if (start_pg >= pages_in_upl) {
6129 /*
6130 * done... no more dirty pages to push
6131 */
6132 break;
6133 }
6134 if (start_pg > last_pg) {
6135 /*
6136 * skipped over some non-dirty pages
6137 */
6138 size -= ((start_pg - last_pg) * PAGE_SIZE);
6139 }
6140
6141 /*
6142 * find a range of dirty pages to write
6143 */
6144 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6145 if (!upl_dirty_page(pl, last_pg)) {
6146 break;
6147 }
6148 }
6149 upl_offset = start_pg * PAGE_SIZE;
6150
6151 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
6152
6153 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6154
6155 if (!(flags & IO_SYNC)) {
6156 io_flags |= CL_ASYNC;
6157 }
6158
6159 if (flags & IO_CLOSE) {
6160 io_flags |= CL_CLOSE;
6161 }
6162
6163 if (flags & IO_NOCACHE) {
6164 io_flags |= CL_NOCACHE;
6165 }
6166
6167 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
6168 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6169
6170 if (error == 0 && retval) {
6171 error = retval;
6172 }
6173
6174 size -= io_size;
6175 }
6176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6177
6178 return error;
6179 }
6180
6181
6182 /*
6183 * sparse_cluster_switch is called with the write behind lock held
6184 */
6185 static int
6186 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6187 {
6188 int cl_index;
6189 int error = 0;
6190
6191 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6192
6193 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6194 int flags;
6195 struct cl_extent cl;
6196
6197 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6198 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6199 if (flags & UPL_POP_DIRTY) {
6200 cl.e_addr = cl.b_addr + 1;
6201
6202 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6203
6204 if (error) {
6205 break;
6206 }
6207 }
6208 }
6209 }
6210 }
6211 wbp->cl_number -= cl_index;
6212
6213 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6214
6215 return error;
6216 }
6217
6218
6219 /*
6220 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6221 * still associated with the write-behind context... however, if the scmap has been disassociated
6222 * from the write-behind context (the cluster_push case), the wb lock is not held
6223 */
6224 static int
6225 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6226 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6227 {
6228 struct cl_extent cl;
6229 off_t offset;
6230 u_int length;
6231 void *l_scmap;
6232 int error = 0;
6233
6234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6235
6236 if (push_flag & PUSH_ALL) {
6237 vfs_drt_control(scmap, 1);
6238 }
6239
6240 l_scmap = *scmap;
6241
6242 for (;;) {
6243 int retval;
6244
6245 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
6246 break;
6247 }
6248
6249 if (vm_initiated == TRUE) {
6250 lck_mtx_unlock(&wbp->cl_lockw);
6251 }
6252
6253 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6254 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6255
6256 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6257 if (error == 0 && retval) {
6258 error = retval;
6259 }
6260
6261 if (vm_initiated == TRUE) {
6262 lck_mtx_lock(&wbp->cl_lockw);
6263
6264 if (*scmap != l_scmap) {
6265 break;
6266 }
6267 }
6268
6269 if (error) {
6270 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6271 panic("Failed to restore dirty state on failure\n");
6272 }
6273
6274 break;
6275 }
6276
6277 if (!(push_flag & PUSH_ALL)) {
6278 break;
6279 }
6280 }
6281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6282
6283 return error;
6284 }
6285
6286
6287 /*
6288 * sparse_cluster_add is called with the write behind lock held
6289 */
6290 static int
6291 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6292 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6293 {
6294 u_int new_dirty;
6295 u_int length;
6296 off_t offset;
6297 int error = 0;
6298 int push_flag = 0; /* Is this a valid value? */
6299
6300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6301
6302 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6303 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6304
6305 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6306 /*
6307 * no room left in the map
6308 * only a partial update was done
6309 * push out some pages and try again
6310 */
6311
6312 if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6313 push_flag = 0;
6314 }
6315
6316 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
6317
6318 if (error) {
6319 break;
6320 }
6321
6322 offset += (new_dirty * PAGE_SIZE_64);
6323 length -= (new_dirty * PAGE_SIZE);
6324 }
6325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6326
6327 return error;
6328 }
6329
6330
6331 static int
6332 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6333 {
6334 upl_page_info_t *pl;
6335 upl_t upl;
6336 addr64_t ubc_paddr;
6337 kern_return_t kret;
6338 int error = 0;
6339 int did_read = 0;
6340 int abort_flags;
6341 int upl_flags;
6342 int bflag;
6343
6344 if (flags & IO_PASSIVE) {
6345 bflag = CL_PASSIVE;
6346 } else {
6347 bflag = 0;
6348 }
6349
6350 if (flags & IO_NOCACHE) {
6351 bflag |= CL_NOCACHE;
6352 }
6353
6354 upl_flags = UPL_SET_LITE;
6355
6356 if (!(flags & CL_READ)) {
6357 /*
6358 * "write" operation: let the UPL subsystem know
6359 * that we intend to modify the buffer cache pages
6360 * we're gathering.
6361 */
6362 upl_flags |= UPL_WILL_MODIFY;
6363 } else {
6364 /*
6365 * indicate that there is no need to pull the
6366 * mapping for this page... we're only going
6367 * to read from it, not modify it.
6368 */
6369 upl_flags |= UPL_FILE_IO;
6370 }
6371 kret = ubc_create_upl_kernel(vp,
6372 uio->uio_offset & ~PAGE_MASK_64,
6373 PAGE_SIZE,
6374 &upl,
6375 &pl,
6376 upl_flags,
6377 VM_KERN_MEMORY_FILE);
6378
6379 if (kret != KERN_SUCCESS) {
6380 return EINVAL;
6381 }
6382
6383 if (!upl_valid_page(pl, 0)) {
6384 /*
6385 * issue a synchronous read to cluster_io
6386 */
6387 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6388 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6389 if (error) {
6390 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6391
6392 return error;
6393 }
6394 did_read = 1;
6395 }
6396 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6397
6398 /*
6399 * NOTE: There is no prototype for the following in BSD. It, and the definitions
6400 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
6401 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6402 * way to do so without exporting them to kexts as well.
6403 */
6404 if (flags & CL_READ) {
6405 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6406 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6407 } else {
6408 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6409 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
6410 }
6411 if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6412 /*
6413 * issue a synchronous write to cluster_io
6414 */
6415 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6416 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6417 }
6418 if (error == 0) {
6419 uio_update(uio, (user_size_t)xsize);
6420 }
6421
6422 if (did_read) {
6423 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6424 } else {
6425 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6426 }
6427
6428 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6429
6430 return error;
6431 }
6432
6433 int
6434 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6435 {
6436 int pg_offset;
6437 int pg_index;
6438 int csize;
6439 int segflg;
6440 int retval = 0;
6441 int xsize;
6442 upl_page_info_t *pl;
6443 int dirty_count;
6444
6445 xsize = *io_resid;
6446
6447 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6448 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6449
6450 segflg = uio->uio_segflg;
6451
6452 switch (segflg) {
6453 case UIO_USERSPACE32:
6454 case UIO_USERISPACE32:
6455 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6456 break;
6457
6458 case UIO_USERSPACE:
6459 case UIO_USERISPACE:
6460 uio->uio_segflg = UIO_PHYS_USERSPACE;
6461 break;
6462
6463 case UIO_USERSPACE64:
6464 case UIO_USERISPACE64:
6465 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6466 break;
6467
6468 case UIO_SYSSPACE:
6469 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6470 break;
6471 }
6472 pl = ubc_upl_pageinfo(upl);
6473
6474 pg_index = upl_offset / PAGE_SIZE;
6475 pg_offset = upl_offset & PAGE_MASK;
6476 csize = min(PAGE_SIZE - pg_offset, xsize);
6477
6478 dirty_count = 0;
6479 while (xsize && retval == 0) {
6480 addr64_t paddr;
6481
6482 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6483 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6484 dirty_count++;
6485 }
6486
6487 retval = uiomove64(paddr, csize, uio);
6488
6489 pg_index += 1;
6490 pg_offset = 0;
6491 xsize -= csize;
6492 csize = min(PAGE_SIZE, xsize);
6493 }
6494 *io_resid = xsize;
6495
6496 uio->uio_segflg = segflg;
6497
6498 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6499 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6500 (int)uio->uio_offset, xsize, retval, segflg, 0);
6501
6502 return retval;
6503 }
6504
6505
6506 int
6507 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6508 {
6509 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6510 }
6511
6512
6513 static int
6514 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6515 {
6516 int segflg;
6517 int io_size;
6518 int xsize;
6519 int start_offset;
6520 int retval = 0;
6521 memory_object_control_t control;
6522
6523 io_size = *io_resid;
6524
6525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6526 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6527
6528 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6529
6530 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6531 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6532 (int)uio->uio_offset, io_size, retval, 3, 0);
6533
6534 return 0;
6535 }
6536 segflg = uio->uio_segflg;
6537
6538 switch (segflg) {
6539 case UIO_USERSPACE32:
6540 case UIO_USERISPACE32:
6541 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6542 break;
6543
6544 case UIO_USERSPACE64:
6545 case UIO_USERISPACE64:
6546 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6547 break;
6548
6549 case UIO_USERSPACE:
6550 case UIO_USERISPACE:
6551 uio->uio_segflg = UIO_PHYS_USERSPACE;
6552 break;
6553
6554 case UIO_SYSSPACE:
6555 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6556 break;
6557 }
6558
6559 if ((io_size = *io_resid)) {
6560 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6561 xsize = uio_resid(uio);
6562
6563 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6564 start_offset, io_size, mark_dirty, take_reference);
6565 xsize -= uio_resid(uio);
6566 io_size -= xsize;
6567 }
6568 uio->uio_segflg = segflg;
6569 *io_resid = io_size;
6570
6571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6572 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6573
6574 return retval;
6575 }
6576
6577
6578 int
6579 is_file_clean(vnode_t vp, off_t filesize)
6580 {
6581 off_t f_offset;
6582 int flags;
6583 int total_dirty = 0;
6584
6585 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6586 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6587 if (flags & UPL_POP_DIRTY) {
6588 total_dirty++;
6589 }
6590 }
6591 }
6592 if (total_dirty) {
6593 return EINVAL;
6594 }
6595
6596 return 0;
6597 }
6598
6599
6600
6601 /*
6602 * Dirty region tracking/clustering mechanism.
6603 *
6604 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6605 * dirty regions within a larger space (file). It is primarily intended to
6606 * support clustering in large files with many dirty areas.
6607 *
6608 * The implementation assumes that the dirty regions are pages.
6609 *
6610 * To represent dirty pages within the file, we store bit vectors in a
6611 * variable-size circular hash.
6612 */
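/*
 * Concrete picture: each hashtable entry covers one 256KB-aligned
 * window of the file (DRT_BITVECTOR_PAGES pages), and the dirty state
 * of each page inside that window is a single bit in the entry's
 * bitvector.  With 4K pages, marking byte offset 0x42345 dirty lands
 * in the entry whose address field is 0x40000, at bit
 * (0x42345 - 0x40000) / PAGE_SIZE == 2.
 */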
6613
6614 /*
6615 * Bitvector size. This determines the number of pages we group in a
6616 * single hashtable entry. Each hashtable entry is aligned to this
6617 * size within the file.
6618 */
6619 #define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
6620
6621 /*
6622 * File offset handling.
6623 *
6624 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6625 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6626 */
6627 #define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6628 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
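/*
 * Worked example (4K PAGE_SIZE): DRT_BITVECTOR_PAGES is 64, so
 * DRT_ADDRESS_MASK is ~((64 * 4096) - 1) == ~0x3FFFF and
 * DRT_ALIGN_ADDRESS(0x42345) == (0x42345 & ~0x3FFFF) == 0x40000.
 */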
6629
6630 /*
6631 * Hashtable address field handling.
6632 *
6633 * The low-order bits of the hashtable address are used to conserve
6634 * space.
6635 *
6636 * DRT_HASH_COUNT_MASK must be large enough to store the range
6637 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6638 * to indicate that the bucket is actually unoccupied.
6639 */
6640 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6641 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
6642 do { \
6643 (scm)->scm_hashtable[(i)].dhe_control = \
6644 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6645 } while (0)
6646 #define DRT_HASH_COUNT_MASK 0x1ff
6647 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6648 #define DRT_HASH_SET_COUNT(scm, i, c) \
6649 do { \
6650 (scm)->scm_hashtable[(i)].dhe_control = \
6651 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6652 } while (0)
6653 #define DRT_HASH_CLEAR(scm, i) \
6654 do { \
6655 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6656 } while (0)
6657 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6658 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
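/*
 * In other words, dhe_control packs two fields: the aligned file
 * offset of the window in the upper bits (selected by DRT_ADDRESS_MASK)
 * and the count of dirty pages in the low 9 bits (DRT_HASH_COUNT_MASK
 * == 0x1ff).  A real count can never reach 0x1ff, so that value doubles
 * as the "bucket is vacant" sentinel used by DRT_HASH_VACATE and
 * DRT_HASH_VACANT.
 */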
6659 #define DRT_HASH_COPY(oscm, oi, scm, i) \
6660 do { \
6661 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6662 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
6663 } while (0)
6664
6665
6666 #if CONFIG_EMBEDDED
6667 /*
6668 * Hash table moduli.
6669 *
6670 * Since the hashtable entry's size is dependent on the size of
6671 * the bitvector, and since the hashtable size is constrained to
6672 * both being prime and fitting within the desired allocation
6673 * size, these values need to be manually determined.
6674 *
6675 * For DRT_BITVECTOR_PAGES = 64, the entry size is 16 bytes.
6676 *
6677 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6678 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6679 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6680 */
6681
6682 #define DRT_HASH_SMALL_MODULUS 251
6683 #define DRT_HASH_LARGE_MODULUS 2039
6684 #define DRT_HASH_XLARGE_MODULUS 8179
6685
6686 /*
6687 * Physical memory required before the large hash modulus is permitted.
6688 *
6689 * On small memory systems, the large hash modulus can lead to physical
6690 * memory starvation, so we avoid using it there.
6691 */
6692 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6693 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
6694
6695 #define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6696 #define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6697 #define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
6698
6699 #else
6700 /*
6701 * Hash table moduli.
6702 *
6703 * Since the hashtable entry's size is dependent on the size of
6704 * the bitvector, and since the hashtable size is constrained to
6705 * both being prime and fitting within the desired allocation
6706 * size, these values need to be manually determined.
6707 *
6708 * For DRT_BITVECTOR_PAGES = 64, the entry size is 16 bytes.
6709 *
6710 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6711 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6712 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6713 */
6714
6715 #define DRT_HASH_SMALL_MODULUS 1019
6716 #define DRT_HASH_LARGE_MODULUS 8179
6717 #define DRT_HASH_XLARGE_MODULUS 32749
6718
6719 /*
6720 * Physical memory required before the large hash modulus is permitted.
6721 *
6722 * On small memory systems, the large hash modulus can lead to physical
6723 * memory starvation, so we avoid using it there.
6724 */
6725 #define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6726 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
6727
6728 #define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6729 #define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6730 #define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
6731
6732 #endif
6733
6734 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6735
6736 /*
6737 * Hashtable entry.
6738 */
6739 struct vfs_drt_hashentry {
6740 u_int64_t dhe_control;
6741 /*
6742 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
6743 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
6744 * Since PAGE_SIZE is only known at boot time,
6745 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
6746 * -declare dhe_bitvector array for largest possible length
6747 */
6748 #define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
6749 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
6750 };
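/*
 * Size check (4K PAGE_SIZE): MAX_DRT_BITVECTOR_PAGES is 64, so the
 * bitvector occupies (64 / 32) * 4 == 8 bytes and each entry is
 * 8 + 8 == 16 bytes, matching the moduli comments above.  The small
 * non-embedded table is then 1019 entries * 16 bytes == 16304 bytes,
 * leaving 80 bytes of the 16384 byte allocation for the clustermap
 * header and slack (the "80 bytes spare" noted above).
 */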
6751
6752 /*
6753 * Hashtable bitvector handling.
6754 *
6755 * Bitvector fields are 32 bits long.
6756 */
6757
6758 #define DRT_HASH_SET_BIT(scm, i, bit) \
6759 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6760
6761 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
6762 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
6763
6764 #define DRT_HASH_TEST_BIT(scm, i, bit) \
6765 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
6766
6767 #define DRT_BITVECTOR_CLEAR(scm, i) \
6768 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6769
6770 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
6771 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
6772 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
6773 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
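/*
 * Bit indexing example: page 37 of a window lives in
 * dhe_bitvector[37 / 32] (word 1) at bit (37 % 32) == 5, so
 * DRT_HASH_SET_BIT(scm, i, 37) ORs (1 << 5) into word 1.
 */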
6774
6775 /*
6776 * Dirty Region Tracking structure.
6777 *
6778 * The hashtable is allocated entirely inside the DRT structure.
6779 *
6780 * The hash is a simple circular prime modulus arrangement, the structure
6781 * is resized from small to large if it overflows.
6782 */
6783
6784 struct vfs_drt_clustermap {
6785 u_int32_t scm_magic; /* sanity/detection */
6786 #define DRT_SCM_MAGIC 0x12020003
6787 u_int32_t scm_modulus; /* current ring size */
6788 u_int32_t scm_buckets; /* number of occupied buckets */
6789 u_int32_t scm_lastclean; /* last entry we cleaned */
6790 u_int32_t scm_iskips; /* number of slot skips */
6791
6792 struct vfs_drt_hashentry scm_hashtable[0];
6793 };
6794
6795
6796 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
6797 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
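/*
 * The table uses simple circular open addressing: DRT_HASH picks the
 * home bucket for an aligned offset and, on a collision, DRT_HASH_NEXT
 * advances to the following bucket modulo the ring size (see
 * vfs_drt_search_index and vfs_drt_get_index below).
 */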
6798
6799 /*
6800 * Debugging codes and arguments.
6801 */
6802 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
6803 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
6804 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
6805 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
6806 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
6807 * dirty */
6808 /* 0, setcount */
6809 /* 1 (clean, no map) */
6810 /* 2 (map alloc fail) */
6811 /* 3, resid (partial) */
6812 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
6813 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
6814 * lastclean, iskips */
6815
6816
6817 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
6818 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
6819 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
6820 u_int64_t offset, int *indexp);
6821 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
6822 u_int64_t offset,
6823 int *indexp,
6824 int recursed);
6825 static kern_return_t vfs_drt_do_mark_pages(
6826 void **cmapp,
6827 u_int64_t offset,
6828 u_int length,
6829 u_int *setcountp,
6830 int dirty);
6831 static void vfs_drt_trace(
6832 struct vfs_drt_clustermap *cmap,
6833 int code,
6834 int arg1,
6835 int arg2,
6836 int arg3,
6837 int arg4);
6838
6839
6840 /*
6841 * Allocate and initialise a sparse cluster map.
6842 *
6843 * Will allocate a new map, resize or compact an existing map.
6844 *
6845 * XXX we should probably have at least one intermediate map size,
6846 * as the 1:16 ratio seems a bit drastic.
6847 */
6848 static kern_return_t
6849 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6850 {
6851 struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
6852 kern_return_t kret = KERN_SUCCESS;
6853 u_int64_t offset = 0;
6854 u_int32_t i = 0;
6855 int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
6856
6857 ocmap = NULL;
6858 if (cmapp != NULL) {
6859 ocmap = *cmapp;
6860 }
6861
6862 /*
6863 * Decide on the size of the new map.
6864 */
6865 if (ocmap == NULL) {
6866 modulus_size = DRT_HASH_SMALL_MODULUS;
6867 map_size = DRT_SMALL_ALLOCATION;
6868 } else {
6869 /* count the number of active buckets in the old map */
6870 active_buckets = 0;
6871 for (i = 0; i < ocmap->scm_modulus; i++) {
6872 if (!DRT_HASH_VACANT(ocmap, i) &&
6873 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
6874 active_buckets++;
6875 }
6876 }
6877 /*
6878 * If we're currently using the small allocation, check to
6879 * see whether we should grow to the large one.
6880 */
6881 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6882 /*
6883 * If the ring is nearly full and we are allowed to
6884 * use the large modulus, upgrade.
6885 */
6886 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6887 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
6888 modulus_size = DRT_HASH_LARGE_MODULUS;
6889 map_size = DRT_LARGE_ALLOCATION;
6890 } else {
6891 modulus_size = DRT_HASH_SMALL_MODULUS;
6892 map_size = DRT_SMALL_ALLOCATION;
6893 }
6894 } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
6895 if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
6896 (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
6897 modulus_size = DRT_HASH_XLARGE_MODULUS;
6898 map_size = DRT_XLARGE_ALLOCATION;
6899 } else {
6900 modulus_size = DRT_HASH_LARGE_MODULUS;
6901 map_size = DRT_LARGE_ALLOCATION;
6902 }
6903 } else {
6904 /* already using the xlarge modulus */
6905 modulus_size = DRT_HASH_XLARGE_MODULUS;
6906 map_size = DRT_XLARGE_ALLOCATION;
6907
6908 /*
6909 * If the ring is completely full, there's
6910 * nothing useful for us to do. Behave as
6911 * though we had compacted into the new
6912 * array and return.
6913 */
6914 if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
6915 return KERN_SUCCESS;
6916 }
6917 }
6918 }
6919
6920 /*
6921 * Allocate and initialise the new map.
6922 */
6923
6924 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
6925 if (kret != KERN_SUCCESS) {
6926 return kret;
6927 }
6928 cmap->scm_magic = DRT_SCM_MAGIC;
6929 cmap->scm_modulus = modulus_size;
6930 cmap->scm_buckets = 0;
6931 cmap->scm_lastclean = 0;
6932 cmap->scm_iskips = 0;
6933 for (i = 0; i < cmap->scm_modulus; i++) {
6934 DRT_HASH_CLEAR(cmap, i);
6935 DRT_HASH_VACATE(cmap, i);
6936 DRT_BITVECTOR_CLEAR(cmap, i);
6937 }
6938
6939 /*
6940 * If there's an old map, re-hash entries from it into the new map.
6941 */
6942 copycount = 0;
6943 if (ocmap != NULL) {
6944 for (i = 0; i < ocmap->scm_modulus; i++) {
6945 /* skip empty buckets */
6946 if (DRT_HASH_VACANT(ocmap, i) ||
6947 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
6948 continue;
6949 }
6950 /* get new index */
6951 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6952 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6953 if (kret != KERN_SUCCESS) {
6954 /* XXX need to bail out gracefully here */
6955 panic("vfs_drt: new cluster map mysteriously too small");
6956 index = 0;
6957 }
6958 /* copy */
6959 DRT_HASH_COPY(ocmap, i, cmap, index);
6960 copycount++;
6961 }
6962 }
6963
6964 /* log what we've done */
6965 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
6966
6967 /*
6968 * It's important to ensure that *cmapp always points to
6969 * a valid map, so we must overwrite it before freeing
6970 * the old map.
6971 */
6972 *cmapp = cmap;
6973 if (ocmap != NULL) {
6974 /* emit stats into trace buffer */
6975 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
6976 ocmap->scm_modulus,
6977 ocmap->scm_buckets,
6978 ocmap->scm_lastclean,
6979 ocmap->scm_iskips);
6980
6981 vfs_drt_free_map(ocmap);
6982 }
6983 return KERN_SUCCESS;
6984 }
6985
6986
6987 /*
6988 * Free a sparse cluster map.
6989 */
6990 static kern_return_t
6991 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6992 {
6993 vm_size_t map_size = 0;
6994
6995 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6996 map_size = DRT_SMALL_ALLOCATION;
6997 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
6998 map_size = DRT_LARGE_ALLOCATION;
6999 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7000 map_size = DRT_XLARGE_ALLOCATION;
7001 } else {
7002 panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
7003 }
7004
7005 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7006 return KERN_SUCCESS;
7007 }
7008
7009
7010 /*
7011 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7012 */
7013 static kern_return_t
7014 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7015 {
7016 int index;
7017 u_int32_t i;
7018
7019 offset = DRT_ALIGN_ADDRESS(offset);
7020 index = DRT_HASH(cmap, offset);
7021
7022 /* traverse the hashtable */
7023 for (i = 0; i < cmap->scm_modulus; i++) {
7024 /*
7025 * If the slot is vacant, we can stop.
7026 */
7027 if (DRT_HASH_VACANT(cmap, index)) {
7028 break;
7029 }
7030
7031 /*
7032 * If the address matches our offset, we have success.
7033 */
7034 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7035 *indexp = index;
7036 return KERN_SUCCESS;
7037 }
7038
7039 /*
7040 * Move to the next slot, try again.
7041 */
7042 index = DRT_HASH_NEXT(cmap, index);
7043 }
7044 /*
7045 * It's not there.
7046 */
7047 return KERN_FAILURE;
7048 }
7049
7050 /*
7051 * Find the hashtable slot for the supplied offset. If we haven't allocated
7052 * one yet, allocate one and populate the address field. Note that it will
7053 * not have a nonzero page count and thus will still technically be free, so
7054 * in the case where we are called to clean pages, the slot will remain free.
7055 */
7056 static kern_return_t
7057 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
7058 {
7059 struct vfs_drt_clustermap *cmap;
7060 kern_return_t kret;
7061 u_int32_t index;
7062 u_int32_t i;
7063
7064 cmap = *cmapp;
7065
7066 /* look for an existing entry */
7067 kret = vfs_drt_search_index(cmap, offset, indexp);
7068 if (kret == KERN_SUCCESS) {
7069 return kret;
7070 }
7071
7072 /* need to allocate an entry */
7073 offset = DRT_ALIGN_ADDRESS(offset);
7074 index = DRT_HASH(cmap, offset);
7075
7076 /* scan from the index forwards looking for a vacant slot */
7077 for (i = 0; i < cmap->scm_modulus; i++) {
7078 /* slot vacant? */
7079 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
7080 cmap->scm_buckets++;
7081 if (index < cmap->scm_lastclean) {
7082 cmap->scm_lastclean = index;
7083 }
7084 DRT_HASH_SET_ADDRESS(cmap, index, offset);
7085 DRT_HASH_SET_COUNT(cmap, index, 0);
7086 DRT_BITVECTOR_CLEAR(cmap, index);
7087 *indexp = index;
7088 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
7089 return KERN_SUCCESS;
7090 }
7091 cmap->scm_iskips += i;
7092 index = DRT_HASH_NEXT(cmap, index);
7093 }
7094
7095 /*
7096 * We haven't found a vacant slot, so the map is full. If we're not
7097 * already recursed, try reallocating/compacting it.
7098 */
7099 if (recursed) {
7100 return KERN_FAILURE;
7101 }
7102 kret = vfs_drt_alloc_map(cmapp);
7103 if (kret == KERN_SUCCESS) {
7104 /* now try to insert again */
7105 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
7106 }
7107 return kret;
7108 }
7109
7110 /*
7111 * Implementation of set dirty/clean.
7112 *
7113 * In the 'clean' case, not finding a map is OK.
7114 */
7115 static kern_return_t
7116 vfs_drt_do_mark_pages(
7117 void **private,
7118 u_int64_t offset,
7119 u_int length,
7120 u_int *setcountp,
7121 int dirty)
7122 {
7123 struct vfs_drt_clustermap *cmap, **cmapp;
7124 kern_return_t kret;
7125 int i, index, pgoff, pgcount, setcount, ecount;
7126
7127 cmapp = (struct vfs_drt_clustermap **)private;
7128 cmap = *cmapp;
7129
7130 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
7131
7132 if (setcountp != NULL) {
7133 *setcountp = 0;
7134 }
7135
7136 /* allocate a cluster map if we don't already have one */
7137 if (cmap == NULL) {
7138 /* no cluster map, nothing to clean */
7139 if (!dirty) {
7140 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
7141 return KERN_SUCCESS;
7142 }
7143 kret = vfs_drt_alloc_map(cmapp);
7144 if (kret != KERN_SUCCESS) {
7145 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
7146 return kret;
7147 }
7148 }
7149 setcount = 0;
7150
7151 /*
7152 * Iterate over the length of the region.
7153 */
7154 while (length > 0) {
7155 /*
7156 * Get the hashtable index for this offset.
7157 *
7158 * XXX this will add blank entries if we are clearing a range
7159 * that hasn't been dirtied.
7160 */
7161 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
7162 cmap = *cmapp; /* may have changed! */
7163 /* this may be a partial-success return */
7164 if (kret != KERN_SUCCESS) {
7165 if (setcountp != NULL) {
7166 *setcountp = setcount;
7167 }
7168 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
7169
7170 return kret;
7171 }
7172
7173 /*
7174 * Work out how many pages we're modifying in this
7175 * hashtable entry.
7176 */
7177 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
7178 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
7179
7180 /*
7181 * Iterate over pages, dirty/clearing as we go.
7182 */
7183 ecount = DRT_HASH_GET_COUNT(cmap, index);
7184 for (i = 0; i < pgcount; i++) {
7185 if (dirty) {
7186 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7187 if (ecount >= DRT_BITVECTOR_PAGES) {
7188 panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7189 }
7190 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
7191 ecount++;
7192 setcount++;
7193 }
7194 } else {
7195 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7196 if (ecount <= 0) {
7197 panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7198 }
7199 assert(ecount > 0);
7200 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
7201 ecount--;
7202 setcount++;
7203 }
7204 }
7205 }
7206 DRT_HASH_SET_COUNT(cmap, index, ecount);
7207
7208 offset += pgcount * PAGE_SIZE;
7209 length -= pgcount * PAGE_SIZE;
7210 }
7211 if (setcountp != NULL) {
7212 *setcountp = setcount;
7213 }
7214
7215 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
7216
7217 return KERN_SUCCESS;
7218 }
7219
7220 /*
7221 * Mark a set of pages as dirty/clean.
7222 *
7223 * This is a public interface.
7224 *
7225 * cmapp
7226 * Pointer to storage suitable for holding a pointer. Note that
7227 * this must either be NULL or a value set by this function.
7228 *
7229 * size
7230 * Current file size in bytes.
7231 *
7232 * offset
7233 * Offset of the first page to be marked as dirty, in bytes. Must be
7234 * page-aligned.
7235 *
7236 * length
7237 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7238 *
7239 * setcountp
7240 * Number of pages newly marked dirty by this call (optional).
7241 *
7242 * Returns KERN_SUCCESS if all the pages were successfully marked.
7243 */
7244 static kern_return_t
7245 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
7246 {
7247 /* XXX size unused, drop from interface */
7248 return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
7249 }
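/*
 * Typical cycle (as driven by sparse_cluster_add and sparse_cluster_push
 * above): vfs_drt_mark_pages is called with a page-aligned offset and
 * length as pages are dirtied; if it cannot record the whole range
 * because the map is full, it returns a failure along with the partial
 * count, and the caller pushes some dirty clusters out before retrying
 * the remainder.  Later, vfs_drt_get_cluster repeatedly hands back one
 * contiguous dirty run at a time (clearing it in the map) until nothing
 * is left, at which point it frees the map and returns KERN_FAILURE.
 */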
7250
7251 #if 0
7252 static kern_return_t
7253 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
7254 {
7255 return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7256 }
7257 #endif
7258
7259 /*
7260 * Get a cluster of dirty pages.
7261 *
7262 * This is a public interface.
7263 *
7264 * cmapp
7265 * Pointer to storage managed by drt_mark_pages. Note that this must
7266 * be NULL or a value set by drt_mark_pages.
7267 *
7268 * offsetp
7269 * Returns the byte offset into the file of the first page in the cluster.
7270 *
7271 * lengthp
7272 * Returns the length in bytes of the cluster of dirty pages.
7273 *
7274 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
7275 * are no dirty pages meeting the minimum size criteria. Private storage will
7276 * be released if there are no more dirty pages left in the map.
7277 *
7278 */
7279 static kern_return_t
7280 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
7281 {
7282 struct vfs_drt_clustermap *cmap;
7283 u_int64_t offset;
7284 u_int length;
7285 u_int32_t j;
7286 int index, i, fs, ls;
7287
7288 /* sanity */
7289 if ((cmapp == NULL) || (*cmapp == NULL)) {
7290 return KERN_FAILURE;
7291 }
7292 cmap = *cmapp;
7293
7294 /* walk the hashtable */
7295 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
7296 index = DRT_HASH(cmap, offset);
7297
7298 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
7299 continue;
7300 }
7301
7302 /* scan the bitfield for a string of bits */
7303 fs = -1;
7304
7305 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7306 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7307 fs = i;
7308 break;
7309 }
7310 }
7311 if (fs == -1) {
7312 /* didn't find any bits set */
7313 panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7314 cmap, index, DRT_HASH_GET_COUNT(cmap, index));
7315 }
7316 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
7317 if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
7318 break;
7319 }
7320 }
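			/* ls now counts the run of consecutive dirty bits starting at page fs */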
7321
7322 /* compute offset and length, mark pages clean */
7323 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7324 length = ls * PAGE_SIZE;
7325 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7326 cmap->scm_lastclean = index;
7327
7328 /* return successful */
7329 *offsetp = (off_t)offset;
7330 *lengthp = length;
7331
7332 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
7333 return KERN_SUCCESS;
7334 }
7335 /*
7336 * We didn't find anything... the hashtable is empty:
7337 * emit stats into the trace buffer and
7338 * then free it.
7339 */
7340 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7341 cmap->scm_modulus,
7342 cmap->scm_buckets,
7343 cmap->scm_lastclean,
7344 cmap->scm_iskips);
7345
7346 vfs_drt_free_map(cmap);
7347 *cmapp = NULL;
7348
7349 return KERN_FAILURE;
7350 }
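
/*
 * Illustrative sketch only: the drain pattern implied by the
 * vfs_drt_get_cluster contract above -- keep pulling runs of dirty pages
 * until KERN_FAILURE, at which point the private storage has already been
 * freed and *scmap has been reset to NULL.  The caller name and the
 * push_region() helper are hypothetical placeholders, not part of this file.
 */
#if 0
static void
example_drain_dirty_clusters(void **scmap)
{
	off_t offset;
	u_int length;

	while (vfs_drt_get_cluster(scmap, &offset, &length) == KERN_SUCCESS) {
		/* write back the run of dirty pages covering [offset, offset + length) */
		push_region(offset, length);
	}
	/* on KERN_FAILURE the map has been freed and *scmap is NULL */
}
#endif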
7351
7352
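/*
 * Free or reset a cluster map.
 *
 * op_type 0: emit final stats into the trace buffer, free the map
 *            and clear *cmapp.
 * op_type 1: reset the map's last-clean hint (scm_lastclean) to 0.
 */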
7353 static kern_return_t
7354 vfs_drt_control(void **cmapp, int op_type)
7355 {
7356 struct vfs_drt_clustermap *cmap;
7357
7358 /* sanity */
7359 if ((cmapp == NULL) || (*cmapp == NULL)) {
7360 return KERN_FAILURE;
7361 }
7362 cmap = *cmapp;
7363
7364 switch (op_type) {
7365 case 0:
7366 /* emit stats into trace buffer */
7367 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7368 cmap->scm_modulus,
7369 cmap->scm_buckets,
7370 cmap->scm_lastclean,
7371 cmap->scm_iskips);
7372
7373 vfs_drt_free_map(cmap);
7374 *cmapp = NULL;
7375 break;
7376
7377 case 1:
7378 cmap->scm_lastclean = 0;
7379 break;
7380 }
7381 return KERN_SUCCESS;
7382 }
7383
7384
7385
7386 /*
7387 * Emit a summary of the state of the clustermap into the trace buffer
7388 * along with some caller-provided data.
7389 */
7390 #if KDEBUG
7391 static void
7392 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
7393 {
7394 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7395 }
7396 #else
7397 static void
7398 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7399 __unused int arg1, __unused int arg2, __unused int arg3,
7400 __unused int arg4)
7401 {
7402 }
7403 #endif
7404
7405 #if 0
7406 /*
7407 * Perform basic sanity check on the hash entry summary count
7408 * vs. the actual bits set in the entry.
7409 */
7410 static void
7411 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7412 {
7413 int index, i;
7414 int bits_on;
7415
7416 for (index = 0; index < cmap->scm_modulus; index++) {
7417 if (DRT_HASH_VACANT(cmap, index)) {
7418 continue;
7419 }
7420
7421 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7422 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7423 bits_on++;
7424 }
7425 }
7426 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
7427 panic("bits_on = %d, index = %d\n", bits_on, index);
7428 }
7429 }
7430 }
7431 #endif
7432
7433 /*
7434 * Internal interface only: report the preferred push behavior for this sparse cluster map via *push_flag.
7435 */
7436 static kern_return_t
7437 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7438 {
7439 struct vfs_drt_clustermap *cmap;
7440
7441 /* sanity */
7442 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7443 return KERN_FAILURE;
7444 }
7445 cmap = *cmapp;
7446
7447 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7448 /*
7449 * If we have a full xlarge sparse cluster,
7450 * we push it out all at once so the cluster
7451 * map can be available to absorb more I/Os.
7452 * This is done on large-memory configurations
7453 * so that small I/Os don't interfere with
7454 * professional workloads.
7455 */
7456 *push_flag = PUSH_ALL;
7457 }
7458 return KERN_SUCCESS;
7459 }
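
/*
 * Illustrative sketch only: a hypothetical caller deciding how aggressively
 * to push a sparse cluster.  The routine above leaves push_flag untouched
 * unless the map uses the xlarge modulus, so the caller initializes it to a
 * default of its choosing (0 here).  The wrapper name is an assumption for
 * the example; only vfs_get_scmap_push_behavior_internal and PUSH_ALL are
 * taken from the code above.
 */
#if 0
static int
example_choose_push_flag(void **scmap)
{
	int push_flag = 0;

	/* upgrades push_flag to PUSH_ALL for xlarge sparse cluster maps */
	(void) vfs_get_scmap_push_behavior_internal(scmap, &push_flag);

	return push_flag;
}
#endif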