bsd/vfs/vfs_cluster.c

   1 /*
   2  * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
  62  */
  63
  64 #include <sys/param.h>
  65 #include <sys/proc_internal.h>
  66 #include <sys/buf_internal.h>
  67 #include <sys/mount_internal.h>
  68 #include <sys/vnode_internal.h>
  69 #include <sys/trace.h>
  70 #include <sys/malloc.h>
  71 #include <sys/time.h>
  72 #include <sys/kernel.h>
  73 #include <sys/resourcevar.h>
  74 #include <miscfs/specfs/specdev.h>
  75 #include <sys/uio_internal.h>
  76 #include <libkern/libkern.h>
  77 #include <machine/machine_routines.h>
  78
  79 #include <sys/ubc_internal.h>
  80 #include <vm/vnode_pager.h>
  81
  82 #include <mach/mach_types.h>
  83 #include <mach/memory_object_types.h>
  84 #include <mach/vm_map.h>
  85 #include <mach/upl.h>
  86 #include <kern/task.h>
  87
  88 #include <vm/vm_kern.h>
  89 #include <vm/vm_map.h>
  90 #include <vm/vm_pageout.h>
  91 #include <vm/vm_fault.h>
  92
  93 #include <sys/kdebug.h>
  94 #include <libkern/OSAtomic.h>
  95
  96 #include <sys/sdt.h>
  97
  98 #include <stdbool.h>
  99
 100 #if 0
 101 #undef KERNEL_DEBUG
 102 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
 103 #endif
 104
 105
 106 #define CL_READ         0x01
 107 #define CL_WRITE        0x02
 108 #define CL_ASYNC        0x04
 109 #define CL_COMMIT       0x08
 110 #define CL_PAGEOUT      0x10
 111 #define CL_AGE          0x20
 112 #define CL_NOZERO       0x40
 113 #define CL_PAGEIN       0x80
 114 #define CL_DEV_MEMORY   0x100
 115 #define CL_PRESERVE     0x200
 116 #define CL_THROTTLE     0x400
 117 #define CL_KEEPCACHED   0x800
 118 #define CL_DIRECT_IO    0x1000
 119 #define CL_PASSIVE      0x2000
 120 #define CL_IOSTREAMING  0x4000
 121 #define CL_CLOSE        0x8000
 122 #define CL_ENCRYPTED    0x10000
 123 #define CL_RAW_ENCRYPTED        0x20000
 124 #define CL_NOCACHE      0x40000
 125
 126 #define MAX_VECTOR_UPL_ELEMENTS 8
 127 #define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)
 128
 129 extern upl_t vector_upl_create(vm_offset_t);
 130 extern boolean_t vector_upl_is_valid(upl_t);
 131 extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t);
 132 extern void vector_upl_set_pagelist(upl_t);
 133 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
 134
 135 struct clios {
 136         lck_mtx_t io_mtxp;
 137         u_int  io_completed;       /* amount of io that has currently completed */
 138         u_int  io_issued;          /* amount of io that was successfully issued */
 139         int    io_error;           /* error code of first error encountered */
 140         int    io_wanted;          /* someone is sleeping waiting for a change in state */
 141 };
 142
 143 struct cl_direct_read_lock {
 144         LIST_ENTRY(cl_direct_read_lock)         chain;
 145         int32_t                                                         ref_count;
 146         vnode_t                                                         vp;
 147         lck_rw_t                                                        rw_lock;
 148 };
 149
 150 #define CL_DIRECT_READ_LOCK_BUCKETS 61
 151
 152 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
 153         cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
 154
 155 static lck_spin_t cl_direct_read_spin_lock;
 156
 157 static lck_grp_t        *cl_mtx_grp;
 158 static lck_attr_t       *cl_mtx_attr;
 159 static lck_grp_attr_t   *cl_mtx_grp_attr;
 160 static lck_mtx_t        *cl_transaction_mtxp;
 161
 162 #define IO_UNKNOWN      0
 163 #define IO_DIRECT       1
 164 #define IO_CONTIG       2
 165 #define IO_COPY         3
 166
 167 #define PUSH_DELAY      0x01
 168 #define PUSH_ALL        0x02
 169 #define PUSH_SYNC       0x04
 170
 171
 172 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
 173 static void cluster_wait_IO(buf_t cbp_head, int async);
 174 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
 175
 176 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
 177
 178 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
 179                       int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
 180 static int cluster_iodone(buf_t bp, void *callback_arg);
 181 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
 182 static int cluster_is_throttled(vnode_t vp);
 183
 184 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
 185
 186 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
 187
 188 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
 189 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
 190
 191 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size,  off_t filesize, int flags,
 192                              int (*)(buf_t, void *), void *callback_arg);
 193 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
 194                                int flags, int (*)(buf_t, void *), void *callback_arg);
 195 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
 196                                int (*)(buf_t, void *), void *callback_arg, int flags);
 197
 198 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
 199                               off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
 200 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
 201                                 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
 202 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
 203                                 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
 204
 205 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
 206
 207 static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
 208 static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
 209
 210 static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);
 211
 212 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);
 213
 214 static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
 215 static void     sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
 216 static void     sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
 217
 218 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
 219 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
 220 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
 221
 222
 223 /*
 224  * For throttled IO to check whether
 225  * a block is cached by the boot cache
 226  * and thus it can avoid delaying the IO.
 227  *
 228  * bootcache_contains_block is initially
 229  * NULL. The BootCache will set it while
 230  * the cache is active and clear it when
 231  * the cache is jettisoned.
 232  *
 233  * Returns 0 if the block is not
 234  * contained in the cache, 1 if it is
 235  * contained.
 236  *
 237  * The function pointer remains valid
 238  * after the cache has been evicted even
 239  * if bootcache_contains_block has been
 240  * cleared.
 241  *
 242  * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 243  */
 244 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
 245
 246
 247 /*
 248  * limit the internal I/O size so that we
 249  * can represent it in a 32 bit int
 250  */
 251 #define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
 252 #define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES
 253 #define MAX_VECTS               16
 254 /*
 255  * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 256  * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 257  * we have not historically allowed the write to bypass the UBC.
 258  */
 259 #define MIN_DIRECT_WRITE_SIZE   (16384)
 260
 261 #define WRITE_THROTTLE          6
 262 #define WRITE_THROTTLE_SSD      2
 263 #define WRITE_BEHIND            1
 264 #define WRITE_BEHIND_SSD        1
 265
 266 #define PREFETCH                3
 267 #define PREFETCH_SSD            2
 268 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a specluative read-ahead */
 269 uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a specluative read-ahead on SSDs*/
 270
 271
 272 #define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
 273 #define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
 274 #define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))
 275
 276 int     ignore_is_ssd = 0;
 277 int     speculative_reads_disabled = 0;
 278
 279 /*
 280  * throttle the number of async writes that
 281  * can be outstanding on a single vnode
 282  * before we issue a synchronous write
 283  */
 284 #define THROTTLE_MAXCNT 0
 285
 286 uint32_t throttle_max_iosize = (128 * 1024);
 287
 288 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
 289
 290 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
 291
 292
 293 void
 294 cluster_init(void) {
 295         /*
 296          * allocate lock group attribute and group
 297          */
 298         cl_mtx_grp_attr = lck_grp_attr_alloc_init();
 299         cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
 300
 301         /*
 302          * allocate the lock attribute
 303          */
 304         cl_mtx_attr = lck_attr_alloc_init();
 305
 306         cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
 307
 308         if (cl_transaction_mtxp == NULL)
 309                 panic("cluster_init: failed to allocate cl_transaction_mtxp");
 310
 311         lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
 312
 313         for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
 314                 LIST_INIT(&cl_direct_read_locks[i]);
 315 }
 316
 317
 318 uint32_t
 319 cluster_max_io_size(mount_t mp, int type)
 320 {
 321         uint32_t        max_io_size;
 322         uint32_t        segcnt;
 323         uint32_t        maxcnt;
 324
 325         switch(type) {
 326
 327         case CL_READ:
 328                 segcnt = mp->mnt_segreadcnt;
 329                 maxcnt = mp->mnt_maxreadcnt;
 330                 break;
 331         case CL_WRITE:
 332                 segcnt = mp->mnt_segwritecnt;
 333                 maxcnt = mp->mnt_maxwritecnt;
 334                 break;
 335         default:
 336                 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
 337                 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
 338                 break;
 339         }
 340         if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
 341                /*
 342                 * don't allow a size beyond the max UPL size we can create
 343                 */
 344                segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
 345        }
 346        max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
 347
 348        if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
 349                /*
 350                 * don't allow a size smaller than the old fixed limit
 351                 */
 352                max_io_size = MAX_UPL_TRANSFER_BYTES;
 353        } else {
 354                /*
 355                 * make sure the size specified is a multiple of PAGE_SIZE
 356                 */
 357                max_io_size &= ~PAGE_MASK;
 358        }
 359        return (max_io_size);
 360 }
 361
 362
 363
 364
 365 #define CLW_ALLOCATE            0x01
 366 #define CLW_RETURNLOCKED        0x02
 367 #define CLW_IONOCACHE           0x04
 368 #define CLW_IOPASSIVE   0x08
 369
 370 /*
 371  * if the read ahead context doesn't yet exist,
 372  * allocate and initialize it...
 373  * the vnode lock serializes multiple callers
 374  * during the actual assignment... first one
 375  * to grab the lock wins... the other callers
 376  * will release the now unnecessary storage
 377  *
 378  * once the context is present, try to grab (but don't block on)
 379  * the lock associated with it... if someone
 380  * else currently owns it, than the read
 381  * will run without read-ahead.  this allows
 382  * multiple readers to run in parallel and
 383  * since there's only 1 read ahead context,
 384  * there's no real loss in only allowing 1
 385  * reader to have read-ahead enabled.
 386  */
 387 static struct cl_readahead *
 388 cluster_get_rap(vnode_t vp)
 389 {
 390         struct ubc_info         *ubc;
 391         struct cl_readahead     *rap;
 392
 393         ubc = vp->v_ubcinfo;
 394
 395         if ((rap = ubc->cl_rahead) == NULL) {
 396                 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
 397
 398                 bzero(rap, sizeof *rap);
 399                 rap->cl_lastr = -1;
 400                 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
 401
 402                 vnode_lock(vp);
 403
 404                 if (ubc->cl_rahead == NULL)
 405                         ubc->cl_rahead = rap;
 406                 else {
 407                         lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
 408                         FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
 409                         rap = ubc->cl_rahead;
 410                 }
 411                 vnode_unlock(vp);
 412         }
 413         if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
 414                 return(rap);
 415
 416         return ((struct cl_readahead *)NULL);
 417 }
 418
 419
 420 /*
 421  * if the write behind context doesn't yet exist,
 422  * and CLW_ALLOCATE is specified, allocate and initialize it...
 423  * the vnode lock serializes multiple callers
 424  * during the actual assignment... first one
 425  * to grab the lock wins... the other callers
 426  * will release the now unnecessary storage
 427  *
 428  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 429  * the lock associated with the write behind context before
 430  * returning
 431  */
 432
 433 static struct cl_writebehind *
 434 cluster_get_wbp(vnode_t vp, int flags)
 435 {
 436         struct ubc_info *ubc;
 437         struct cl_writebehind *wbp;
 438
 439         ubc = vp->v_ubcinfo;
 440
 441         if ((wbp = ubc->cl_wbehind) == NULL) {
 442
 443                 if ( !(flags & CLW_ALLOCATE))
 444                         return ((struct cl_writebehind *)NULL);
 445
 446                 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
 447
 448                 bzero(wbp, sizeof *wbp);
 449                 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
 450
 451                 vnode_lock(vp);
 452
 453                 if (ubc->cl_wbehind == NULL)
 454                         ubc->cl_wbehind = wbp;
 455                 else {
 456                         lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
 457                         FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
 458                         wbp = ubc->cl_wbehind;
 459                 }
 460                 vnode_unlock(vp);
 461         }
 462         if (flags & CLW_RETURNLOCKED)
 463                 lck_mtx_lock(&wbp->cl_lockw);
 464
 465         return (wbp);
 466 }
 467
 468
 469 static void
 470 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
 471 {
 472         struct cl_writebehind *wbp;
 473
 474         if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
 475
 476                 if (wbp->cl_number) {
 477                         lck_mtx_lock(&wbp->cl_lockw);
 478
 479                         cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg);
 480
 481                         lck_mtx_unlock(&wbp->cl_lockw);
 482                 }
 483         }
 484 }
 485
 486
 487 static int
 488 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
 489 {
 490         daddr64_t blkno;
 491         size_t    io_size;
 492         int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
 493
 494         if (bootcache_check_fn) {
 495                 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
 496                         return(0);
 497
 498                 if (io_size == 0)
 499                         return (0);
 500
 501                 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
 502                         return(1);
 503         }
 504         return(0);
 505 }
 506
 507
 508 static int
 509 cluster_is_throttled(vnode_t vp)
 510 {
 511         return (throttle_io_will_be_throttled(-1, vp->v_mount));
 512 }
 513
 514
 515 static void
 516 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
 517 {
 518
 519         lck_mtx_lock(&iostate->io_mtxp);
 520
 521         while ((iostate->io_issued - iostate->io_completed) > target) {
 522
 523                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
 524                              iostate->io_issued, iostate->io_completed, target, 0, 0);
 525
 526                 iostate->io_wanted = 1;
 527                 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
 528
 529                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
 530                              iostate->io_issued, iostate->io_completed, target, 0, 0);
 531         }
 532         lck_mtx_unlock(&iostate->io_mtxp);
 533 }
 534
 535 static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
 536                                                                                   upl_offset_t upl_offset, upl_size_t size)
 537 {
 538         if (!size)
 539                 return;
 540
 541         upl_t associated_upl = upl_associated_upl(upl);
 542
 543         if (!associated_upl)
 544                 return;
 545
 546 #if 0
 547         printf("1: %d %d\n", upl_offset, upl_offset + size);
 548 #endif
 549
 550         /*
 551          * The associated UPL is page aligned to file offsets whereas the
 552          * UPL it's attached to has different alignment requirements.  The
 553          * upl_offset that we have refers to @upl.  The code that follows
 554          * has to deal with the first and last pages in this transaction
 555          * which might straddle pages in the associated UPL.  To keep
 556          * track of these pages, we use the mark bits: if the mark bit is
 557          * set, we know another transaction has completed its part of that
 558          * page and so we can unlock that page here.
 559          *
 560          * The following illustrates what we have to deal with:
 561          *
 562          *    MEM u <------------ 1 PAGE ------------> e
 563          *        +-------------+----------------------+-----------------
 564          *        |             |######################|#################
 565          *        +-------------+----------------------+-----------------
 566          *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
 567          *
 568          * So here we show a write to offset @o.  The data that is to be
 569          * written is in a buffer that is not page aligned; it has offset
 570          * @a in the page.  The upl that carries the data starts in memory
 571          * at @u.  The associated upl starts in the file at offset @o.  A
 572          * transaction will always end on a page boundary (like @e above)
 573          * except for the very last transaction in the group.  We cannot
 574          * unlock the page at @o in the associated upl until both the
 575          * transaction ending at @e and the following transaction (that
 576          * starts at @e) has completed.
 577          */
 578
 579         /*
 580          * We record whether or not the two UPLs are aligned as the mark
 581          * bit in the first page of @upl.
 582          */
 583         upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 584         bool is_unaligned = upl_page_get_mark(pl, 0);
 585
 586         if (is_unaligned) {
 587                 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
 588
 589                 upl_offset_t upl_end = upl_offset + size;
 590                 assert(upl_end >= PAGE_SIZE);
 591
 592                 upl_size_t assoc_upl_size = upl_get_size(associated_upl);
 593
 594                 /*
 595                  * In the very first transaction in the group, upl_offset will
 596                  * not be page aligned, but after that it will be and in that
 597                  * case we want the preceding page in the associated UPL hence
 598                  * the minus one.
 599                  */
 600                 assert(upl_offset);
 601                 if (upl_offset)
 602                         upl_offset = trunc_page_32(upl_offset - 1);
 603
 604                 lck_mtx_lock_spin(&iostate->io_mtxp);
 605
 606                 // Look at the first page...
 607                 if (upl_offset
 608                         && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
 609                         /*
 610                          * The first page isn't marked so let another transaction
 611                          * completion handle it.
 612                          */
 613                         upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
 614                         upl_offset += PAGE_SIZE;
 615                 }
 616
 617                 // And now the last page...
 618
 619                 /*
 620                  * This needs to be > rather than >= because if it's equal, it
 621                  * means there's another transaction that is sharing the last
 622                  * page.
 623                  */
 624                 if (upl_end > assoc_upl_size)
 625                         upl_end = assoc_upl_size;
 626                 else {
 627                         upl_end = trunc_page_32(upl_end);
 628                         const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
 629
 630                         if (!upl_page_get_mark(assoc_pl, last_pg)) {
 631                                 /*
 632                                  * The last page isn't marked so mark the page and let another
 633                                  * transaction completion handle it.
 634                                  */
 635                                 upl_page_set_mark(assoc_pl, last_pg, true);
 636                                 upl_end -= PAGE_SIZE;
 637                         }
 638                 }
 639
 640                 lck_mtx_unlock(&iostate->io_mtxp);
 641
 642 #if 0
 643                 printf("2: %d %d\n", upl_offset, upl_end);
 644 #endif
 645
 646                 if (upl_end <= upl_offset)
 647                         return;
 648
 649                 size = upl_end - upl_offset;
 650         } else {
 651                 assert(!(upl_offset & PAGE_MASK));
 652                 assert(!(size & PAGE_MASK));
 653         }
 654
 655         boolean_t empty;
 656
 657         /*
 658          * We can unlock these pages now and as this is for a
 659          * direct/uncached write, we want to dump the pages too.
 660          */
 661         kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
 662                                                                            UPL_ABORT_DUMP_PAGES, &empty);
 663
 664         assert(!kr);
 665
 666         if (!kr && empty) {
 667                 upl_set_associated_upl(upl, NULL);
 668                 upl_deallocate(associated_upl);
 669         }
 670 }
 671
 672 static int
 673 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
 674 {
 675         int upl_abort_code = 0;
 676         int page_in  = 0;
 677         int page_out = 0;
 678
 679         if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
 680                 /*
 681                  * direct write of any flavor, or a direct read that wasn't aligned
 682                  */
 683                 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
 684         else {
 685                 if (io_flags & B_PAGEIO) {
 686                         if (io_flags & B_READ)
 687                                 page_in  = 1;
 688                         else
 689                                 page_out = 1;
 690                 }
 691                 if (io_flags & B_CACHE)
 692                         /*
 693                          * leave pages in the cache unchanged on error
 694                          */
 695                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 696                 else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
 697                         /*
 698                          * transient error... leave pages unchanged
 699                          */
 700                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
 701                 else if (page_in)
 702                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
 703                 else
 704                         upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
 705
 706                 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
 707         }
 708         return (upl_abort_code);
 709 }
 710
 711
 712 static int
 713 cluster_iodone(buf_t bp, void *callback_arg)
 714 {
 715         int     b_flags;
 716         int     error;
 717         int     total_size;
 718         int     total_resid;
 719         int     upl_offset;
 720         int     zero_offset;
 721         int     pg_offset = 0;
 722         int     commit_size = 0;
 723         int     upl_flags = 0;
 724         int     transaction_size = 0;
 725         upl_t   upl;
 726         buf_t   cbp;
 727         buf_t   cbp_head;
 728         buf_t   cbp_next;
 729         buf_t   real_bp;
 730         vnode_t vp;
 731         struct  clios *iostate;
 732         boolean_t       transaction_complete = FALSE;
 733
 734         __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
 735
 736         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
 737                      cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
 738
 739         if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
 740                 boolean_t       need_wakeup = FALSE;
 741
 742                 lck_mtx_lock_spin(cl_transaction_mtxp);
 743
 744                 bp->b_flags |= B_TDONE;
 745
 746                 if (bp->b_flags & B_TWANTED) {
 747                         CLR(bp->b_flags, B_TWANTED);
 748                         need_wakeup = TRUE;
 749                 }
 750                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 751                         /*
 752                          * all I/O requests that are part of this transaction
 753                          * have to complete before we can process it
 754                          */
 755                         if ( !(cbp->b_flags & B_TDONE)) {
 756
 757                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 758                                              cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
 759
 760                                 lck_mtx_unlock(cl_transaction_mtxp);
 761
 762                                 if (need_wakeup == TRUE)
 763                                         wakeup(bp);
 764
 765                                 return 0;
 766                         }
 767                         if (cbp->b_flags & B_EOT)
 768                                 transaction_complete = TRUE;
 769                 }
 770                 lck_mtx_unlock(cl_transaction_mtxp);
 771
 772                 if (need_wakeup == TRUE)
 773                         wakeup(bp);
 774
 775                 if (transaction_complete == FALSE) {
 776                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 777                                      cbp_head, 0, 0, 0, 0);
 778                         return 0;
 779                 }
 780         }
 781         error       = 0;
 782         total_size  = 0;
 783         total_resid = 0;
 784
 785         cbp        = cbp_head;
 786         vp         = cbp->b_vp;
 787         upl_offset = cbp->b_uploffset;
 788         upl        = cbp->b_upl;
 789         b_flags    = cbp->b_flags;
 790         real_bp    = cbp->b_real_bp;
 791         zero_offset= cbp->b_validend;
 792         iostate    = (struct clios *)cbp->b_iostate;
 793
 794         if (real_bp)
 795                 real_bp->b_dev = cbp->b_dev;
 796
 797         while (cbp) {
 798                 if ((cbp->b_flags & B_ERROR) && error == 0)
 799                         error = cbp->b_error;
 800
 801                 total_resid += cbp->b_resid;
 802                 total_size  += cbp->b_bcount;
 803
 804                 cbp_next = cbp->b_trans_next;
 805
 806                 if (cbp_next == NULL)
 807                         /*
 808                          * compute the overall size of the transaction
 809                          * in case we created one that has 'holes' in it
 810                          * 'total_size' represents the amount of I/O we
 811                          * did, not the span of the transaction w/r to the UPL
 812                          */
 813                         transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
 814
 815                 if (cbp != cbp_head)
 816                         free_io_buf(cbp);
 817
 818                 cbp = cbp_next;
 819         }
 820
 821         if (ISSET(b_flags, B_COMMIT_UPL)) {
 822                 cluster_handle_associated_upl(iostate,
 823                                                                           cbp_head->b_upl,
 824                                                                           upl_offset,
 825                                                                           transaction_size);
 826         }
 827
 828         if (error == 0 && total_resid)
 829                 error = EIO;
 830
 831         if (error == 0) {
 832                 int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
 833
 834                 if (cliodone_func != NULL) {
 835                         cbp_head->b_bcount = transaction_size;
 836
 837                         error = (*cliodone_func)(cbp_head, callback_arg);
 838                 }
 839         }
 840         if (zero_offset)
 841                 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 842
 843         free_io_buf(cbp_head);
 844
 845         if (iostate) {
 846                 int need_wakeup = 0;
 847
 848                 /*
 849                  * someone has issued multiple I/Os asynchrounsly
 850                  * and is waiting for them to complete (streaming)
 851                  */
 852                 lck_mtx_lock_spin(&iostate->io_mtxp);
 853
 854                 if (error && iostate->io_error == 0)
 855                         iostate->io_error = error;
 856
 857                 iostate->io_completed += total_size;
 858
 859                 if (iostate->io_wanted) {
 860                         /*
 861                          * someone is waiting for the state of
 862                          * this io stream to change
 863                          */
 864                         iostate->io_wanted = 0;
 865                         need_wakeup = 1;
 866                 }
 867                 lck_mtx_unlock(&iostate->io_mtxp);
 868
 869                 if (need_wakeup)
 870                         wakeup((caddr_t)&iostate->io_wanted);
 871         }
 872
 873         if (b_flags & B_COMMIT_UPL) {
 874                 pg_offset   = upl_offset & PAGE_MASK;
 875                 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 876
 877                 if (error)
 878                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
 879                 else {
 880                         upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
 881
 882                         if ((b_flags & B_PHYS) && (b_flags & B_READ))
 883                                 upl_flags |= UPL_COMMIT_SET_DIRTY;
 884
 885                         if (b_flags & B_AGE)
 886                                 upl_flags |= UPL_COMMIT_INACTIVATE;
 887
 888                         ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
 889                 }
 890         }
 891         if (real_bp) {
 892                 if (error) {
 893                         real_bp->b_flags |= B_ERROR;
 894                         real_bp->b_error = error;
 895                 }
 896                 real_bp->b_resid = total_resid;
 897
 898                 buf_biodone(real_bp);
 899         }
 900         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
 901                      upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
 902
 903         return (error);
 904 }
 905
 906
 907 uint32_t
 908 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
 909 {
 910         if (cluster_is_throttled(vp)) {
 911                 *limit = THROTTLE_MAX_IOSIZE;
 912                 return 1;
 913         }
 914         return 0;
 915 }
 916
 917
 918 void
 919 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
 920 {
 921
 922         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
 923                      upl_offset, size, bp, 0, 0);
 924
 925         if (bp == NULL || bp->b_datap == 0) {
 926                 upl_page_info_t *pl;
 927                 addr64_t        zero_addr;
 928
 929                 pl = ubc_upl_pageinfo(upl);
 930
 931                 if (upl_device_page(pl) == TRUE) {
 932                         zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
 933
 934                         bzero_phys_nc(zero_addr, size);
 935                 } else {
 936                         while (size) {
 937                                 int     page_offset;
 938                                 int     page_index;
 939                                 int     zero_cnt;
 940
 941                                 page_index  = upl_offset / PAGE_SIZE;
 942                                 page_offset = upl_offset & PAGE_MASK;
 943
 944                                 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
 945                                 zero_cnt  = min(PAGE_SIZE - page_offset, size);
 946
 947                                 bzero_phys(zero_addr, zero_cnt);
 948
 949                                 size       -= zero_cnt;
 950                                 upl_offset += zero_cnt;
 951                         }
 952                 }
 953         } else
 954                 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
 955
 956         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
 957                      upl_offset, size, 0, 0, 0);
 958 }
 959
 960
 961 static void
 962 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
 963 {
 964         cbp_head->b_validend = zero_offset;
 965         cbp_tail->b_flags |= B_EOT;
 966 }
 967
 968 static void
 969 cluster_wait_IO(buf_t cbp_head, int async)
 970 {
 971         buf_t   cbp;
 972
 973         if (async) {
 974                 /*
 975                  * async callback completion will not normally
 976                  * generate a wakeup upon I/O completion...
 977                  * by setting B_TWANTED, we will force a wakeup
 978                  * to occur as any outstanding I/Os complete...
 979                  * I/Os already completed will have B_TDONE already
 980                  * set and we won't cause us to block
 981                  * note that we're actually waiting for the bp to have
 982                  * completed the callback function... only then
 983                  * can we safely take back ownership of the bp
 984                  */
 985                 lck_mtx_lock_spin(cl_transaction_mtxp);
 986
 987                 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
 988                       cbp->b_flags |= B_TWANTED;
 989
 990                 lck_mtx_unlock(cl_transaction_mtxp);
 991         }
 992         for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
 993
 994                 if (async) {
 995                         while (!ISSET(cbp->b_flags, B_TDONE)) {
 996
 997                                 lck_mtx_lock_spin(cl_transaction_mtxp);
 998
 999                                 if (!ISSET(cbp->b_flags, B_TDONE)) {
1000                                         DTRACE_IO1(wait__start, buf_t, cbp);
1001                                         (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
1002                                         DTRACE_IO1(wait__done, buf_t, cbp);
1003                                 } else
1004                                         lck_mtx_unlock(cl_transaction_mtxp);
1005                         }
1006                 } else
1007                         buf_biowait(cbp);
1008         }
1009 }
1010
1011 static void
1012 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1013 {
1014         buf_t   cbp;
1015         int     error;
1016         boolean_t isswapout = FALSE;
1017
1018         /*
1019          * cluster_complete_transaction will
1020          * only be called if we've issued a complete chain in synchronous mode
1021          * or, we've already done a cluster_wait_IO on an incomplete chain
1022          */
1023         if (needwait) {
1024                 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
1025                         buf_biowait(cbp);
1026         }
1027         /*
1028          * we've already waited on all of the I/Os in this transaction,
1029          * so mark all of the buf_t's in this transaction as B_TDONE
1030          * so that cluster_iodone sees the transaction as completed
1031          */
1032         for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
1033                 cbp->b_flags |= B_TDONE;
1034         cbp = *cbp_head;
1035
1036         if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
1037                 isswapout = TRUE;
1038
1039         error = cluster_iodone(cbp, callback_arg);
1040
1041         if ( !(flags & CL_ASYNC) && error && *retval == 0) {
1042                 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
1043                         *retval = error;
1044                 else if (isswapout == TRUE)
1045                         *retval = error;
1046         }
1047         *cbp_head = (buf_t)NULL;
1048 }
1049
1050
1051 static int
1052 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1053            int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1054 {
1055         buf_t   cbp;
1056         u_int   size;
1057         u_int   io_size;
1058         int     io_flags;
1059         int     bmap_flags;
1060         int     error = 0;
1061         int     retval = 0;
1062         buf_t   cbp_head = NULL;
1063         buf_t   cbp_tail = NULL;
1064         int     trans_count = 0;
1065         int     max_trans_count;
1066         u_int   pg_count;
1067         int     pg_offset;
1068         u_int   max_iosize;
1069         u_int   max_vectors;
1070         int     priv;
1071         int     zero_offset = 0;
1072         int     async_throttle = 0;
1073         mount_t mp;
1074         vm_offset_t upl_end_offset;
1075         boolean_t   need_EOT = FALSE;
1076
1077         /*
1078          * we currently don't support buffers larger than a page
1079          */
1080         if (real_bp && non_rounded_size > PAGE_SIZE)
1081                 panic("%s(): Called with real buffer of size %d bytes which "
1082                                 "is greater than the maximum allowed size of "
1083                                 "%d bytes (the system PAGE_SIZE).\n",
1084                                 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1085
1086         mp = vp->v_mount;
1087
1088         /*
1089          * we don't want to do any funny rounding of the size for IO requests
1090          * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
1091          * belong to us... we can't extend (nor do we need to) the I/O to fill
1092          * out a page
1093          */
1094         if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1095                 /*
1096                  * round the requested size up so that this I/O ends on a
1097                  * page boundary in case this is a 'write'... if the filesystem
1098                  * has blocks allocated to back the page beyond the EOF, we want to
1099                  * make sure to write out the zero's that are sitting beyond the EOF
1100                  * so that in case the filesystem doesn't explicitly zero this area
1101                  * if a hole is created via a lseek/write beyond the current EOF,
1102                  * it will return zeros when it's read back from the disk.  If the
1103                  * physical allocation doesn't extend for the whole page, we'll
1104                  * only write/read from the disk up to the end of this allocation
1105                  * via the extent info returned from the VNOP_BLOCKMAP call.
1106                  */
1107                 pg_offset = upl_offset & PAGE_MASK;
1108
1109                 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1110         } else {
1111                 /*
1112                  * anyone advertising a blocksize of 1 byte probably
1113                  * can't deal with us rounding up the request size
1114                  * AFP is one such filesystem/device
1115                  */
1116                 size = non_rounded_size;
1117         }
1118         upl_end_offset = upl_offset + size;
1119
1120         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1121
1122         /*
1123          * Set the maximum transaction size to the maximum desired number of
1124          * buffers.
1125          */
1126         max_trans_count = 8;
1127         if (flags & CL_DEV_MEMORY)
1128                 max_trans_count = 16;
1129
1130         if (flags & CL_READ) {
1131                 io_flags = B_READ;
1132                 bmap_flags = VNODE_READ;
1133
1134                 max_iosize  = mp->mnt_maxreadcnt;
1135                 max_vectors = mp->mnt_segreadcnt;
1136         } else {
1137                 io_flags = B_WRITE;
1138                 bmap_flags = VNODE_WRITE;
1139
1140                 max_iosize  = mp->mnt_maxwritecnt;
1141                 max_vectors = mp->mnt_segwritecnt;
1142         }
1143         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1144
1145         /*
1146          * make sure the maximum iosize is a
1147          * multiple of the page size
1148          */
1149         max_iosize  &= ~PAGE_MASK;
1150
1151         /*
1152          * Ensure the maximum iosize is sensible.
1153          */
1154         if (!max_iosize)
1155                 max_iosize = PAGE_SIZE;
1156
1157         if (flags & CL_THROTTLE) {
1158                 if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1159                         if (max_iosize > THROTTLE_MAX_IOSIZE)
1160                                 max_iosize = THROTTLE_MAX_IOSIZE;
1161                         async_throttle = THROTTLE_MAXCNT;
1162                 } else {
1163                         if ( (flags & CL_DEV_MEMORY) )
1164                                 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1165                         else {
1166                                 u_int max_cluster;
1167                                 u_int max_cluster_size;
1168                                 u_int scale;
1169
1170                                 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1171
1172                                 if (max_iosize > max_cluster_size)
1173                                         max_cluster = max_cluster_size;
1174                                 else
1175                                         max_cluster = max_iosize;
1176
1177                                 if (size < max_cluster)
1178                                         max_cluster = size;
1179
1180                                 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
1181                                         scale = WRITE_THROTTLE_SSD;
1182                                 else
1183                                         scale = WRITE_THROTTLE;
1184
1185                                 if (flags & CL_CLOSE)
1186                                         scale += MAX_CLUSTERS;
1187
1188                                 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1189                         }
1190                 }
1191         }
1192         if (flags & CL_AGE)
1193                 io_flags |= B_AGE;
1194         if (flags & (CL_PAGEIN | CL_PAGEOUT))
1195                 io_flags |= B_PAGEIO;
1196         if (flags & (CL_IOSTREAMING))
1197                 io_flags |= B_IOSTREAMING;
1198         if (flags & CL_COMMIT)
1199                 io_flags |= B_COMMIT_UPL;
1200         if (flags & CL_DIRECT_IO)
1201                 io_flags |= B_PHYS;
1202         if (flags & (CL_PRESERVE | CL_KEEPCACHED))
1203                 io_flags |= B_CACHE;
1204         if (flags & CL_PASSIVE)
1205                 io_flags |= B_PASSIVE;
1206         if (flags & CL_ENCRYPTED)
1207                 io_flags |= B_ENCRYPTED_IO;
1208
1209         if (vp->v_flag & VSYSTEM)
1210                 io_flags |= B_META;
1211
1212         if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1213                 /*
1214                  * then we are going to end up
1215                  * with a page that we can't complete (the file size wasn't a multiple
1216                  * of PAGE_SIZE and we're trying to read to the end of the file
1217                  * so we'll go ahead and zero out the portion of the page we can't
1218                  * read in from the file
1219                  */
1220                 zero_offset = upl_offset + non_rounded_size;
1221         } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1222                 assert(ISSET(flags, CL_COMMIT));
1223
1224                 // For a direct/uncached write, we need to lock pages...
1225
1226                 upl_t cached_upl;
1227
1228                 /*
1229                  * Create a UPL to lock the pages in the cache whilst the
1230                  * write is in progress.
1231                  */
1232                 ubc_create_upl(vp, f_offset, non_rounded_size, &cached_upl,
1233                                            NULL, UPL_SET_LITE);
1234
1235                 /*
1236                  * Attach this UPL to the other UPL so that we can find it
1237                  * later.
1238                  */
1239                 upl_set_associated_upl(upl, cached_upl);
1240
1241                 if (upl_offset & PAGE_MASK) {
1242                         /*
1243                          * The two UPLs are not aligned, so mark the first page in
1244                          * @upl so that cluster_handle_associated_upl can handle
1245                          * it accordingly.
1246                          */
1247                         upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1248                         upl_page_set_mark(pl, 0, true);
1249                 }
1250         }
1251
1252         while (size) {
1253                 daddr64_t blkno;
1254                 daddr64_t lblkno;
1255                 u_int   io_size_wanted;
1256                 size_t  io_size_tmp;
1257
1258                 if (size > max_iosize)
1259                         io_size = max_iosize;
1260                 else
1261                         io_size = size;
1262
1263                 io_size_wanted = io_size;
1264                 io_size_tmp = (size_t)io_size;
1265
1266                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
1267                         break;
1268
1269                 if (io_size_tmp > io_size_wanted)
1270                         io_size = io_size_wanted;
1271                 else
1272                         io_size = (u_int)io_size_tmp;
1273
1274                 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
1275                         real_bp->b_blkno = blkno;
1276
1277                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1278                              (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
1279
1280                 if (io_size == 0) {
1281                         /*
1282                          * vnop_blockmap didn't return an error... however, it did
1283                          * return an extent size of 0 which means we can't
1284                          * make forward progress on this I/O... a hole in the
1285                          * file would be returned as a blkno of -1 with a non-zero io_size
1286                          * a real extent is returned with a blkno != -1 and a non-zero io_size
1287                          */
1288                         error = EINVAL;
1289                         break;
1290                 }
1291                 if ( !(flags & CL_READ) && blkno == -1) {
1292                         off_t   e_offset;
1293                         int     pageout_flags;
1294
1295                         if (upl_get_internal_vectorupl(upl))
1296                                 panic("Vector UPLs should not take this code-path\n");
1297                         /*
1298                          * we're writing into a 'hole'
1299                          */
1300                         if (flags & CL_PAGEOUT) {
1301                                 /*
1302                                  * if we got here via cluster_pageout
1303                                  * then just error the request and return
1304                                  * the 'hole' should already have been covered
1305                                  */
1306                                 error = EINVAL;
1307                                 break;
1308                         }
1309                         /*
1310                          * we can get here if the cluster code happens to
1311                          * pick up a page that was dirtied via mmap vs
1312                          * a 'write' and the page targets a 'hole'...
1313                          * i.e. the writes to the cluster were sparse
1314                          * and the file was being written for the first time
1315                          *
1316                          * we can also get here if the filesystem supports
1317                          * 'holes' that are less than PAGE_SIZE.... because
1318                          * we can't know if the range in the page that covers
1319                          * the 'hole' has been dirtied via an mmap or not,
1320                          * we have to assume the worst and try to push the
1321                          * entire page to storage.
1322                          *
1323                          * Try paging out the page individually before
1324                          * giving up entirely and dumping it (the pageout
1325                          * path will insure that the zero extent accounting
1326                          * has been taken care of before we get back into cluster_io)
1327                          *
1328                          * go direct to vnode_pageout so that we don't have to
1329                          * unbusy the page from the UPL... we used to do this
1330                          * so that we could call ubc_msync, but that results
1331                          * in a potential deadlock if someone else races us to acquire
1332                          * that page and wins and in addition needs one of the pages
1333                          * we're continuing to hold in the UPL
1334                          */
1335                         pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1336
1337                         if ( !(flags & CL_ASYNC))
1338                                 pageout_flags |= UPL_IOSYNC;
1339                         if ( !(flags & CL_COMMIT))
1340                                 pageout_flags |= UPL_NOCOMMIT;
1341
1342                         if (cbp_head) {
1343                                 buf_t last_cbp;
1344
1345                                 /*
1346                                  * first we have to wait for the the current outstanding I/Os
1347                                  * to complete... EOT hasn't been set yet on this transaction
1348                                  * so the pages won't be released just because all of the current
1349                                  * I/O linked to this transaction has completed...
1350                                  */
1351                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1352
1353                                 /*
1354                                  * we've got a transcation that
1355                                  * includes the page we're about to push out through vnode_pageout...
1356                                  * find the last bp in the list which will be the one that
1357                                  * includes the head of this page and round it's iosize down
1358                                  * to a page boundary...
1359                                  */
1360                                 for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
1361                                         last_cbp = cbp;
1362
1363                                 cbp->b_bcount &= ~PAGE_MASK;
1364
1365                                 if (cbp->b_bcount == 0) {
1366                                         /*
1367                                          * this buf no longer has any I/O associated with it
1368                                          */
1369                                         free_io_buf(cbp);
1370
1371                                         if (cbp == cbp_head) {
1372                                                 /*
1373                                                  * the buf we just freed was the only buf in
1374                                                  * this transaction... so there's no I/O to do
1375                                                  */
1376                                                 cbp_head = NULL;
1377                                         } else {
1378                                                 /*
1379                                                  * remove the buf we just freed from
1380                                                  * the transaction list
1381                                                  */
1382                                                 last_cbp->b_trans_next = NULL;
1383                                                 cbp_tail = last_cbp;
1384                                         }
1385                                 }
1386                                 if (cbp_head) {
1387                                         /*
1388                                          * there was more to the current transaction
1389                                          * than just the page we are pushing out via vnode_pageout...
1390                                          * mark it as finished and complete it... we've already
1391                                          * waited for the I/Os to complete above in the call to cluster_wait_IO
1392                                          */
1393                                         cluster_EOT(cbp_head, cbp_tail, 0);
1394
1395                                         cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1396
1397                                         trans_count = 0;
1398                                 }
1399                         }
1400                         if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1401                                 error = EINVAL;
1402                         }
1403                         e_offset = round_page_64(f_offset + 1);
1404                         io_size = e_offset - f_offset;
1405
1406                         f_offset   += io_size;
1407                         upl_offset += io_size;
1408
1409                         if (size >= io_size)
1410                                 size -= io_size;
1411                         else
1412                                 size = 0;
1413                         /*
1414                          * keep track of how much of the original request
1415                          * that we've actually completed... non_rounded_size
1416                          * may go negative due to us rounding the request
1417                          * to a page size multiple (i.e.  size > non_rounded_size)
1418                          */
1419                         non_rounded_size -= io_size;
1420
1421                         if (non_rounded_size <= 0) {
1422                                 /*
1423                                  * we've transferred all of the data in the original
1424                                  * request, but we were unable to complete the tail
1425                                  * of the last page because the file didn't have
1426                                  * an allocation to back that portion... this is ok.
1427                                  */
1428                                 size = 0;
1429                         }
1430                         if (error) {
1431                                 if (size == 0)
1432                                         flags &= ~CL_COMMIT;
1433                                 break;
1434                         }
1435                         continue;
1436                 }
1437                 lblkno = (daddr64_t)(f_offset / 0x1000);
1438                 /*
1439                  * we have now figured out how much I/O we can do - this is in 'io_size'
1440                  * pg_offset is the starting point in the first page for the I/O
1441                  * pg_count is the number of full and partial pages that 'io_size' encompasses
1442                  */
1443                 pg_offset = upl_offset & PAGE_MASK;
1444
1445                 if (flags & CL_DEV_MEMORY) {
1446                         /*
1447                          * treat physical requests as one 'giant' page
1448                          */
1449                         pg_count = 1;
1450                 } else
1451                         pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1452
1453                 if ((flags & CL_READ) && blkno == -1) {
1454                         vm_offset_t  commit_offset;
1455                         int bytes_to_zero;
1456                         int complete_transaction_now = 0;
1457
1458                         /*
1459                          * if we're reading and blkno == -1, then we've got a
1460                          * 'hole' in the file that we need to deal with by zeroing
1461                          * out the affected area in the upl
1462                          */
1463                         if (io_size >= (u_int)non_rounded_size) {
1464                                 /*
1465                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1466                                  * than 'zero_offset' will be non-zero
1467                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1468                                  * (indicated by the io_size finishing off the I/O request for this UPL)
1469                                  * than we're not going to issue an I/O for the
1470                                  * last page in this upl... we need to zero both the hole and the tail
1471                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in
1472                                  */
1473                                 bytes_to_zero = non_rounded_size;
1474                                 if (!(flags & CL_NOZERO))
1475                                         bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1476
1477                                 zero_offset = 0;
1478                         } else
1479                                 bytes_to_zero = io_size;
1480
1481                         pg_count = 0;
1482
1483                         cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1484
1485                         if (cbp_head) {
1486                                 int     pg_resid;
1487
1488                                 /*
1489                                  * if there is a current I/O chain pending
1490                                  * then the first page of the group we just zero'd
1491                                  * will be handled by the I/O completion if the zero
1492                                  * fill started in the middle of the page
1493                                  */
1494                                 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1495
1496                                 pg_resid = commit_offset - upl_offset;
1497
1498                                 if (bytes_to_zero >= pg_resid) {
1499                                         /*
1500                                          * the last page of the current I/O
1501                                          * has been completed...
1502                                          * compute the number of fully zero'd
1503                                          * pages that are beyond it
1504                                          * plus the last page if its partial
1505                                          * and we have no more I/O to issue...
1506                                          * otherwise a partial page is left
1507                                          * to begin the next I/O
1508                                          */
1509                                         if ((int)io_size >= non_rounded_size)
1510                                                 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1511                                         else
1512                                                 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1513
1514                                         complete_transaction_now = 1;
1515                                 }
1516                         } else {
1517                                 /*
1518                                  * no pending I/O to deal with
1519                                  * so, commit all of the fully zero'd pages
1520                                  * plus the last page if its partial
1521                                  * and we have no more I/O to issue...
1522                                  * otherwise a partial page is left
1523                                  * to begin the next I/O
1524                                  */
1525                                 if ((int)io_size >= non_rounded_size)
1526                                         pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1527                                 else
1528                                         pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1529
1530                                 commit_offset = upl_offset & ~PAGE_MASK;
1531                         }
1532
1533                         // Associated UPL is currently only used in the direct write path
1534                         assert(!upl_associated_upl(upl));
1535
1536                         if ( (flags & CL_COMMIT) && pg_count) {
1537                                 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1538                                                      UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1539                         }
1540                         upl_offset += io_size;
1541                         f_offset   += io_size;
1542                         size       -= io_size;
1543
1544                         /*
1545                          * keep track of how much of the original request
1546                          * that we've actually completed... non_rounded_size
1547                          * may go negative due to us rounding the request
1548                          * to a page size multiple (i.e.  size > non_rounded_size)
1549                          */
1550                         non_rounded_size -= io_size;
1551
1552                         if (non_rounded_size <= 0) {
1553                                 /*
1554                                  * we've transferred all of the data in the original
1555                                  * request, but we were unable to complete the tail
1556                                  * of the last page because the file didn't have
1557                                  * an allocation to back that portion... this is ok.
1558                                  */
1559                                 size = 0;
1560                         }
1561                         if (cbp_head && (complete_transaction_now || size == 0))  {
1562                                 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1563
1564                                 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1565
1566                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1567
1568                                 trans_count = 0;
1569                         }
1570                         continue;
1571                 }
1572                 if (pg_count > max_vectors) {
1573                         if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1574                                 io_size = PAGE_SIZE - pg_offset;
1575                                 pg_count = 1;
1576                         } else {
1577                                 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1578                                 pg_count = max_vectors;
1579                         }
1580                 }
1581                 /*
1582                  * If the transaction is going to reach the maximum number of
1583                  * desired elements, truncate the i/o to the nearest page so
1584                  * that the actual i/o is initiated after this buffer is
1585                  * created and added to the i/o chain.
1586                  *
1587                  * I/O directed to physically contiguous memory
1588                  * doesn't have a requirement to make sure we 'fill' a page
1589                  */
1590                 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1591                                 ((upl_offset + io_size) & PAGE_MASK)) {
1592                         vm_offset_t aligned_ofs;
1593
1594                         aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1595                         /*
1596                          * If the io_size does not actually finish off even a
1597                          * single page we have to keep adding buffers to the
1598                          * transaction despite having reached the desired limit.
1599                          *
1600                          * Eventually we get here with the page being finished
1601                          * off (and exceeded) and then we truncate the size of
1602                          * this i/o request so that it is page aligned so that
1603                          * we can finally issue the i/o on the transaction.
1604                          */
1605                         if (aligned_ofs > upl_offset) {
1606                                 io_size = aligned_ofs - upl_offset;
1607                                 pg_count--;
1608                         }
1609                 }
1610
1611                 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
1612                         /*
1613                          * if we're not targeting a virtual device i.e. a disk image
1614                          * it's safe to dip into the reserve pool since real devices
1615                          * can complete this I/O request without requiring additional
1616                          * bufs from the alloc_io_buf pool
1617                          */
1618                         priv = 1;
1619                 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
1620                         /*
1621                          * Throttle the speculative IO
1622                          */
1623                         priv = 0;
1624                 else
1625                         priv = 1;
1626
1627                 cbp = alloc_io_buf(vp, priv);
1628
1629                 if (flags & CL_PAGEOUT) {
1630                         u_int i;
1631
1632                         /*
1633                          * since blocks are in offsets of 0x1000, scale
1634                          * iteration to (PAGE_SIZE * pg_count) of blks.
1635                          */
1636                         for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
1637                                 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
1638                                         panic("BUSY bp found in cluster_io");
1639                         }
1640                 }
1641                 if (flags & CL_ASYNC) {
1642                         if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
1643                                 panic("buf_setcallback failed\n");
1644                 }
1645                 cbp->b_cliodone = (void *)callback;
1646                 cbp->b_flags |= io_flags;
1647                 if (flags & CL_NOCACHE)
1648                         cbp->b_attr.ba_flags |= BA_NOCACHE;
1649
1650                 cbp->b_lblkno = lblkno;
1651                 cbp->b_blkno  = blkno;
1652                 cbp->b_bcount = io_size;
1653
1654                 if (buf_setupl(cbp, upl, upl_offset))
1655                         panic("buf_setupl failed\n");
1656 #if CONFIG_IOSCHED
1657                 upl_set_blkno(upl, upl_offset, io_size, blkno);
1658 #endif
1659                 cbp->b_trans_next = (buf_t)NULL;
1660
1661                 if ((cbp->b_iostate = (void *)iostate))
1662                         /*
1663                          * caller wants to track the state of this
1664                          * io... bump the amount issued against this stream
1665                          */
1666                         iostate->io_issued += io_size;
1667
1668                 if (flags & CL_READ) {
1669                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1670                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1671                 }
1672                 else {
1673                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1674                                      (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1675                 }
1676
1677                 if (cbp_head) {
1678                         cbp_tail->b_trans_next = cbp;
1679                         cbp_tail = cbp;
1680                 } else {
1681                         cbp_head = cbp;
1682                         cbp_tail = cbp;
1683
1684                         if ( (cbp_head->b_real_bp = real_bp) )
1685                                 real_bp = (buf_t)NULL;
1686                 }
1687                 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1688
1689                 trans_count++;
1690
1691                 upl_offset += io_size;
1692                 f_offset   += io_size;
1693                 size       -= io_size;
1694                 /*
1695                  * keep track of how much of the original request
1696                  * that we've actually completed... non_rounded_size
1697                  * may go negative due to us rounding the request
1698                  * to a page size multiple (i.e.  size > non_rounded_size)
1699                  */
1700                 non_rounded_size -= io_size;
1701
1702                 if (non_rounded_size <= 0) {
1703                         /*
1704                          * we've transferred all of the data in the original
1705                          * request, but we were unable to complete the tail
1706                          * of the last page because the file didn't have
1707                          * an allocation to back that portion... this is ok.
1708                          */
1709                         size = 0;
1710                 }
1711                 if (size == 0) {
1712                         /*
1713                          * we have no more I/O to issue, so go
1714                          * finish the final transaction
1715                          */
1716                         need_EOT = TRUE;
1717                 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1718                             ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
1719                         /*
1720                          * I/O directed to physically contiguous memory...
1721                          * which doesn't have a requirement to make sure we 'fill' a page
1722                          * or...
1723                          * the current I/O we've prepared fully
1724                          * completes the last page in this request
1725                          * and ...
1726                          * it's either an ASYNC request or
1727                          * we've already accumulated more than 8 I/O's into
1728                          * this transaction so mark it as complete so that
1729                          * it can finish asynchronously or via the cluster_complete_transaction
1730                          * below if the request is synchronous
1731                          */
1732                         need_EOT = TRUE;
1733                 }
1734                 if (need_EOT == TRUE)
1735                         cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1736
1737                 if (flags & CL_THROTTLE)
1738                         (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1739
1740                 if ( !(io_flags & B_READ))
1741                         vnode_startwrite(vp);
1742
1743                 if (flags & CL_RAW_ENCRYPTED) {
1744                         /*
1745                          * User requested raw encrypted bytes.
1746                          * Twiddle the bit in the ba_flags for the buffer
1747                          */
1748                         cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1749                 }
1750
1751                 (void) VNOP_STRATEGY(cbp);
1752
1753                 if (need_EOT == TRUE) {
1754                         if ( !(flags & CL_ASYNC))
1755                                 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1756
1757                         need_EOT = FALSE;
1758                         trans_count = 0;
1759                         cbp_head = NULL;
1760                 }
1761         }
1762         if (error) {
1763                 int abort_size;
1764
1765                 io_size = 0;
1766
1767                 if (cbp_head) {
1768                         /*
1769                          * Wait until all of the outstanding I/O
1770                          * for this partial transaction has completed
1771                          */
1772                         cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1773
1774                         /*
1775                          * Rewind the upl offset to the beginning of the
1776                          * transaction.
1777                          */
1778                         upl_offset = cbp_head->b_uploffset;
1779                 }
1780
1781                 if (ISSET(flags, CL_COMMIT)) {
1782                         cluster_handle_associated_upl(iostate, upl, upl_offset,
1783                                                                                   upl_end_offset - upl_offset);
1784                 }
1785
1786                 // Free all the IO buffers in this transaction
1787                 for (cbp = cbp_head; cbp;) {
1788                         buf_t   cbp_next;
1789
1790                         size       += cbp->b_bcount;
1791                         io_size    += cbp->b_bcount;
1792
1793                         cbp_next = cbp->b_trans_next;
1794                         free_io_buf(cbp);
1795                         cbp = cbp_next;
1796                 }
1797
1798                 if (iostate) {
1799                         int need_wakeup = 0;
1800
1801                         /*
1802                          * update the error condition for this stream
1803                          * since we never really issued the io
1804                          * just go ahead and adjust it back
1805                          */
1806                         lck_mtx_lock_spin(&iostate->io_mtxp);
1807
1808                         if (iostate->io_error == 0)
1809                                 iostate->io_error = error;
1810                         iostate->io_issued -= io_size;
1811
1812                         if (iostate->io_wanted) {
1813                                 /*
1814                                  * someone is waiting for the state of
1815                                  * this io stream to change
1816                                  */
1817                                 iostate->io_wanted = 0;
1818                                 need_wakeup = 1;
1819                         }
1820                         lck_mtx_unlock(&iostate->io_mtxp);
1821
1822                         if (need_wakeup)
1823                                 wakeup((caddr_t)&iostate->io_wanted);
1824                 }
1825
1826                 if (flags & CL_COMMIT) {
1827                         int     upl_flags;
1828
1829                         pg_offset  = upl_offset & PAGE_MASK;
1830                         abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1831
1832                         upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
1833
1834                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1835                                      upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1836                 }
1837                 if (retval == 0)
1838                         retval = error;
1839         } else if (cbp_head)
1840                         panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1841
1842         if (real_bp) {
1843                 /*
1844                  * can get here if we either encountered an error
1845                  * or we completely zero-filled the request and
1846                  * no I/O was issued
1847                  */
1848                 if (error) {
1849                         real_bp->b_flags |= B_ERROR;
1850                         real_bp->b_error = error;
1851                 }
1852                 buf_biodone(real_bp);
1853         }
1854         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1855
1856         return (retval);
1857 }
1858
/*
 * Zero the five vector-UPL run variables that must be in scope wherever
 * this macro is expanded.  The expansion is a single expression statement
 * that already carries its own terminating ';'.
 */
#define reset_vector_run_state()                                                \
        issueVectorUPL = 0, vector_upl_offset = 0, vector_upl_index = 0,        \
        vector_upl_iosize = 0, vector_upl_size = 0;
1861
1862 static int
1863 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1864            int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1865 {
1866         vector_upl_set_pagelist(vector_upl);
1867
1868         if(io_flag & CL_READ) {
1869                 if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
1870                         io_flag &= ~CL_PRESERVE; /*don't zero fill*/
1871                 else
1872                         io_flag |= CL_PRESERVE; /*zero fill*/
1873         }
1874         return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
1875
1876 }
1877
1878 static int
1879 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1880 {
1881         int           pages_in_prefetch;
1882
1883         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1884                      (int)f_offset, size, (int)filesize, 0, 0);
1885
1886         if (f_offset >= filesize) {
1887                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1888                              (int)f_offset, 0, 0, 0, 0);
1889                 return(0);
1890         }
1891         if ((off_t)size > (filesize - f_offset))
1892                 size = filesize - f_offset;
1893         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1894
1895         advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1896
1897         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1898                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1899
1900         return (pages_in_prefetch);
1901 }
1902
1903
1904
1905 static void
1906 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1907                    int bflag)
1908 {
1909         daddr64_t       r_addr;
1910         off_t           f_offset;
1911         int             size_of_prefetch;
1912         u_int           max_prefetch;
1913
1914
1915         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1916                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1917
1918         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1919                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1920                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1921                 return;
1922         }
1923         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1924                 rap->cl_ralen = 0;
1925                 rap->cl_maxra = 0;
1926
1927                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1928                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1929
1930                 return;
1931         }
1932         max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));
1933
1934         if (max_prefetch > speculative_prefetch_max)
1935                 max_prefetch = speculative_prefetch_max;
1936
1937         if (max_prefetch <= PAGE_SIZE) {
1938                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1939                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
1940                 return;
1941         }
1942         if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
1943                 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
1944
1945                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1946                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1947                         return;
1948                 }
1949         }
1950         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1951         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1952
1953         size_of_prefetch = 0;
1954
1955         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1956
1957         if (size_of_prefetch) {
1958                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1959                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1960                 return;
1961         }
1962         if (f_offset < filesize) {
1963                 daddr64_t read_size;
1964
1965                 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
1966
1967                 read_size = (extent->e_addr + 1) - extent->b_addr;
1968
1969                 if (read_size > rap->cl_ralen) {
1970                         if (read_size > max_prefetch / PAGE_SIZE)
1971                                 rap->cl_ralen = max_prefetch / PAGE_SIZE;
1972                         else
1973                                 rap->cl_ralen = read_size;
1974                 }
1975                 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
1976
1977                 if (size_of_prefetch)
1978                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1979         }
1980         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1981                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1982 }
1983
1984
1985 int
1986 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1987                 int size, off_t filesize, int flags)
1988 {
1989         return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1990
1991 }
1992
1993
1994 int
1995 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1996                 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1997 {
1998         int           io_size;
1999         int           rounded_size;
2000         off_t         max_size;
2001         int           local_flags;
2002
2003         local_flags = CL_PAGEOUT | CL_THROTTLE;
2004
2005         if ((flags & UPL_IOSYNC) == 0)
2006                 local_flags |= CL_ASYNC;
2007         if ((flags & UPL_NOCOMMIT) == 0)
2008                 local_flags |= CL_COMMIT;
2009         if ((flags & UPL_KEEPCACHED))
2010                 local_flags |= CL_KEEPCACHED;
2011         if (flags & UPL_PAGING_ENCRYPTED)
2012                 local_flags |= CL_ENCRYPTED;
2013
2014
2015         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2016                      (int)f_offset, size, (int)filesize, local_flags, 0);
2017
2018         /*
2019          * If they didn't specify any I/O, then we are done...
2020          * we can't issue an abort because we don't know how
2021          * big the upl really is
2022          */
2023         if (size <= 0)
2024                 return (EINVAL);
2025
2026         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2027                 if (local_flags & CL_COMMIT)
2028                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2029                 return (EROFS);
2030         }
2031         /*
2032          * can't page-in from a negative offset
2033          * or if we're starting beyond the EOF
2034          * or if the file offset isn't page aligned
2035          * or the size requested isn't a multiple of PAGE_SIZE
2036          */
2037         if (f_offset < 0 || f_offset >= filesize ||
2038            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2039                 if (local_flags & CL_COMMIT)
2040                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2041                 return (EINVAL);
2042         }
2043         max_size = filesize - f_offset;
2044
2045         if (size < max_size)
2046                 io_size = size;
2047         else
2048                 io_size = max_size;
2049
2050         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2051
2052         if (size > rounded_size) {
2053                 if (local_flags & CL_COMMIT)
2054                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2055                                         UPL_ABORT_FREE_ON_EMPTY);
2056         }
2057         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
2058                            local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
2059 }
2060
2061
2062 int
2063 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2064                int size, off_t filesize, int flags)
2065 {
2066         return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2067 }
2068
2069
2070 int
2071 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2072                int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2073 {
2074         u_int         io_size;
2075         int           rounded_size;
2076         off_t         max_size;
2077         int           retval;
2078         int           local_flags = 0;
2079
2080         if (upl == NULL || size < 0)
2081                 panic("cluster_pagein: NULL upl passed in");
2082
2083         if ((flags & UPL_IOSYNC) == 0)
2084                 local_flags |= CL_ASYNC;
2085         if ((flags & UPL_NOCOMMIT) == 0)
2086                 local_flags |= CL_COMMIT;
2087         if (flags & UPL_IOSTREAMING)
2088                 local_flags |= CL_IOSTREAMING;
2089         if (flags & UPL_PAGING_ENCRYPTED)
2090                 local_flags |= CL_ENCRYPTED;
2091
2092
2093         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2094                      (int)f_offset, size, (int)filesize, local_flags, 0);
2095
2096         /*
2097          * can't page-in from a negative offset
2098          * or if we're starting beyond the EOF
2099          * or if the file offset isn't page aligned
2100          * or the size requested isn't a multiple of PAGE_SIZE
2101          */
2102         if (f_offset < 0 || f_offset >= filesize ||
2103            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2104                 if (local_flags & CL_COMMIT)
2105                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2106                 return (EINVAL);
2107         }
2108         max_size = filesize - f_offset;
2109
2110         if (size < max_size)
2111                 io_size = size;
2112         else
2113                 io_size = max_size;
2114
2115         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2116
2117         if (size > rounded_size && (local_flags & CL_COMMIT))
2118                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2119                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2120
2121         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2122                             local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2123
2124         return (retval);
2125 }
2126
2127
/*
 * cluster_bp
 *
 * Convenience entry point for callers that have no I/O callback to
 * supply.  Forwards 'bp' to cluster_bp_ext with NULL for both the
 * callback and the callback argument and returns its result.
 */
2128 int
2129 cluster_bp(buf_t bp)
2130 {
2131        return cluster_bp_ext(bp, NULL, NULL);
2132 }
2133
2134
2135 int
2136 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2137 {
2138         off_t  f_offset;
2139         int    flags;
2140
2141         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2142                      bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2143
2144         if (bp->b_flags & B_READ)
2145                 flags = CL_ASYNC | CL_READ;
2146         else
2147                 flags = CL_ASYNC;
2148         if (bp->b_flags & B_PASSIVE)
2149                 flags |= CL_PASSIVE;
2150
2151         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2152
2153         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
2154 }
2155
2156
2157
/*
 * cluster_write
 *
 * Convenience entry point for callers that have no I/O callback to
 * supply.  Every argument is forwarded unchanged to cluster_write_ext,
 * with NULL passed for both the callback and the callback argument.
 * The return value is whatever cluster_write_ext returns.
 */
2158 int
2159 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2160 {
2161         return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2162 }
2163
2164
/*
 * cluster_write_ext
 *
 * Top level dispatcher for cluster writes.  Chooses between the cached
 * copy path (cluster_write_copy), the physically contiguous path
 * (cluster_write_contig) and the direct path (cluster_write_direct) and
 * keeps calling the chosen routine until the uio is drained, the write
 * offset reaches newEOF, or an error is returned.
 *
 *   uio == NULL   no user data: a single cluster_write_copy call is made
 *                 to zero-fill and its result is returned
 *   xflags        IO_* flags from the caller; IO_NOCACHE is added here
 *                 when the vnode has VNOCACHE_DATA set
 *   headOff,
 *   tailOff       zero-fill bounds handed to cluster_write_copy
 *   callback,
 *   callback_arg  passed through to the routine that does the I/O
 *
 * The write type starts out as IO_COPY.  cluster_io_type is consulted
 * only when IO_NOCACHE is set without IO_NODIRECT and the uio targets
 * user space; the direct and contig routines may change the type again
 * through the 'write_type' pointer they are given.
 *
 * Returns 0 on success or the first non-zero value returned by one of
 * the routines called.
 */
2165 int
2166 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2167                   int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2168 {
2169         user_ssize_t    cur_resid;
2170         int             retval = 0;
2171         int             flags;
2172         int             zflags;
2173         int             bflag;
2174         int             write_type = IO_COPY;
2175         u_int32_t       write_length;
2176
2177         flags = xflags;
2178
2179         if (flags & IO_PASSIVE)
2180                 bflag = CL_PASSIVE;
2181         else
2182                 bflag = 0;
2183
2184         if (vp->v_flag & VNOCACHE_DATA){
2185                 flags |= IO_NOCACHE;
2186                 bflag |= CL_NOCACHE;
2187         }
2188         if (uio == NULL) {
2189                 /*
2190                  * no user data...
2191                  * this call is being made to zero-fill some range in the file
2192                  */
2193                 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2194
2195                 return(retval);
2196         }
2197         /*
2198          * do a write through the cache if one of the following is true....
2199          *   NOCACHE is not true or NODIRECT is true
2200          *   the uio request doesn't target USERSPACE
2201          * otherwise, find out if we want the direct or contig variant for
2202          * the first vector in the uio request
2203          */
2204         if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
2205                 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2206
2207         if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
2208                 /*
2209                  * must go through the cached variant in this case
2210                  */
2211                 write_type = IO_COPY;
2212
/*
 * 'flags' is deliberately modified inside the loop: the zero-fill bits
 * are stripped once they have been acted on so that later iterations
 * do not repeat the work.  'zflags' is the per-call view of 'flags'.
 */
2213         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2214
2215                 switch (write_type) {
2216
2217                 case IO_COPY:
2218                         /*
2219                          * make sure the uio_resid isn't too big...
2220                          * internally, we want to handle all of the I/O in
2221                          * chunk sizes that fit in a 32 bit int
2222                          */
2223                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2224                                 /*
2225                                  * we're going to have to call cluster_write_copy
2226                                  * more than once...
2227                                  *
2228                                  * only want the last call to cluster_write_copy to
2229                                  * have the IO_TAILZEROFILL flag set and only the
2230                                  * first call should have IO_HEADZEROFILL
2231                                  */
2232                                 zflags = flags & ~IO_TAILZEROFILL;
2233                                 flags &= ~IO_HEADZEROFILL;
2234
2235                                 write_length = MAX_IO_REQUEST_SIZE;
2236                         } else {
2237                                 /*
2238                                  * last call to cluster_write_copy
2239                                  */
2240                                 zflags = flags;
2241
2242                                 write_length = (u_int32_t)cur_resid;
2243                         }
2244                         retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2245                         break;
2246
2247                 case IO_CONTIG:
2248                         zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2249
2250                         if (flags & IO_HEADZEROFILL) {
2251                                 /*
2252                                  * only do this once per request
2253                                  */
2254                                 flags &= ~IO_HEADZEROFILL;
2255
2256                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2257                                                             headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2258                                 if (retval)
2259                                         break;
2260                         }
2261                         retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2262
2263                         if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2264                                 /*
2265                                  * we're done with the data from the user specified buffer(s)
2266                                  * and we've been requested to zero fill at the tail
2267                                  * treat this as an IO_HEADZEROFILL which doesn't require a uio
2268                                  * by rearranging the args and passing in IO_HEADZEROFILL
2269                                  */
2270                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2271                                                             (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2272                         }
2273                         break;
2274
2275                 case IO_DIRECT:
2276                         /*
2277                          * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2278                          */
2279                         retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2280                         break;
2281
2282                 case IO_UNKNOWN:
/* the previous routine could not decide what comes next: re-classify the current vector */
2283                         retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2284                         break;
2285                 }
2286                 /*
2287                  * in case we end up calling cluster_write_copy (from cluster_write_direct)
2288                  * multiple times to service a multi-vector request that is not aligned properly
2289                  * we need to update the oldEOF so that we
2290                  * don't zero-fill the head of a page if we've successfully written
2291                  * data to that area... 'cluster_write_copy' will zero-fill the head of a
2292                  * page that is beyond the oldEOF if the write is unaligned... we only
2293                  * want that to happen for the very first page of the cluster_write,
2294                  * NOT the first page of each vector making up a multi-vector write.
2295                  */
2296                 if (uio->uio_offset > oldEOF)
2297                         oldEOF = uio->uio_offset;
2298         }
2299         return (retval);
2300 }
2301
2302
/*
 * cluster_write_direct
 *
 * Write the data of the current uio vector straight from the caller's
 * memory, without copying it through the cache.
 *
 * For each chunk a UPL is built over the source memory with
 * vm_map_get_upl and the write is issued asynchronously: through
 * cluster_io when the uio has a single vector, or collected into a
 * vector UPL and issued through vector_cluster_io when it has more than
 * one.  All I/O issued here is tracked in a local 'iostate' and is waited
 * for before the routine returns.
 *
 *   oldEOF, newEOF  passed on to cluster_write_copy for any remainder;
 *                   oldEOF is advanced locally as data is written
 *   write_type,
 *   write_length    in: length of the current request (*write_length)
 *                   out: refreshed by cluster_io_type once a vector has
 *                   been fully consumed; *write_type is set to
 *                   IO_UNKNOWN after a fall back to the copy path
 *   flags           IO_PASSIVE, IO_NOCACHE, IO_SKIP_ENCRYPTION and
 *                   IO_RETURN_ON_THROTTLE are examined here; the full
 *                   value is passed to cluster_write_copy on fall back
 *
 * The direct path is abandoned (goto wait_for_dwrites) when the file
 * offset or source address is misaligned, when the UPL cannot be
 * obtained or comes back without all of its pages valid, when an earlier
 * write reported an error, or when the caller asked to be told about
 * throttling.  Whatever is left in io_req_size is then sent through
 * cluster_write_copy, provided no error is pending.
 *
 * Returns 0 on success, the error from the I/O path, or EAGAIN when the
 * write was cut short because of throttling and no other error occurred.
 *
 * NOTE(review): reset_vector_run_state() is not defined in this part of
 * the file; presumably it clears the vector_upl* bookkeeping locals and
 * issueVectorUPL -- confirm against its definition.
 */
2303 static int
2304 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2305                      int flags, int (*callback)(buf_t, void *), void *callback_arg)
2306 {
2307         upl_t            upl;
2308         upl_page_info_t  *pl;
2309         vm_offset_t      upl_offset;
2310         vm_offset_t      vector_upl_offset = 0;
2311         u_int32_t        io_req_size;
2312         u_int32_t        offset_in_file;
2313         u_int32_t        offset_in_iovbase;
2314         u_int32_t        io_size;
2315         int              io_flag = 0;
2316         upl_size_t       upl_size, vector_upl_size = 0;
2317         vm_size_t        upl_needed_size;
2318         mach_msg_type_number_t  pages_in_pl;
2319         upl_control_flags_t upl_flags;
2320         kern_return_t    kret;
2321         mach_msg_type_number_t  i;
2322         int              force_data_sync;
2323         int              retval = 0;
2324         int              first_IO = 1;
2325         struct clios     iostate;
2326         user_addr_t      iov_base;
2327         u_int32_t        mem_alignment_mask;
2328         u_int32_t        devblocksize;
2329         u_int32_t        max_io_size;
2330         u_int32_t        max_upl_size;
2331         u_int32_t        max_vector_size;
2332         boolean_t        io_throttled = FALSE;
2333
2334         u_int32_t        vector_upl_iosize = 0;
2335         int              issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
2336         off_t            v_upl_uio_offset = 0;
2337         int              vector_upl_index=0;
2338         upl_t            vector_upl = NULL;
2339
2340
2341         /*
2342          * When we enter this routine, we know
2343          *  -- the resid will not exceed iov_len
2344          */
2345         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2346                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2347
2348         max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2349
2350         io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2351
2352         if (flags & IO_PASSIVE)
2353                 io_flag |= CL_PASSIVE;
2354
2355         if (flags & IO_NOCACHE)
2356                 io_flag |= CL_NOCACHE;
2357
2358         if (flags & IO_SKIP_ENCRYPTION)
2359                 io_flag |= CL_ENCRYPTED;
2360
/* private I/O state: every write issued below is accounted for here so that we can wait for it */
2361         iostate.io_completed = 0;
2362         iostate.io_issued = 0;
2363         iostate.io_error = 0;
2364         iostate.io_wanted = 0;
2365
2366         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2367
2368         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2369         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2370
2371         if (devblocksize == 1) {
2372                /*
2373                 * the AFP client advertises a devblocksize of 1
2374                 * however, its BLOCKMAP routine maps to physical
2375                 * blocks that are PAGE_SIZE in size...
2376                 * therefore we can't ask for I/Os that aren't page aligned
2377                 * or aren't multiples of PAGE_SIZE in size
2378                 * by setting devblocksize to PAGE_SIZE, we re-instate
2379                 * the old behavior we had before the mem_alignment_mask
2380                 * changes went in...
2381                 */
2382                devblocksize = PAGE_SIZE;
2383         }
2384
/* re-entered once per uio vector for as long as cluster_io_type keeps answering IO_DIRECT */
2385 next_dwrite:
2386         io_req_size = *write_length;
2387         iov_base = uio_curriovbase(uio);
2388
2389         offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2390         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2391
2392         if (offset_in_file || offset_in_iovbase) {
2393                 /*
2394                  * one of the 2 important offsets is misaligned
2395                  * so fire an I/O through the cache for this entire vector
2396                  */
2397                 goto wait_for_dwrites;
2398         }
2399         if (iov_base & (devblocksize - 1)) {
2400                 /*
2401                  * the offset in memory must be on a device block boundary
2402                  * so that we can guarantee that we can generate an
2403                  * I/O that ends on a page boundary in cluster_io
2404                  */
2405                 goto wait_for_dwrites;
2406         }
2407
2408         while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2409                 int     throttle_type;
2410
2411                 if ( (throttle_type = cluster_is_throttled(vp)) ) {
2412                         /*
2413                          * we're in the throttle window, at the very least
2414                          * we want to limit the size of the I/O we're about
2415                          * to issue
2416                          */
2417                         if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2418                                 /*
2419                                  * we're in the throttle window and at least 1 I/O
2420                                  * has already been issued by a throttleable thread
2421                                  * in this window, so return with EAGAIN to indicate
2422                                  * to the FS issuing the cluster_write call that it
2423                                  * should now throttle after dropping any locks
2424                                  */
2425                                 throttle_info_update_by_mount(vp->v_mount);
2426
2427                                 io_throttled = TRUE;
2428                                 goto wait_for_dwrites;
2429                         }
2430                         max_vector_size = THROTTLE_MAX_IOSIZE;
2431                         max_io_size = THROTTLE_MAX_IOSIZE;
2432                 } else {
2433                         max_vector_size = MAX_VECTOR_UPL_SIZE;
2434                         max_io_size = max_upl_size;
2435                 }
2436
2437                 if (first_IO) {
2438                         cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2439                         first_IO = 0;
2440                 }
/* only whole pages go out directly; a sub-page remainder is left in io_req_size for the copy path */
2441                 io_size  = io_req_size & ~PAGE_MASK;
2442                 iov_base = uio_curriovbase(uio);
2443
2444                 if (io_size > max_io_size)
2445                         io_size = max_io_size;
2446
2447                 if(useVectorUPL && (iov_base & PAGE_MASK)) {
2448                         /*
2449                          * We have an iov_base that's not page-aligned.
2450                          * Issue all I/O's that have been collected within
2451                          * this Vectored UPL.
2452                          */
2453                         if(vector_upl_index) {
2454                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2455                                 reset_vector_run_state();
2456                         }
2457
2458                        /*
2459                         * After this point, if we are using the Vector UPL path and the base is
2460                         * not page-aligned then the UPL with that base will be the first in the vector UPL.
2461                         */
2462                 }
2463
2464                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2465                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2466
2467                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2468                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2469
2470                 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
/*
 * up to 3 attempts at building a UPL in which every page is valid;
 * the attempt number is handed to vm_map_get_upl as its last argument
 */
2471                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2472                         pages_in_pl = 0;
2473                         upl_size = upl_needed_size;
2474                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2475                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
2476                                     | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
2477
2478                         kret = vm_map_get_upl(map,
2479                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2480                                               &upl_size,
2481                                               &upl,
2482                                               NULL,
2483                                               &pages_in_pl,
2484                                               &upl_flags,
2485                                               force_data_sync);
2486
2487                         if (kret != KERN_SUCCESS) {
2488                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2489                                              0, 0, 0, kret, 0);
2490                                 /*
2491                                  * failed to get pagelist
2492                                  *
2493                                  * we may have already spun some portion of this request
2494                                  * off as async requests... we need to wait for the I/O
2495                                  * to complete before returning
2496                                  */
2497                                 goto wait_for_dwrites;
2498                         }
2499                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2500                         pages_in_pl = upl_size / PAGE_SIZE;
2501
2502                         for (i = 0; i < pages_in_pl; i++) {
2503                                 if (!upl_valid_page(pl, i))
2504                                         break;
2505                         }
2506                         if (i == pages_in_pl)
2507                                 break;
2508
2509                         /*
2510                          * didn't get all the pages back that we
2511                          * needed... release this upl and try again
2512                          */
2513                         ubc_upl_abort(upl, 0);
2514                 }
2515                 if (force_data_sync >= 3) {
2516                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2517                                      i, pages_in_pl, upl_size, kret, 0);
2518                         /*
2519                          * for some reason, we couldn't acquire a hold on all
2520                          * the pages needed in the user's address space
2521                          *
2522                          * we may have already spun some portion of this request
2523                          * off as async requests... we need to wait for the I/O
2524                          * to complete before returning
2525                          */
2526                         goto wait_for_dwrites;
2527                 }
2528
2529                 /*
2530                  * Consider the possibility that upl_size wasn't satisfied.
2531                  */
2532                 if (upl_size < upl_needed_size) {
2533                         if (upl_size && upl_offset == 0)
2534                                 io_size = upl_size;
2535                         else
2536                                 io_size = 0;
2537                 }
2538                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2539                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2540
2541                 if (io_size == 0) {
2542                         ubc_upl_abort(upl, 0);
2543                         /*
2544                          * we may have already spun some portion of this request
2545                          * off as async requests... we need to wait for the I/O
2546                          * to complete before returning
2547                          */
2548                         goto wait_for_dwrites;
2549                 }
2550
2551                 if(useVectorUPL) {
2552                         vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2553                         if(end_off)
2554                                 issueVectorUPL = 1;
2555                         /*
2556                          * After this point, if we are using a vector UPL, then
2557                          * either all the UPL elements end on a page boundary OR
2558                          * this UPL is the last element because it does not end
2559                          * on a page boundary.
2560                          */
2561                 }
2562
2563                 /*
2564                  * we want push out these writes asynchronously so that we can overlap
2565                  * the preparation of the next I/O
2566                  * if there are already too many outstanding writes
2567                  * wait until some complete before issuing the next
2568                  */
2569                 cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct");
2570
2571                 if (iostate.io_error) {
2572                         /*
2573                          * one of the earlier writes we issued ran into a hard error
2574                          * don't issue any more writes, cleanup the UPL
2575                          * that was just created but not used, then
2576                          * go wait for all writes that are part of this stream
2577                          * to complete before returning the error to the caller
2578                          */
2579                         ubc_upl_abort(upl, 0);
2580
2581                         goto wait_for_dwrites;
2582                 }
2583
2584                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2585                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2586
2587                 if(!useVectorUPL)
2588                         retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2589                                    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2590
2591                 else {
/* first element of a new batch: create the vector UPL and remember where the batch starts */
2592                         if(!vector_upl_index) {
2593                                 vector_upl = vector_upl_create(upl_offset);
2594                                 v_upl_uio_offset = uio->uio_offset;
2595                                 vector_upl_offset = upl_offset;
2596                         }
2597
2598                         vector_upl_set_subupl(vector_upl,upl,upl_size);
2599                         vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2600                         vector_upl_index++;
2601                         vector_upl_iosize += io_size;
2602                         vector_upl_size += upl_size;
2603
/* issue the batch when this element ends mid-page, the element limit is hit, or the batch is big enough */
2604                         if(issueVectorUPL || vector_upl_index ==  MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2605                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2606                                 reset_vector_run_state();
2607                         }
2608                 }
2609
2610                 /*
2611                  * update the uio structure to
2612                  * reflect the I/O that we just issued
2613                  */
2614                 uio_update(uio, (user_size_t)io_size);
2615
2616                 /*
2617                  * in case we end up calling through to cluster_write_copy to finish
2618                  * the tail of this request, we need to update the oldEOF so that we
2619                  * don't zero-fill the head of a page if we've successfully written
2620                  * data to that area... 'cluster_write_copy' will zero-fill the head of a
2621                  * page that is beyond the oldEOF if the write is unaligned... we only
2622                  * want that to happen for the very first page of the cluster_write,
2623                  * NOT the first page of each vector making up a multi-vector write.
2624                  */
2625                 if (uio->uio_offset > oldEOF)
2626                         oldEOF = uio->uio_offset;
2627
2628                 io_req_size -= io_size;
2629
2630                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2631                              (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2632
2633         } /* end while */
2634
/* this vector was consumed completely: classify the next one and keep going if it is also direct */
2635         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2636
2637                 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2638
2639                 if (retval == 0 && *write_type == IO_DIRECT) {
2640
2641                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2642                                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2643
2644                         goto next_dwrite;
2645                 }
2646         }
2647
2648 wait_for_dwrites:
2649
/*
 * flush a partially filled vector UPL, but only when no error is pending.
 * NOTE(review): when retval or iostate.io_error is non-zero, sub-UPLs
 * already collected in vector_upl are neither issued nor aborted in the
 * code visible here -- confirm they are released elsewhere.
 */
2650         if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2651                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2652                 reset_vector_run_state();
2653         }
2654         /*
2655          * make sure all async writes issued as part of this stream
2656          * have completed before we return
2657          */
2658         cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2659
2660         if (iostate.io_error)
2661                 retval = iostate.io_error;
2662
2663         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2664
2665         if (io_throttled == TRUE && retval == 0)
2666                 retval = EAGAIN;
2667
2668         if (io_req_size && retval == 0) {
2669                 /*
2670                  * we couldn't handle the tail of this request in DIRECT mode
2671                  * so fire it through the copy path
2672                  *
2673                  * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2674                  * so we can just pass 0 in for the headOff and tailOff
2675                  */
2676                 if (uio->uio_offset > oldEOF)
2677                         oldEOF = uio->uio_offset;
2678
2679                 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2680
2681                 *write_type = IO_UNKNOWN;
2682         }
2683         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2684                      (int)uio->uio_offset, io_req_size, retval, 4, 0);
2685
2686         return (retval);
2687 }
2688
2689
2690 static int
2691 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2692                      int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2693 {
2694         upl_page_info_t *pl;
2695         addr64_t         src_paddr = 0;
2696         upl_t            upl[MAX_VECTS];
2697         vm_offset_t      upl_offset;
2698         u_int32_t        tail_size = 0;
2699         u_int32_t        io_size;
2700         u_int32_t        xsize;
2701         upl_size_t       upl_size;
2702         vm_size_t        upl_needed_size;
2703         mach_msg_type_number_t  pages_in_pl;
2704         upl_control_flags_t upl_flags;
2705         kern_return_t    kret;
2706         struct clios     iostate;
2707         int              error  = 0;
2708         int              cur_upl = 0;
2709         int              num_upl = 0;
2710         int              n;
2711         user_addr_t      iov_base;
2712         u_int32_t        devblocksize;
2713         u_int32_t        mem_alignment_mask;
2714
2715         /*
2716          * When we enter this routine, we know
2717          *  -- the io_req_size will not exceed iov_len
2718          *  -- the target address is physically contiguous
2719          */
2720         cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2721
2722         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2723         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2724
2725         iostate.io_completed = 0;
2726         iostate.io_issued = 0;
2727         iostate.io_error = 0;
2728         iostate.io_wanted = 0;
2729
2730         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2731
2732 next_cwrite:
2733         io_size = *write_length;
2734
2735         iov_base = uio_curriovbase(uio);
2736
2737         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2738         upl_needed_size = upl_offset + io_size;
2739
2740         pages_in_pl = 0;
2741         upl_size = upl_needed_size;
2742         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2743                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
2744                     | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
2745
2746         vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2747         kret = vm_map_get_upl(map,
2748                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2749                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
2750
2751         if (kret != KERN_SUCCESS) {
2752                 /*
2753                  * failed to get pagelist
2754                  */
2755                 error = EINVAL;
2756                 goto wait_for_cwrites;
2757         }
2758         num_upl++;
2759
2760         /*
2761          * Consider the possibility that upl_size wasn't satisfied.
2762          */
2763         if (upl_size < upl_needed_size) {
2764                 /*
2765                  * This is a failure in the physical memory case.
2766                  */
2767                 error = EINVAL;
2768                 goto wait_for_cwrites;
2769         }
2770         pl = ubc_upl_pageinfo(upl[cur_upl]);
2771
2772         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
2773
2774         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2775                 u_int32_t   head_size;
2776
2777                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2778
2779                 if (head_size > io_size)
2780                         head_size = io_size;
2781
2782                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2783
2784                 if (error)
2785                         goto wait_for_cwrites;
2786
2787                 upl_offset += head_size;
2788                 src_paddr  += head_size;
2789                 io_size    -= head_size;
2790
2791                 iov_base   += head_size;
2792         }
2793         if ((u_int32_t)iov_base & mem_alignment_mask) {
2794                 /*
2795                  * request doesn't set up on a memory boundary
2796                  * the underlying DMA engine can handle...
2797                  * return an error instead of going through
2798                  * the slow copy path since the intent of this
2799                  * path is direct I/O from device memory
2800                  */
2801                 error = EINVAL;
2802                 goto wait_for_cwrites;
2803         }
2804
2805         tail_size = io_size & (devblocksize - 1);
2806         io_size  -= tail_size;
2807
2808         while (io_size && error == 0) {
2809
2810                 if (io_size > MAX_IO_CONTIG_SIZE)
2811                         xsize = MAX_IO_CONTIG_SIZE;
2812                 else
2813                         xsize = io_size;
2814                 /*
2815                  * request asynchronously so that we can overlap
2816                  * the preparation of the next I/O... we'll do
2817                  * the commit after all the I/O has completed
2818                  * since its all issued against the same UPL
2819                  * if there are already too many outstanding writes
2820                  * wait until some have completed before issuing the next
2821                  */
2822                 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2823
2824                 if (iostate.io_error) {
2825                         /*
2826                          * one of the earlier writes we issued ran into a hard error
2827                          * don't issue any more writes...
2828                          * go wait for all writes that are part of this stream
2829                          * to complete before returning the error to the caller
2830                          */
2831                         goto wait_for_cwrites;
2832                 }
2833                 /*
2834                  * issue an asynchronous write to cluster_io
2835                  */
2836                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2837                                    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2838
2839                 if (error == 0) {
2840                         /*
2841                          * The cluster_io write completed successfully,
2842                          * update the uio structure
2843                          */
2844                         uio_update(uio, (user_size_t)xsize);
2845
2846                         upl_offset += xsize;
2847                         src_paddr  += xsize;
2848                         io_size    -= xsize;
2849                 }
2850         }
2851         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
2852
2853                 error = cluster_io_type(uio, write_type, write_length, 0);
2854
2855                 if (error == 0 && *write_type == IO_CONTIG) {
2856                         cur_upl++;
2857                         goto next_cwrite;
2858                 }
2859         } else
2860                 *write_type = IO_UNKNOWN;
2861
2862 wait_for_cwrites:
2863         /*
2864          * make sure all async writes that are part of this stream
2865          * have completed before we proceed
2866          */
2867         cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
2868
2869         if (iostate.io_error)
2870                 error = iostate.io_error;
2871
2872         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2873
2874         if (error == 0 && tail_size)
2875                 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2876
2877         for (n = 0; n < num_upl; n++)
2878                 /*
2879                  * just release our hold on each physically contiguous
2880                  * region without changing any state
2881                  */
2882                 ubc_upl_abort(upl[n], 0);
2883
2884         return (error);
2885 }
2886
2887
2888 /*
2889  * need to avoid a race between an msync of a range of pages dirtied via mmap
2890  * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
2891  * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
2892  *
2893  * we should never force-zero-fill pages that are already valid in the cache...
2894  * the entire page contains valid data (either from disk, zero-filled or dirtied
2895  * via an mmap) so we can only do damage by trying to zero-fill
2896  *
2897  */
2898 static int
2899 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
2900 {
2901         int zero_pg_index;
2902         boolean_t need_cluster_zero = TRUE;
2903
2904         if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2905
2906                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2907                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2908
2909                 if (upl_valid_page(pl, zero_pg_index)) {
2910                         /*
2911                          * never force zero valid pages - dirty or clean
2912                          * we'll leave these in the UPL for cluster_write_copy to deal with
2913                          */
2914                         need_cluster_zero = FALSE;
2915                 }
2916         }
2917         if (need_cluster_zero == TRUE)
2918                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2919
2920         return (bytes_to_zero);
2921 }
2922
2923
2924 static int
2925 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
2926                    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2927 {
2928         upl_page_info_t *pl;
2929         upl_t            upl;
2930         vm_offset_t      upl_offset = 0;
2931         vm_size_t        upl_size;
2932         off_t            upl_f_offset;
2933         int              pages_in_upl;
2934         int              start_offset;
2935         int              xfer_resid;
2936         int              io_size;
2937         int              io_offset;
2938         int              bytes_to_zero;
2939         int              bytes_to_move;
2940         kern_return_t    kret;
2941         int              retval = 0;
2942         int              io_resid;
2943         long long        total_size;
2944         long long        zero_cnt;
2945         off_t            zero_off;
2946         long long        zero_cnt1;
2947         off_t            zero_off1;
2948         off_t            write_off = 0;
2949         int              write_cnt = 0;
2950         boolean_t        first_pass = FALSE;
2951         struct cl_extent cl;
2952         struct cl_writebehind *wbp;
2953         int              bflag;
2954         u_int            max_cluster_pgcount;
2955         u_int            max_io_size;
2956
2957         if (uio) {
2958                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2959                              (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
2960
2961                 io_resid = io_req_size;
2962         } else {
2963                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2964                              0, 0, (int)oldEOF, (int)newEOF, 0);
2965
2966                 io_resid = 0;
2967         }
2968         if (flags & IO_PASSIVE)
2969                 bflag = CL_PASSIVE;
2970         else
2971                 bflag = 0;
2972         if (flags & IO_NOCACHE)
2973                 bflag |= CL_NOCACHE;
2974
2975         if (flags & IO_SKIP_ENCRYPTION)
2976                 bflag |= CL_ENCRYPTED;
2977
2978         zero_cnt  = 0;
2979         zero_cnt1 = 0;
2980         zero_off  = 0;
2981         zero_off1 = 0;
2982
2983         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
2984         max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2985
2986         if (flags & IO_HEADZEROFILL) {
2987                 /*
2988                  * some filesystems (HFS is one) don't support unallocated holes within a file...
2989                  * so we zero fill the intervening space between the old EOF and the offset
2990                  * where the next chunk of real data begins.... ftruncate will also use this
2991                  * routine to zero fill to the new EOF when growing a file... in this case, the
2992                  * uio structure will not be provided
2993                  */
2994                 if (uio) {
2995                         if (headOff < uio->uio_offset) {
2996                                 zero_cnt = uio->uio_offset - headOff;
2997                                 zero_off = headOff;
2998                         }
2999                 } else if (headOff < newEOF) {
3000                         zero_cnt = newEOF - headOff;
3001                         zero_off = headOff;
3002                 }
3003         } else {
3004                 if (uio && uio->uio_offset > oldEOF) {
3005                         zero_off = uio->uio_offset & ~PAGE_MASK_64;
3006
3007                         if (zero_off >= oldEOF) {
3008                                 zero_cnt = uio->uio_offset - zero_off;
3009
3010                                 flags |= IO_HEADZEROFILL;
3011                         }
3012                 }
3013         }
3014         if (flags & IO_TAILZEROFILL) {
3015                 if (uio) {
3016                         zero_off1 = uio->uio_offset + io_req_size;
3017
3018                         if (zero_off1 < tailOff)
3019                                 zero_cnt1 = tailOff - zero_off1;
3020                 }
3021         } else {
3022                 if (uio && newEOF > oldEOF) {
3023                         zero_off1 = uio->uio_offset + io_req_size;
3024
3025                         if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3026                                 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3027
3028                                 flags |= IO_TAILZEROFILL;
3029                         }
3030                 }
3031         }
3032         if (zero_cnt == 0 && uio == (struct uio *) 0) {
3033                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3034                              retval, 0, 0, 0, 0);
3035                 return (0);
3036         }
3037         if (uio) {
3038                 write_off = uio->uio_offset;
3039                 write_cnt = uio_resid(uio);
3040                 /*
3041                  * delay updating the sequential write info
3042                  * in the control block until we've obtained
3043                  * the lock for it
3044                  */
3045                 first_pass = TRUE;
3046         }
3047         while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3048                 /*
3049                  * for this iteration of the loop, figure out where our starting point is
3050                  */
3051                 if (zero_cnt) {
3052                         start_offset = (int)(zero_off & PAGE_MASK_64);
3053                         upl_f_offset = zero_off - start_offset;
3054                 } else if (io_resid) {
3055                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3056                         upl_f_offset = uio->uio_offset - start_offset;
3057                 } else {
3058                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
3059                         upl_f_offset = zero_off1 - start_offset;
3060                 }
3061                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3062                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3063
3064                 if (total_size > max_io_size)
3065                         total_size = max_io_size;
3066
3067                 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3068
3069                 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3070                         /*
3071                          * assumption... total_size <= io_resid
3072                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3073                          */
3074                         if ((start_offset + total_size) > max_io_size)
3075                                 total_size = max_io_size - start_offset;
3076                         xfer_resid = total_size;
3077
3078                         retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3079
3080                         if (retval)
3081                                 break;
3082
3083                         io_resid    -= (total_size - xfer_resid);
3084                         total_size   = xfer_resid;
3085                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3086                         upl_f_offset = uio->uio_offset - start_offset;
3087
3088                         if (total_size == 0) {
3089                                 if (start_offset) {
3090                                         /*
3091                                          * the write did not finish on a page boundary
3092                                          * which will leave upl_f_offset pointing to the
3093                                          * beginning of the last page written instead of
3094                                          * the page beyond it... bump it in this case
3095                                          * so that the cluster code records the last page
3096                                          * written as dirty
3097                                          */
3098                                         upl_f_offset += PAGE_SIZE_64;
3099                                 }
3100                                 upl_size = 0;
3101
3102                                 goto check_cluster;
3103                         }
3104                 }
3105                 /*
3106                  * compute the size of the upl needed to encompass
3107                  * the requested write... limit each call to cluster_io
3108                  * to the maximum UPL size... cluster_io will clip if
3109                  * this exceeds the maximum io_size for the device,
3110                  * make sure to account for
3111                  * a starting offset that's not page aligned
3112                  */
3113                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3114
3115                 if (upl_size > max_io_size)
3116                         upl_size = max_io_size;
3117
3118                 pages_in_upl = upl_size / PAGE_SIZE;
3119                 io_size      = upl_size - start_offset;
3120
3121                 if ((long long)io_size > total_size)
3122                         io_size = total_size;
3123
3124                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3125
3126
3127                 /*
3128                  * Gather the pages from the buffer cache.
3129                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3130                  * that we intend to modify these pages.
3131                  */
3132                 kret = ubc_create_upl(vp,
3133                                       upl_f_offset,
3134                                       upl_size,
3135                                       &upl,
3136                                       &pl,
3137                                       UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY));
3138                 if (kret != KERN_SUCCESS)
3139                         panic("cluster_write_copy: failed to get pagelist");
3140
3141                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3142                         upl, (int)upl_f_offset, start_offset, 0, 0);
3143
3144                 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3145                         int   read_size;
3146
3147                         /*
3148                          * we're starting in the middle of the first page of the upl
3149                          * and the page isn't currently valid, so we're going to have
3150                          * to read it in first... this is a synchronous operation
3151                          */
3152                         read_size = PAGE_SIZE;
3153
3154                         if ((upl_f_offset + read_size) > oldEOF)
3155                                 read_size = oldEOF - upl_f_offset;
3156
3157                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3158                                             CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3159                         if (retval) {
3160                                 /*
3161                                  * we had an error during the read which causes us to abort
3162                                  * the current cluster_write request... before we do, we need
3163                                  * to release the rest of the pages in the upl without modifying
3164                                  * there state and mark the failed page in error
3165                                  */
3166                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3167
3168                                 if (upl_size > PAGE_SIZE)
3169                                         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3170
3171                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3172                                              upl, 0, 0, retval, 0);
3173                                 break;
3174                         }
3175                 }
3176                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3177                         /*
3178                          * the last offset we're writing to in this upl does not end on a page
3179                          * boundary... if it's not beyond the old EOF, then we'll also need to
3180                          * pre-read this page in if it isn't already valid
3181                          */
3182                         upl_offset = upl_size - PAGE_SIZE;
3183
3184                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3185                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
3186                                 int   read_size;
3187
3188                                 read_size = PAGE_SIZE;
3189
3190                                 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
3191                                         read_size = oldEOF - (upl_f_offset + upl_offset);
3192
3193                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3194                                                     CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3195                                 if (retval) {
3196                                         /*
3197                                          * we had an error during the read which causes us to abort
3198                                          * the current cluster_write request... before we do, we
3199                                          * need to release the rest of the pages in the upl without
3200                                          * modifying there state and mark the failed page in error
3201                                          */
3202                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3203
3204                                         if (upl_size > PAGE_SIZE)
3205                                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3206
3207                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3208                                                      upl, 0, 0, retval, 0);
3209                                         break;
3210                                 }
3211                         }
3212                 }
3213                 xfer_resid = io_size;
3214                 io_offset = start_offset;
3215
3216                 while (zero_cnt && xfer_resid) {
3217
3218                         if (zero_cnt < (long long)xfer_resid)
3219                                 bytes_to_zero = zero_cnt;
3220                         else
3221                                 bytes_to_zero = xfer_resid;
3222
3223                         bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3224
3225                         xfer_resid -= bytes_to_zero;
3226                         zero_cnt   -= bytes_to_zero;
3227                         zero_off   += bytes_to_zero;
3228                         io_offset  += bytes_to_zero;
3229                 }
3230                 if (xfer_resid && io_resid) {
3231                         u_int32_t  io_requested;
3232
3233                         bytes_to_move = min(io_resid, xfer_resid);
3234                         io_requested = bytes_to_move;
3235
3236                         retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3237
3238                         if (retval) {
3239                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3240
3241                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3242                                              upl, 0, 0, retval, 0);
3243                         } else {
3244                                 io_resid   -= bytes_to_move;
3245                                 xfer_resid -= bytes_to_move;
3246                                 io_offset  += bytes_to_move;
3247                         }
3248                 }
3249                 while (xfer_resid && zero_cnt1 && retval == 0) {
3250
3251                         if (zero_cnt1 < (long long)xfer_resid)
3252                                 bytes_to_zero = zero_cnt1;
3253                         else
3254                                 bytes_to_zero = xfer_resid;
3255
3256                         bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3257
3258                         xfer_resid -= bytes_to_zero;
3259                         zero_cnt1  -= bytes_to_zero;
3260                         zero_off1  += bytes_to_zero;
3261                         io_offset  += bytes_to_zero;
3262                 }
3263                 if (retval == 0) {
3264                         int cl_index;
3265                         int ret_cluster_try_push;
3266
3267                         io_size += start_offset;
3268
3269                         if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3270                                 /*
3271                                  * if we're extending the file with this write
3272                                  * we'll zero fill the rest of the page so that
3273                                  * if the file gets extended again in such a way as to leave a
3274                                  * hole starting at this EOF, we'll have zero's in the correct spot
3275                                  */
3276                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
3277                         }
3278                         /*
3279                          * release the upl now if we hold one since...
3280                          * 1) pages in it may be present in the sparse cluster map
3281                          *    and may span 2 separate buckets there... if they do and
3282                          *    we happen to have to flush a bucket to make room and it intersects
3283                          *    this upl, a deadlock may result on page BUSY
3284                          * 2) we're delaying the I/O... from this point forward we're just updating
3285                          *    the cluster state... no need to hold the pages, so commit them
3286                          * 3) IO_SYNC is set...
3287                          *    because we had to ask for a UPL that provides currenty non-present pages, the
3288                          *    UPL has been automatically set to clear the dirty flags (both software and hardware)
3289                          *    upon committing it... this is not the behavior we want since it's possible for
3290                          *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3291                          *    we'll pick these pages back up later with the correct behavior specified.
3292                          * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3293                          *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3294                          *    we hold since the flushing context is holding the cluster lock.
3295                          */
3296                         ubc_upl_commit_range(upl, 0, upl_size,
3297                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3298 check_cluster:
3299                         /*
3300                          * calculate the last logical block number
3301                          * that this delayed I/O encompassed
3302                          */
3303                         cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3304
3305                         if (flags & IO_SYNC) {
3306                                 /*
3307                                  * if the IO_SYNC flag is set than we need to
3308                                  * bypass any clusters and immediately issue
3309                                  * the I/O
3310                                  */
3311                                 goto issue_io;
3312                         }
3313                         /*
3314                          * take the lock to protect our accesses
3315                          * of the writebehind and sparse cluster state
3316                          */
3317                         wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3318
3319                         if (wbp->cl_scmap) {
3320
3321                                 if ( !(flags & IO_NOCACHE)) {
3322                                         /*
3323                                          * we've fallen into the sparse
3324                                          * cluster method of delaying dirty pages
3325                                          */
3326                                         sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
3327
3328                                         lck_mtx_unlock(&wbp->cl_lockw);
3329
3330                                         continue;
3331                                 }
3332                                 /*
3333                                  * must have done cached writes that fell into
3334                                  * the sparse cluster mechanism... we've switched
3335                                  * to uncached writes on the file, so go ahead
3336                                  * and push whatever's in the sparse map
3337                                  * and switch back to normal clustering
3338                                  */
3339                                 wbp->cl_number = 0;
3340
3341                                 sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
3342                                 /*
3343                                  * no clusters of either type present at this point
3344                                  * so just go directly to start_new_cluster since
3345                                  * we know we need to delay this I/O since we've
3346                                  * already released the pages back into the cache
3347                                  * to avoid the deadlock with sparse_cluster_push
3348                                  */
3349                                 goto start_new_cluster;
3350                         }
3351                         if (first_pass) {
3352                                 if (write_off == wbp->cl_last_write)
3353                                         wbp->cl_seq_written += write_cnt;
3354                                 else
3355                                         wbp->cl_seq_written = write_cnt;
3356
3357                                 wbp->cl_last_write = write_off + write_cnt;
3358
3359                                 first_pass = FALSE;
3360                         }
3361                         if (wbp->cl_number == 0)
3362                                 /*
3363                                  * no clusters currently present
3364                                  */
3365                                 goto start_new_cluster;
3366
3367                         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3368                                 /*
3369                                  * check each cluster that we currently hold
3370                                  * try to merge some or all of this write into
3371                                  * one or more of the existing clusters... if
3372                                  * any portion of the write remains, start a
3373                                  * new cluster
3374                                  */
3375                                 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3376                                         /*
3377                                          * the current write starts at or after the current cluster
3378                                          */
3379                                         if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3380                                                 /*
3381                                                  * we have a write that fits entirely
3382                                                  * within the existing cluster limits
3383                                                  */
3384                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
3385                                                         /*
3386                                                          * update our idea of where the cluster ends
3387                                                          */
3388                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
3389                                                 break;
3390                                         }
3391                                         if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3392                                                 /*
3393                                                  * we have a write that starts in the middle of the current cluster
3394                                                  * but extends beyond the cluster's limit... we know this because
3395                                                  * of the previous checks
3396                                                  * we'll extend the current cluster to the max
3397                                                  * and update the b_addr for the current write to reflect that
3398                                                  * the head of it was absorbed into this cluster...
3399                                                  * note that we'll always have a leftover tail in this case since
3400                                                  * full absorbtion would have occurred in the clause above
3401                                                  */
3402                                                 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3403
3404                                                 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
3405                                         }
3406                                         /*
3407                                          * we come here for the case where the current write starts
3408                                          * beyond the limit of the existing cluster or we have a leftover
3409                                          * tail after a partial absorbtion
3410                                          *
3411                                          * in either case, we'll check the remaining clusters before
3412                                          * starting a new one
3413                                          */
3414                                 } else {
3415                                         /*
3416                                          * the current write starts in front of the cluster we're currently considering
3417                                          */
3418                                         if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
3419                                                 /*
3420                                                  * we can just merge the new request into
3421                                                  * this cluster and leave it in the cache
3422                                                  * since the resulting cluster is still
3423                                                  * less than the maximum allowable size
3424                                                  */
3425                                                 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
3426
3427                                                 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
3428                                                         /*
3429                                                          * the current write completely
3430                                                          * envelops the existing cluster and since
3431                                                          * each write is limited to at most max_cluster_pgcount pages
3432                                                          * we can just use the start and last blocknos of the write
3433                                                          * to generate the cluster limits
3434                                                          */
3435                                                         wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
3436                                                 }
3437                                                 break;
3438                                         }
3439
3440                                         /*
3441                                          * if we were to combine this write with the current cluster
3442                                          * we would exceed the cluster size limit.... so,
3443                                          * let's see if there's any overlap of the new I/O with
3444                                          * the cluster we're currently considering... in fact, we'll
3445                                          * stretch the cluster out to it's full limit and see if we
3446                                          * get an intersection with the current write
3447                                          *
3448                                          */
3449                                         if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3450                                                 /*
3451                                                  * the current write extends into the proposed cluster
3452                                                  * clip the length of the current write after first combining it's
3453                                                  * tail with the newly shaped cluster
3454                                                  */
3455                                                 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3456
3457                                                 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
3458                                         }
3459                                         /*
3460                                          * if we get here, there was no way to merge
3461                                          * any portion of this write with this cluster
3462                                          * or we could only merge part of it which
3463                                          * will leave a tail...
3464                                          * we'll check the remaining clusters before starting a new one
3465                                          */
3466                                 }
3467                         }
3468                         if (cl_index < wbp->cl_number)
3469                                 /*
3470                                  * we found an existing cluster(s) that we
3471                                  * could entirely merge this I/O into
3472                                  */
3473                                 goto delay_io;
3474
3475                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
3476                             wbp->cl_number == MAX_CLUSTERS &&
3477                             wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3478                                 uint32_t        n;
3479
3480                                 if (vp->v_mount->mnt_kern_flag & MNTK_SSD)
3481                                         n = WRITE_BEHIND_SSD;
3482                                 else
3483                                         n = WRITE_BEHIND;
3484
3485                                 while (n--)
3486                                         cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg);
3487                         }
3488                         if (wbp->cl_number < MAX_CLUSTERS) {
3489                                 /*
3490                                  * we didn't find an existing cluster to
3491                                  * merge into, but there's room to start
3492                                  * a new one
3493                                  */
3494                                 goto start_new_cluster;
3495                         }
3496                         /*
3497                          * no exisitng cluster to merge with and no
3498                          * room to start a new one... we'll try
3499                          * pushing one of the existing ones... if none of
3500                          * them are able to be pushed, we'll switch
3501                          * to the sparse cluster mechanism
3502                          * cluster_try_push updates cl_number to the
3503                          * number of remaining clusters... and
3504                          * returns the number of currently unused clusters
3505                          */
3506                         ret_cluster_try_push = 0;
3507
3508                         /*
3509                          * if writes are not deferred, call cluster push immediately
3510                          */
3511                         if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
3512
3513                                 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg);
3514                         }
3515
3516                         /*
3517                          * execute following regardless of writes being deferred or not
3518                          */
3519                         if (ret_cluster_try_push == 0) {
3520                                 /*
3521                                  * no more room in the normal cluster mechanism
3522                                  * so let's switch to the more expansive but expensive
3523                                  * sparse mechanism....
3524                                  */
3525                                 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
3526                                 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
3527
3528                                 lck_mtx_unlock(&wbp->cl_lockw);
3529
3530                                 continue;
3531                         }
3532 start_new_cluster:
3533                         wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
3534                         wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
3535
3536                         wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3537
3538                         if (flags & IO_NOCACHE)
3539                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3540
3541                         if (bflag & CL_PASSIVE)
3542                                 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3543
3544                         wbp->cl_number++;
3545 delay_io:
3546                         lck_mtx_unlock(&wbp->cl_lockw);
3547
3548                         continue;
3549 issue_io:
3550                         /*
3551                          * we don't hold the lock at this point
3552                          *
3553                          * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3554                          * so that we correctly deal with a change in state of the hardware modify bit...
3555                          * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3556                          * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3557                          * responsible for generating the correct sized I/O(s)
3558                          */
3559                         retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
3560                 }
3561         }
3562         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3563
3564         return (retval);
3565 }
3566
3567
3568
3569 int
3570 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3571 {
3572         return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3573 }
3574
3575
3576 int
3577 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3578 {
3579         int             retval = 0;
3580         int             flags;
3581         user_ssize_t    cur_resid;
3582         u_int32_t       io_size;
3583         u_int32_t       read_length = 0;
3584         int             read_type = IO_COPY;
3585
3586         flags = xflags;
3587
3588         if (vp->v_flag & VNOCACHE_DATA)
3589                 flags |= IO_NOCACHE;
3590         if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
3591                 flags |= IO_RAOFF;
3592
3593         if (flags & IO_SKIP_ENCRYPTION)
3594                 flags |= IO_ENCRYPTED;
3595         /*
3596          * If we're doing an encrypted IO, then first check to see
3597          * if the IO requested was page aligned.  If not, then bail
3598          * out immediately.
3599          */
3600         if (flags & IO_ENCRYPTED) {
3601                 if (read_length & PAGE_MASK) {
3602                         retval = EINVAL;
3603                         return retval;
3604                 }
3605         }
3606
3607         /*
3608          * do a read through the cache if one of the following is true....
3609          *   NOCACHE is not true
3610          *   the uio request doesn't target USERSPACE
3611          * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3612          * Reading encrypted data from a CP filesystem should never result in the data touching
3613          * the UBC.
3614          *
3615          * otherwise, find out if we want the direct or contig variant for
3616          * the first vector in the uio request
3617          */
3618         if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {
3619
3620                 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3621         }
3622
3623         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3624
3625                 switch (read_type) {
3626
3627                 case IO_COPY:
3628                         /*
3629                          * make sure the uio_resid isn't too big...
3630                          * internally, we want to handle all of the I/O in
3631                          * chunk sizes that fit in a 32 bit int
3632                          */
3633                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
3634                                 io_size = MAX_IO_REQUEST_SIZE;
3635                         else
3636                                 io_size = (u_int32_t)cur_resid;
3637
3638                         retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3639                         break;
3640
3641                 case IO_DIRECT:
3642                         retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3643                         break;
3644
3645                 case IO_CONTIG:
3646                         retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3647                         break;
3648
3649                 case IO_UNKNOWN:
3650                         retval = cluster_io_type(uio, &read_type, &read_length, 0);
3651                         break;
3652                 }
3653         }
3654         return (retval);
3655 }
3656
3657
3658
3659 static void
3660 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
3661 {
3662         int range;
3663         int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3664
3665         if ((range = last_pg - start_pg)) {
3666                 if (take_reference)
3667                         abort_flags |= UPL_ABORT_REFERENCE;
3668
3669                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3670         }
3671 }
3672
3673
3674 static int
3675 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3676 {
3677         upl_page_info_t *pl;
3678         upl_t            upl;
3679         vm_offset_t      upl_offset;
3680         u_int32_t        upl_size;
3681         off_t            upl_f_offset;
3682         int              start_offset;
3683         int              start_pg;
3684         int              last_pg;
3685         int              uio_last = 0;
3686         int              pages_in_upl;
3687         off_t            max_size;
3688         off_t            last_ioread_offset;
3689         off_t            last_request_offset;
3690         kern_return_t    kret;
3691         int              error  = 0;
3692         int              retval = 0;
3693         u_int32_t        size_of_prefetch;
3694         u_int32_t        xsize;
3695         u_int32_t        io_size;
3696         u_int32_t        max_rd_size;
3697         u_int32_t        max_io_size;
3698         u_int32_t        max_prefetch;
3699         u_int            rd_ahead_enabled = 1;
3700         u_int            prefetch_enabled = 1;
3701         struct cl_readahead *   rap;
3702         struct clios            iostate;
3703         struct cl_extent        extent;
3704         int              bflag;
3705         int              take_reference = 1;
3706         int              policy = IOPOL_DEFAULT;
3707         boolean_t        iolock_inited = FALSE;
3708
3709         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3710                      (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3711
3712         if (flags & IO_ENCRYPTED) {
3713                 panic ("encrypted blocks will hit UBC!");
3714         }
3715
3716         policy = throttle_get_io_policy(NULL);
3717
3718         if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE))
3719                 take_reference = 0;
3720
3721         if (flags & IO_PASSIVE)
3722                 bflag = CL_PASSIVE;
3723         else
3724                 bflag = 0;
3725
3726         if (flags & IO_NOCACHE)
3727                 bflag |= CL_NOCACHE;
3728
3729         if (flags & IO_SKIP_ENCRYPTION)
3730                 bflag |= CL_ENCRYPTED;
3731
3732         max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3733         max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD));
3734         max_rd_size = max_prefetch;
3735
3736         last_request_offset = uio->uio_offset + io_req_size;
3737
3738         if (last_request_offset > filesize)
3739                 last_request_offset = filesize;
3740
3741         if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3742                 rd_ahead_enabled = 0;
3743                 rap = NULL;
3744         } else {
3745                 if (cluster_is_throttled(vp)) {
3746                         /*
3747                          * we're in the throttle window, at the very least
3748                          * we want to limit the size of the I/O we're about
3749                          * to issue
3750                          */
3751                         rd_ahead_enabled = 0;
3752                         prefetch_enabled = 0;
3753
3754                         max_rd_size = THROTTLE_MAX_IOSIZE;
3755                 }
3756                 if ((rap = cluster_get_rap(vp)) == NULL)
3757                         rd_ahead_enabled = 0;
3758                 else {
3759                         extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
3760                         extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
3761                 }
3762         }
3763         if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
3764                 /*
3765                  * determine if we already have a read-ahead in the pipe courtesy of the
3766                  * last read systemcall that was issued...
3767                  * if so, pick up it's extent to determine where we should start
3768                  * with respect to any read-ahead that might be necessary to
3769                  * garner all the data needed to complete this read systemcall
3770                  */
3771                 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
3772
3773                 if (last_ioread_offset < uio->uio_offset)
3774                         last_ioread_offset = (off_t)0;
3775                 else if (last_ioread_offset > last_request_offset)
3776                         last_ioread_offset = last_request_offset;
3777         } else
3778                 last_ioread_offset = (off_t)0;
3779
3780         while (io_req_size && uio->uio_offset < filesize && retval == 0) {
3781
3782                 max_size = filesize - uio->uio_offset;
3783
3784                 if ((off_t)(io_req_size) < max_size)
3785                         io_size = io_req_size;
3786                 else
3787                         io_size = max_size;
3788
3789                 if (!(flags & IO_NOCACHE)) {
3790
3791                         while (io_size) {
3792                                 u_int32_t io_resid;
3793                                 u_int32_t io_requested;
3794
3795                                 /*
3796                                  * if we keep finding the pages we need already in the cache, then
3797                                  * don't bother to call cluster_read_prefetch since it costs CPU cycles
3798                                  * to determine that we have all the pages we need... once we miss in
3799                                  * the cache and have issued an I/O, than we'll assume that we're likely
3800                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
3801                                  */
3802                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
3803                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
3804                                                 /*
3805                                                  * we've already issued I/O for this request and
3806                                                  * there's still work to do and
3807                                                  * our prefetch stream is running dry, so issue a
3808                                                  * pre-fetch I/O... the I/O latency will overlap
3809                                                  * with the copying of the data
3810                                                  */
3811                                                 if (size_of_prefetch > max_rd_size)
3812                                                         size_of_prefetch = max_rd_size;
3813
3814                                                 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3815
3816                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3817
3818                                                 if (last_ioread_offset > last_request_offset)
3819                                                         last_ioread_offset = last_request_offset;
3820                                         }
3821                                 }
3822                                 /*
3823                                  * limit the size of the copy we're about to do so that
3824                                  * we can notice that our I/O pipe is running dry and
3825                                  * get the next I/O issued before it does go dry
3826                                  */
3827                                 if (last_ioread_offset && io_size > (max_io_size / 4))
3828                                         io_resid = (max_io_size / 4);
3829                                 else
3830                                         io_resid = io_size;
3831
3832                                 io_requested = io_resid;
3833
3834                                 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
3835
3836                                 xsize = io_requested - io_resid;
3837
3838                                 io_size -= xsize;
3839                                 io_req_size -= xsize;
3840
3841                                 if (retval || io_resid)
3842                                         /*
3843                                          * if we run into a real error or
3844                                          * a page that is not in the cache
3845                                          * we need to leave streaming mode
3846                                          */
3847                                         break;
3848
3849                                 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
3850                                         /*
3851                                          * we're already finished the I/O for this read request
3852                                          * let's see if we should do a read-ahead
3853                                          */
3854                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3855                                 }
3856                         }
3857                         if (retval)
3858                                 break;
3859                         if (io_size == 0) {
3860                                 if (rap != NULL) {
3861                                         if (extent.e_addr < rap->cl_lastr)
3862                                                 rap->cl_maxra = 0;
3863                                         rap->cl_lastr = extent.e_addr;
3864                                 }
3865                                 break;
3866                         }
3867                         /*
3868                          * recompute max_size since cluster_copy_ubc_data_internal
3869                          * may have advanced uio->uio_offset
3870                          */
3871                         max_size = filesize - uio->uio_offset;
3872                 }
3873
3874                 iostate.io_completed = 0;
3875                 iostate.io_issued = 0;
3876                 iostate.io_error = 0;
3877                 iostate.io_wanted = 0;
3878
3879                 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
3880                         if (cluster_is_throttled(vp) == THROTTLE_NOW) {
3881                                 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
3882                                         /*
3883                                          * we're in the throttle window and at least 1 I/O
3884                                          * has already been issued by a throttleable thread
3885                                          * in this window, so return with EAGAIN to indicate
3886                                          * to the FS issuing the cluster_read call that it
3887                                          * should now throttle after dropping any locks
3888                                          */
3889                                         throttle_info_update_by_mount(vp->v_mount);
3890
3891                                         retval = EAGAIN;
3892                                         break;
3893                                 }
3894                         }
3895                 }
3896
3897                 /*
3898                  * compute the size of the upl needed to encompass
3899                  * the requested read... limit each call to cluster_io
3900                  * to the maximum UPL size... cluster_io will clip if
3901                  * this exceeds the maximum io_size for the device,
3902                  * make sure to account for
3903                  * a starting offset that's not page aligned
3904                  */
3905                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3906                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
3907
3908                 if (io_size > max_rd_size)
3909                         io_size = max_rd_size;
3910
3911                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3912
3913                 if (flags & IO_NOCACHE) {
3914                         if (upl_size > max_io_size)
3915                                 upl_size = max_io_size;
3916                 } else {
3917                         if (upl_size > max_io_size / 4) {
3918                                 upl_size = max_io_size / 4;
3919                                 upl_size &= ~PAGE_MASK;
3920
3921                                 if (upl_size == 0)
3922                                         upl_size = PAGE_SIZE;
3923                         }
3924                 }
3925                 pages_in_upl = upl_size / PAGE_SIZE;
3926
3927                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
3928                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
3929
3930                 kret = ubc_create_upl(vp,
3931                                       upl_f_offset,
3932                                       upl_size,
3933                                       &upl,
3934                                       &pl,
3935                                       UPL_FILE_IO | UPL_SET_LITE);
3936                 if (kret != KERN_SUCCESS)
3937                         panic("cluster_read_copy: failed to get pagelist");
3938
3939                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
3940                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
3941
3942                 /*
3943                  * scan from the beginning of the upl looking for the first
3944                  * non-valid page.... this will become the first page in
3945                  * the request we're going to make to 'cluster_io'... if all
3946                  * of the pages are valid, we won't call through to 'cluster_io'
3947                  */
3948                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3949                         if (!upl_valid_page(pl, start_pg))
3950                                 break;
3951                 }
3952
3953                 /*
3954                  * scan from the starting invalid page looking for a valid
3955                  * page before the end of the upl is reached, if we
3956                  * find one, then it will be the last page of the request to
3957                  * 'cluster_io'
3958                  */
3959                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3960                         if (upl_valid_page(pl, last_pg))
3961                                 break;
3962                 }
3963
3964                 if (start_pg < last_pg) {
3965                         /*
3966                          * we found a range of 'invalid' pages that must be filled
3967                          * if the last page in this range is the last page of the file
3968                          * we may have to clip the size of it to keep from reading past
3969                          * the end of the last physical block associated with the file
3970                          */
3971                         if (iolock_inited == FALSE) {
3972                                 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
3973
3974                                 iolock_inited = TRUE;
3975                         }
3976                         upl_offset = start_pg * PAGE_SIZE;
3977                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
3978
3979                         if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
3980                                 io_size = filesize - (upl_f_offset + upl_offset);
3981
3982                         /*
3983                          * issue an asynchronous read to cluster_io
3984                          */
3985
3986                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
3987                                            io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
3988
3989                         if (rap) {
3990                                 if (extent.e_addr < rap->cl_maxra) {
3991                                        /*
3992                                         * we've just issued a read for a block that should have been
3993                                         * in the cache courtesy of the read-ahead engine... something
3994                                         * has gone wrong with the pipeline, so reset the read-ahead
3995                                         * logic which will cause us to restart from scratch
3996                                         */
3997                                         rap->cl_maxra = 0;
3998                                }
3999                         }
4000                 }
4001                 if (error == 0) {
4002                         /*
4003                          * if the read completed successfully, or there was no I/O request
4004                          * issued, than copy the data into user land via 'cluster_upl_copy_data'
4005                          * we'll first add on any 'valid'
4006                          * pages that were present in the upl when we acquired it.
4007                          */
4008                         u_int  val_size;
4009
4010                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4011                                 if (!upl_valid_page(pl, uio_last))
4012                                         break;
4013                         }
4014                         if (uio_last < pages_in_upl) {
4015                                 /*
4016                                  * there were some invalid pages beyond the valid pages
4017                                  * that we didn't issue an I/O for, just release them
4018                                  * unchanged now, so that any prefetch/readahed can
4019                                  * include them
4020                                  */
4021                                 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4022                                                     (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4023                         }
4024
4025                         /*
4026                          * compute size to transfer this round,  if io_req_size is
4027                          * still non-zero after this attempt, we'll loop around and
4028                          * set up for another I/O.
4029                          */
4030                         val_size = (uio_last * PAGE_SIZE) - start_offset;
4031
4032                         if (val_size > max_size)
4033                                 val_size = max_size;
4034
4035                         if (val_size > io_req_size)
4036                                 val_size = io_req_size;
4037
4038                         if ((uio->uio_offset + val_size) > last_ioread_offset)
4039                                 last_ioread_offset = uio->uio_offset + val_size;
4040
4041                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4042
4043                                 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4044                                         /*
4045                                          * if there's still I/O left to do for this request, and...
4046                                          * we're not in hard throttle mode, and...
4047                                          * we're close to using up the previous prefetch, then issue a
4048                                          * new pre-fetch I/O... the I/O latency will overlap
4049                                          * with the copying of the data
4050                                          */
4051                                         if (size_of_prefetch > max_rd_size)
4052                                                 size_of_prefetch = max_rd_size;
4053
4054                                         size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4055
4056                                         last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4057
4058                                         if (last_ioread_offset > last_request_offset)
4059                                                 last_ioread_offset = last_request_offset;
4060                                 }
4061
4062                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
4063                                 /*
4064                                  * this transfer will finish this request, so...
4065                                  * let's try to read ahead if we're in
4066                                  * a sequential access pattern and we haven't
4067                                  * explicitly disabled it
4068                                  */
4069                                 if (rd_ahead_enabled)
4070                                         cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4071
4072                                 if (rap != NULL) {
4073                                         if (extent.e_addr < rap->cl_lastr)
4074                                                 rap->cl_maxra = 0;
4075                                         rap->cl_lastr = extent.e_addr;
4076                                 }
4077                         }
4078                         if (iolock_inited == TRUE)
4079                                 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4080
4081                         if (iostate.io_error)
4082                                 error = iostate.io_error;
4083                         else {
4084                                 u_int32_t io_requested;
4085
4086                                 io_requested = val_size;
4087
4088                                 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4089
4090                                 io_req_size -= (val_size - io_requested);
4091                         }
4092                 } else {
4093                         if (iolock_inited == TRUE)
4094                                 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4095                 }
4096                 if (start_pg < last_pg) {
4097                         /*
4098                          * compute the range of pages that we actually issued an I/O for
4099                          * and either commit them as valid if the I/O succeeded
4100                          * or abort them if the I/O failed or we're not supposed to
4101                          * keep them in the cache
4102                          */
4103                         io_size = (last_pg - start_pg) * PAGE_SIZE;
4104
4105                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4106
4107                         if (error || (flags & IO_NOCACHE))
4108                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4109                                                     UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4110                         else {
4111                                 int     commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4112
4113                                 if (take_reference)
4114                                         commit_flags |= UPL_COMMIT_INACTIVATE;
4115                                 else
4116                                         commit_flags |= UPL_COMMIT_SPECULATE;
4117
4118                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4119                         }
4120                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4121                 }
4122                 if ((last_pg - start_pg) < pages_in_upl) {
4123                         /*
4124                          * the set of pages that we issued an I/O for did not encompass
4125                          * the entire upl... so just release these without modifying
4126                          * their state
4127                          */
4128                         if (error)
4129                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4130                         else {
4131
4132                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4133                                              upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4134
4135                                 /*
4136                                  * handle any valid pages at the beginning of
4137                                  * the upl... release these appropriately
4138                                  */
4139                                 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4140
4141                                 /*
4142                                  * handle any valid pages immediately after the
4143                                  * pages we issued I/O for... ... release these appropriately
4144                                  */
4145                                 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4146
4147                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4148                         }
4149                 }
4150                 if (retval == 0)
4151                         retval = error;
4152
4153                 if (io_req_size) {
4154                         if (cluster_is_throttled(vp)) {
4155                                 /*
4156                                  * we're in the throttle window, at the very least
4157                                  * we want to limit the size of the I/O we're about
4158                                  * to issue
4159                                  */
4160                                 rd_ahead_enabled = 0;
4161                                 prefetch_enabled = 0;
4162                                 max_rd_size = THROTTLE_MAX_IOSIZE;
4163                         } else {
4164                                 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4165                                         /*
4166                                          * coming out of throttled state
4167                                          */
4168                                         if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4169                                                 if (rap != NULL)
4170                                                         rd_ahead_enabled = 1;
4171                                                 prefetch_enabled = 1;
4172                                         }
4173                                         max_rd_size = max_prefetch;
4174                                         last_ioread_offset = 0;
4175                                 }
4176                         }
4177                 }
4178         }
4179         if (iolock_inited == TRUE) {
4180                 /*
4181                  * cluster_io returned an error after it
4182                  * had already issued some I/O.  we need
4183                  * to wait for that I/O to complete before
4184                  * we can destroy the iostate mutex...
4185                  * 'retval' already contains the early error
4186                  * so no need to pick it up from iostate.io_error
4187                  */
4188                 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4189
4190                 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4191         }
4192         if (rap != NULL) {
4193                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4194                              (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4195
4196                 lck_mtx_unlock(&rap->cl_lockr);
4197         } else {
4198                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4199                              (int)uio->uio_offset, io_req_size, 0, retval, 0);
4200         }
4201
4202         return (retval);
4203 }
4204
4205 /*
4206  * We don't want another read/write lock for every vnode in the system
4207  * so we keep a hash of them here.  There should never be very many of
4208  * these around at any point in time.
4209  */
4210 cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4211 {
4212         struct cl_direct_read_locks *head
4213                 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4214                                                                 % CL_DIRECT_READ_LOCK_BUCKETS];
4215
4216         struct cl_direct_read_lock *lck, *new_lck = NULL;
4217
4218         for (;;) {
4219                 lck_spin_lock(&cl_direct_read_spin_lock);
4220
4221                 LIST_FOREACH(lck, head, chain) {
4222                         if (lck->vp == vp) {
4223                                 ++lck->ref_count;
4224                                 lck_spin_unlock(&cl_direct_read_spin_lock);
4225                                 if (new_lck) {
4226                                         // Someone beat us to it, ditch the allocation
4227                                         lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4228                                         FREE(new_lck, M_TEMP);
4229                                 }
4230                                 lck_rw_lock(&lck->rw_lock, type);
4231                                 return lck;
4232                         }
4233                 }
4234
4235                 if (new_lck) {
4236                         // Use the lock we allocated
4237                         LIST_INSERT_HEAD(head, new_lck, chain);
4238                         lck_spin_unlock(&cl_direct_read_spin_lock);
4239                         lck_rw_lock(&new_lck->rw_lock, type);
4240                         return new_lck;
4241                 }
4242
4243                 lck_spin_unlock(&cl_direct_read_spin_lock);
4244
4245                 // Allocate a new lock
4246                 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
4247                            M_TEMP, M_WAITOK);
4248                 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4249                 new_lck->vp = vp;
4250                 new_lck->ref_count = 1;
4251
4252                 // Got to go round again
4253         }
4254 }
4255
4256 void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4257 {
4258         lck_rw_done(&lck->rw_lock);
4259
4260         lck_spin_lock(&cl_direct_read_spin_lock);
4261         if (lck->ref_count == 1) {
4262                 LIST_REMOVE(lck, chain);
4263                 lck_spin_unlock(&cl_direct_read_spin_lock);
4264                 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4265                 FREE(lck, M_TEMP);
4266         } else {
4267                 --lck->ref_count;
4268                 lck_spin_unlock(&cl_direct_read_spin_lock);
4269         }
4270 }
4271
4272 static int
4273 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4274                     int flags, int (*callback)(buf_t, void *), void *callback_arg)
4275 {
4276         upl_t            upl;
4277         upl_page_info_t  *pl;
4278         off_t            max_io_size;
4279         vm_offset_t      upl_offset, vector_upl_offset = 0;
4280         upl_size_t       upl_size, vector_upl_size = 0;
4281         vm_size_t        upl_needed_size;
4282         unsigned int     pages_in_pl;
4283         upl_control_flags_t upl_flags;
4284         kern_return_t    kret;
4285         unsigned int     i;
4286         int              force_data_sync;
4287         int              retval = 0;
4288         int              no_zero_fill = 0;
4289         int              io_flag = 0;
4290         int              misaligned = 0;
4291         struct clios     iostate;
4292         user_addr_t      iov_base;
4293         u_int32_t        io_req_size;
4294         u_int32_t        offset_in_file;
4295         u_int32_t        offset_in_iovbase;
4296         u_int32_t        io_size;
4297         u_int32_t        io_min;
4298         u_int32_t        xsize;
4299         u_int32_t        devblocksize;
4300         u_int32_t        mem_alignment_mask;
4301         u_int32_t        max_upl_size;
4302         u_int32_t        max_rd_size;
4303         u_int32_t        max_rd_ahead;
4304         u_int32_t        max_vector_size;
4305         boolean_t        strict_uncached_IO = FALSE;
4306         boolean_t        io_throttled = FALSE;
4307
4308         u_int32_t        vector_upl_iosize = 0;
4309         int              issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
4310         off_t            v_upl_uio_offset = 0;
4311         int              vector_upl_index=0;
4312         upl_t            vector_upl = NULL;
4313         cl_direct_read_lock_t *lock = NULL;
4314
4315         user_addr_t      orig_iov_base = 0;
4316         user_addr_t      last_iov_base = 0;
4317         user_addr_t      next_iov_base = 0;
4318
4319         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4320                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4321
4322         max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4323
4324         max_rd_size = max_upl_size;
4325         max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4326
4327         io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4328
4329         if (flags & IO_PASSIVE)
4330                 io_flag |= CL_PASSIVE;
4331
4332         if (flags & IO_ENCRYPTED) {
4333                 io_flag |= CL_RAW_ENCRYPTED;
4334         }
4335
4336         if (flags & IO_NOCACHE) {
4337                 io_flag |= CL_NOCACHE;
4338         }
4339
4340         if (flags & IO_SKIP_ENCRYPTION)
4341                 io_flag |= CL_ENCRYPTED;
4342
4343         iostate.io_completed = 0;
4344         iostate.io_issued = 0;
4345         iostate.io_error = 0;
4346         iostate.io_wanted = 0;
4347
4348         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4349
4350         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4351         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4352
4353         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4354                      (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4355
4356         if (devblocksize == 1) {
4357                /*
4358                 * the AFP client advertises a devblocksize of 1
4359                 * however, its BLOCKMAP routine maps to physical
4360                 * blocks that are PAGE_SIZE in size...
4361                 * therefore we can't ask for I/Os that aren't page aligned
4362                 * or aren't multiples of PAGE_SIZE in size
4363                 * by setting devblocksize to PAGE_SIZE, we re-instate
4364                 * the old behavior we had before the mem_alignment_mask
4365                 * changes went in...
4366                 */
4367                devblocksize = PAGE_SIZE;
4368         }
4369
4370         strict_uncached_IO = ubc_strict_uncached_IO(vp);
4371
4372         orig_iov_base = uio_curriovbase(uio);
4373         last_iov_base = orig_iov_base;
4374
4375 next_dread:
4376         io_req_size = *read_length;
4377         iov_base = uio_curriovbase(uio);
4378
4379         max_io_size = filesize - uio->uio_offset;
4380
4381         if ((off_t)io_req_size > max_io_size)
4382                 io_req_size = max_io_size;
4383
4384         offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4385         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4386
4387         if (offset_in_file || offset_in_iovbase) {
4388                 /*
4389                  * one of the 2 important offsets is misaligned
4390                  * so fire an I/O through the cache for this entire vector
4391                  */
4392                 misaligned = 1;
4393         }
4394         if (iov_base & (devblocksize - 1)) {
4395                 /*
4396                  * the offset in memory must be on a device block boundary
4397                  * so that we can guarantee that we can generate an
4398                  * I/O that ends on a page boundary in cluster_io
4399                  */
4400                 misaligned = 1;
4401     }
4402
4403         /*
4404          * The user must request IO in aligned chunks.  If the
4405          * offset into the file is bad, or the userland pointer
4406          * is non-aligned, then we cannot service the encrypted IO request.
4407          */
4408         if ((flags & IO_ENCRYPTED) && (misaligned)) {
4409                 retval = EINVAL;
4410         }
4411
4412         /*
4413          * When we get to this point, we know...
4414          *  -- the offset into the file is on a devblocksize boundary
4415          */
4416
4417         while (io_req_size && retval == 0) {
4418                 u_int32_t io_start;
4419
4420                 if (cluster_is_throttled(vp)) {
4421                         /*
4422                          * we're in the throttle window, at the very least
4423                          * we want to limit the size of the I/O we're about
4424                          * to issue
4425                          */
4426                         max_rd_size  = THROTTLE_MAX_IOSIZE;
4427                         max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4428                         max_vector_size = THROTTLE_MAX_IOSIZE;
4429                 } else {
4430                         max_rd_size  = max_upl_size;
4431                         max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4432                         max_vector_size = MAX_VECTOR_UPL_SIZE;
4433                 }
4434                 io_start = io_size = io_req_size;
4435
4436                 /*
4437                  * First look for pages already in the cache
4438                  * and move them to user space.  But only do this
4439                  * check if we are not retrieving encrypted data directly
4440                  * from the filesystem;  those blocks should never
4441                  * be in the UBC.
4442                  *
4443                  * cluster_copy_ubc_data returns the resid
4444                  * in io_size
4445                  */
4446                 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
4447                         retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4448                 }
4449                 /*
4450                  * calculate the number of bytes actually copied
4451                  * starting size - residual
4452                  */
4453                 xsize = io_start - io_size;
4454
4455                 io_req_size -= xsize;
4456
4457                 if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4458                         /*
4459                          * We found something in the cache or we have an iov_base that's not
4460                          * page-aligned.
4461                          *
4462                          * Issue all I/O's that have been collected within this Vectored UPL.
4463                          */
4464                         if(vector_upl_index) {
4465                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4466                                 reset_vector_run_state();
4467                         }
4468
4469                         if(xsize)
4470                                 useVectorUPL = 0;
4471
4472                        /*
4473                         * After this point, if we are using the Vector UPL path and the base is
4474                         * not page-aligned then the UPL with that base will be the first in the vector UPL.
4475                         */
4476                 }
4477
4478                 /*
4479                  * check to see if we are finished with this request.
4480                  *
4481                  * If we satisfied this IO already, then io_req_size will be 0.
4482                  * Otherwise, see if the IO was mis-aligned and needs to go through
4483                  * the UBC to deal with the 'tail'.
4484                  *
4485                  */
4486                 if (io_req_size == 0 || (misaligned)) {
4487                         /*
4488                          * see if there's another uio vector to
4489                          * process that's of type IO_DIRECT
4490                          *
4491                          * break out of while loop to get there
4492                          */
4493                         break;
4494                 }
4495                 /*
4496                  * assume the request ends on a device block boundary
4497                  */
4498                 io_min = devblocksize;
4499
4500                 /*
4501                  * we can handle I/O's in multiples of the device block size
4502                  * however, if io_size isn't a multiple of devblocksize we
4503                  * want to clip it back to the nearest page boundary since
4504                  * we are going to have to go through cluster_read_copy to
4505                  * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4506                  * multiple, we avoid asking the drive for the same physical
4507                  * blocks twice.. once for the partial page at the end of the
4508                  * request and a 2nd time for the page we read into the cache
4509                  * (which overlaps the end of the direct read) in order to
4510                  * get at the overhang bytes
4511                  */
4512                 if (io_size & (devblocksize - 1)) {
4513                         if (flags & IO_ENCRYPTED) {
4514                                 /*
4515                                  * Normally, we'd round down to the previous page boundary to
4516                                  * let the UBC manage the zero-filling of the file past the EOF.
4517                                  * But if we're doing encrypted IO, we can't let any of
4518                                  * the data hit the UBC.  This means we have to do the full
4519                                  * IO to the upper block boundary of the device block that
4520                                  * contains the EOF. The user will be responsible for not
4521                                  * interpreting data PAST the EOF in its buffer.
4522                                  *
4523                                  * So just bump the IO back up to a multiple of devblocksize
4524                                  */
4525                                 io_size = ((io_size + devblocksize) & ~(devblocksize - 1));
4526                                 io_min = io_size;
4527                         }
4528                         else {
4529                                 /*
4530                                  * Clip the request to the previous page size boundary
4531                                  * since request does NOT end on a device block boundary
4532                                  */
4533                                 io_size &= ~PAGE_MASK;
4534                                 io_min = PAGE_SIZE;
4535                         }
4536
4537                 }
4538                 if (retval || io_size < io_min) {
4539                         /*
4540                          * either an error or we only have the tail left to
4541                          * complete via the copy path...
4542                          * we may have already spun some portion of this request
4543                          * off as async requests... we need to wait for the I/O
4544                          * to complete before returning
4545                          */
4546                         goto wait_for_dreads;
4547                 }
4548
4549                 /*
4550                  * Don't re-check the UBC data if we are looking for uncached IO
4551                  * or asking for encrypted blocks.
4552                  */
4553                 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
4554
4555                         if ((xsize = io_size) > max_rd_size)
4556                                 xsize = max_rd_size;
4557
4558                         io_size = 0;
4559
4560                         if (!lock) {
4561                                 /*
4562                                  * We hold a lock here between the time we check the
4563                                  * cache and the time we issue I/O.  This saves us
4564                                  * from having to lock the pages in the cache.  Not
4565                                  * all clients will care about this lock but some
4566                                  * clients may want to guarantee stability between
4567                                  * here and when the I/O is issued in which case they
4568                                  * will take the lock exclusively.
4569                                  */
4570                                 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4571                         }
4572
4573                         ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4574
4575                         if (io_size == 0) {
4576                                 /*
4577                                  * a page must have just come into the cache
4578                                  * since the first page in this range is no
4579                                  * longer absent, go back and re-evaluate
4580                                  */
4581                                 continue;
4582                         }
4583                 }
4584                 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
4585                         if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4586                                 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
4587                                         /*
4588                                          * we're in the throttle window and at least 1 I/O
4589                                          * has already been issued by a throttleable thread
4590                                          * in this window, so return with EAGAIN to indicate
4591                                          * to the FS issuing the cluster_read call that it
4592                                          * should now throttle after dropping any locks
4593                                          */
4594                                         throttle_info_update_by_mount(vp->v_mount);
4595
4596                                         io_throttled = TRUE;
4597                                         goto wait_for_dreads;
4598                                 }
4599                         }
4600                 }
4601                 if (io_size > max_rd_size)
4602                         io_size = max_rd_size;
4603
4604                 iov_base = uio_curriovbase(uio);
4605
4606                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4607                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
4608
4609                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4610                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4611
4612                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
4613                         no_zero_fill = 1;
4614                 else
4615                         no_zero_fill = 0;
4616
4617                 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4618                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4619                         pages_in_pl = 0;
4620                         upl_size = upl_needed_size;
4621                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
4622                                   | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
4623                         if (no_zero_fill)
4624                                 upl_flags |= UPL_NOZEROFILL;
4625                         if (force_data_sync)
4626                                 upl_flags |= UPL_FORCE_DATA_SYNC;
4627
4628                         kret = vm_map_create_upl(map,
4629                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4630                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
4631
4632                         if (kret != KERN_SUCCESS) {
4633                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4634                                              (int)upl_offset, upl_size, io_size, kret, 0);
4635                                 /*
4636                                  * failed to get pagelist
4637                                  *
4638                                  * we may have already spun some portion of this request
4639                                  * off as async requests... we need to wait for the I/O
4640                                  * to complete before returning
4641                                  */
4642                                 goto wait_for_dreads;
4643                         }
4644                         pages_in_pl = upl_size / PAGE_SIZE;
4645                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4646
4647                         for (i = 0; i < pages_in_pl; i++) {
4648                                 if (!upl_page_present(pl, i))
4649                                         break;
4650                         }
4651                         if (i == pages_in_pl)
4652                                 break;
4653
4654                         ubc_upl_abort(upl, 0);
4655                 }
4656                 if (force_data_sync >= 3) {
4657                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4658                                      (int)upl_offset, upl_size, io_size, kret, 0);
4659
4660                         goto wait_for_dreads;
4661                 }
4662                 /*
4663                  * Consider the possibility that upl_size wasn't satisfied.
4664                  */
4665                 if (upl_size < upl_needed_size) {
4666                         if (upl_size && upl_offset == 0)
4667                                 io_size = upl_size;
4668                         else
4669                                 io_size = 0;
4670                 }
4671                 if (io_size == 0) {
4672                         ubc_upl_abort(upl, 0);
4673                         goto wait_for_dreads;
4674                 }
4675                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4676                              (int)upl_offset, upl_size, io_size, kret, 0);
4677
4678                 if(useVectorUPL) {
4679                         vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4680                         if(end_off)
4681                                 issueVectorUPL = 1;
4682                         /*
4683                          * After this point, if we are using a vector UPL, then
4684                          * either all the UPL elements end on a page boundary OR
4685                          * this UPL is the last element because it does not end
4686                          * on a page boundary.
4687                          */
4688                 }
4689
4690                 /*
4691                  * request asynchronously so that we can overlap
4692                  * the preparation of the next I/O
4693                  * if there are already too many outstanding reads
4694                  * wait until some have completed before issuing the next read
4695                  */
4696                 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4697
4698                 if (iostate.io_error) {
4699                         /*
4700                          * one of the earlier reads we issued ran into a hard error
4701                          * don't issue any more reads, cleanup the UPL
4702                          * that was just created but not used, then
4703                          * go wait for any other reads to complete before
4704                          * returning the error to the caller
4705                          */
4706                         ubc_upl_abort(upl, 0);
4707
4708                         goto wait_for_dreads;
4709                 }
4710                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4711                              upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4712
4713                 if(!useVectorUPL) {
4714                         if (no_zero_fill)
4715                                 io_flag &= ~CL_PRESERVE;
4716                         else
4717                                 io_flag |= CL_PRESERVE;
4718
4719                         retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4720
4721                 } else {
4722
4723                         if(!vector_upl_index) {
4724                                 vector_upl = vector_upl_create(upl_offset);
4725                                 v_upl_uio_offset = uio->uio_offset;
4726                                 vector_upl_offset = upl_offset;
4727                         }
4728
4729                         vector_upl_set_subupl(vector_upl,upl, upl_size);
4730                         vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4731                         vector_upl_index++;
4732                         vector_upl_size += upl_size;
4733                         vector_upl_iosize += io_size;
4734
4735                         if(issueVectorUPL || vector_upl_index ==  MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4736                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize,  io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4737                                 reset_vector_run_state();
4738                         }
4739                 }
4740                 last_iov_base = iov_base + io_size;
4741
4742                 if (lock) {
4743                         // We don't need to wait for the I/O to complete
4744                         cluster_unlock_direct_read(lock);
4745                         lock = NULL;
4746                 }
4747
4748                 /*
4749                  * update the uio structure
4750                  */
4751                 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
4752                         uio_update(uio, (user_size_t)max_io_size);
4753                 }
4754                 else {
4755                         uio_update(uio, (user_size_t)io_size);
4756                 }
4757                 /*
4758                  * Under normal circumstances, the io_size should not be
4759                  * bigger than the io_req_size, but we may have had to round up
4760                  * to the end of the page in the encrypted IO case.  In that case only,
4761                  * ensure that we only decrement io_req_size to 0.
4762                  */
4763                 if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) {
4764                         io_req_size = 0;
4765                 }
4766                 else {
4767                         io_req_size -= io_size;
4768                 }
4769
4770                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
4771                              upl, (int)uio->uio_offset, io_req_size, retval, 0);
4772
4773         } /* end while */
4774
4775         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
4776
4777                 retval = cluster_io_type(uio, read_type, read_length, 0);
4778
4779                 if (retval == 0 && *read_type == IO_DIRECT) {
4780
4781                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4782                                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4783
4784                         goto next_dread;
4785                 }
4786         }
4787
4788 wait_for_dreads:
4789
4790         if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
4791                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize,  io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4792                 reset_vector_run_state();
4793         }
4794
4795         // We don't need to wait for the I/O to complete
4796         if (lock)
4797                 cluster_unlock_direct_read(lock);
4798
4799         /*
4800          * make sure all async reads that are part of this stream
4801          * have completed before we return
4802          */
4803         cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
4804
4805         if (iostate.io_error)
4806                 retval = iostate.io_error;
4807
4808         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4809
4810         if (io_throttled == TRUE && retval == 0)
4811                 retval = EAGAIN;
4812
4813         for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
4814                 /*
4815                  * This is specifically done for pmap accounting purposes.
4816                  * vm_pre_fault() will call vm_fault() to enter the page into
4817                  * the pmap if there isn't _a_ physical page for that VA already.
4818                  */
4819                 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
4820         }
4821
4822         if (io_req_size && retval == 0) {
4823                 /*
4824                  * we couldn't handle the tail of this request in DIRECT mode
4825                  * so fire it through the copy path
4826                  */
4827                 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
4828
4829                 *read_type = IO_UNKNOWN;
4830         }
4831         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
4832                      (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
4833
4834         return (retval);
4835 }
4836
4837
/*
 * cluster_read_contig
 *
 * Read path for uio vectors whose target memory is physically contiguous
 * (the loop below keeps going only while cluster_io_type() reports
 * IO_CONTIG).  For each vector, and for at most MAX_VECTS vectors per call:
 *   - a UPL is obtained over the target range with vm_map_get_upl()
 *   - a head that doesn't start on a device block boundary, or a request
 *     smaller than one device block, is read via cluster_align_phys_io()
 *   - the device-block aligned middle is issued through cluster_io() as
 *     asynchronous reads of at most MAX_IO_CONTIG_SIZE bytes each
 *   - a trailing partial device block is read via cluster_align_phys_io()
 *     once all of the asynchronous reads have completed
 *
 * vp            - vnode being read
 * uio           - describes the destination; advanced as reads are issued
 * filesize      - reads are clipped to this size
 * read_type     - in/out: updated by cluster_io_type() when this routine
 *                 looks at the next vector, set to IO_UNKNOWN when it
 *                 doesn't (see the end of the main pass)... left untouched
 *                 on the paths that jump straight to wait_for_creads
 * read_length   - in: length of the current contiguous vector
 *                 out: updated by cluster_io_type() along with read_type
 * callback,
 * callback_arg  - passed through to cluster_syncup(), cluster_io() and
 *                 cluster_align_phys_io()
 * flags         - IO_PASSIVE and IO_NOCACHE are honored
 *
 * Returns 0 on success.  EINVAL is returned when the UPL can't be obtained,
 * comes back smaller than requested, or the target address doesn't satisfy
 * the mount's alignment mask; otherwise the error is whatever
 * cluster_align_phys_io(), cluster_io(), cluster_io_type() or the
 * asynchronous I/O state (iostate.io_error) reported.
 */
4838 static int
4839 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4840                     int (*callback)(buf_t, void *), void *callback_arg, int flags)
4841 {
4842         upl_page_info_t *pl;
4843         upl_t            upl[MAX_VECTS];
4844         vm_offset_t      upl_offset;
4845         addr64_t         dst_paddr = 0;
4846         user_addr_t      iov_base;
4847         off_t            max_size;
4848         upl_size_t       upl_size;
4849         vm_size_t        upl_needed_size;
4850         mach_msg_type_number_t  pages_in_pl;
4851         upl_control_flags_t upl_flags;
4852         kern_return_t    kret;
4853         struct clios     iostate;
4854         int              error= 0;
4855         int              cur_upl = 0;
4856         int              num_upl = 0;
4857         int              n;
4858         u_int32_t        xsize;
4859         u_int32_t        io_size;
4860         u_int32_t        devblocksize;
4861         u_int32_t        mem_alignment_mask;
4862         u_int32_t        tail_size = 0;
4863         int              bflag;
4864
             /*
              * translate the caller's IO_* request flags into the CL_* bits
              * that get OR'ed into every cluster_io() call issued below
              */
4865         if (flags & IO_PASSIVE)
4866                 bflag = CL_PASSIVE;
4867         else
4868                 bflag = 0;
4869
4870         if (flags & IO_NOCACHE)
4871                 bflag |= CL_NOCACHE;
4872
4873         /*
4874          * When we enter this routine, we know
4875          *  -- the read_length will not exceed the current iov_len
4876          *  -- the target address is physically contiguous for read_length
4877          */
4878         cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
4879
4880         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4881         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4882
             /*
              * iostate is handed to every cluster_io() call so that
              * cluster_iostate_wait() can hold off new reads while too many
              * are outstanding and drain them all before we return
              */
4883         iostate.io_completed = 0;
4884         iostate.io_issued = 0;
4885         iostate.io_error = 0;
4886         iostate.io_wanted = 0;
4887
4888         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4889
             /*
              * one pass per physically contiguous vector...
              * cur_upl is the slot in upl[] used by the current pass
              */
4890 next_cread:
4891         io_size = *read_length;
4892
4893         max_size = filesize - uio->uio_offset;
4894
4895         if (io_size > max_size)
4896                 io_size = max_size;
4897
4898         iov_base = uio_curriovbase(uio);
4899
4900         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4901         upl_needed_size = upl_offset + io_size;
4902
4903         pages_in_pl = 0;
4904         upl_size = upl_needed_size;
4905         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
4906                    | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
4907
4908
4909         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
4910                      (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
4911
4912         vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4913         kret = vm_map_get_upl(map,
4914                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4915                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
4916
4917         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
4918                      (int)upl_offset, upl_size, io_size, kret, 0);
4919
4920         if (kret != KERN_SUCCESS) {
4921                 /*
4922                  * failed to get pagelist
4923                  */
4924                 error = EINVAL;
4925                 goto wait_for_creads;
4926         }
             /*
              * upl[cur_upl] is now valid... counting it here means the
              * release loop at the bottom of the routine will abort it
              * on every exit path, including the error exits below
              */
4927         num_upl++;
4928
4929         if (upl_size < upl_needed_size) {
4930                 /*
4931                  * The upl_size wasn't satisfied.
4932                  */
4933                 error = EINVAL;
4934                 goto wait_for_creads;
4935         }
4936         pl = ubc_upl_pageinfo(upl[cur_upl]);
4937
             /*
              * byte address of the first destination byte: the value
              * upl_phys_page() returns for page 0 of the UPL shifted up by
              * PAGE_SHIFT, plus the offset of the vector within that page
              */
4938         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
4939
             /*
              * head of the request: while the file offset isn't on a device
              * block boundary, or less than a full device block remains,
              * read at most up to the next block boundary through
              * cluster_align_phys_io()
              *
              * NOTE(review): the loop condition re-reads uio->uio_offset but
              * nothing in this loop updates the uio directly, so
              * cluster_align_phys_io() is presumably advancing it - confirm
              */
4940         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
4941                 u_int32_t   head_size;
4942
4943                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
4944
4945                 if (head_size > io_size)
4946                         head_size = io_size;
4947
4948                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
4949
4950                 if (error)
4951                         goto wait_for_creads;
4952
4953                 upl_offset += head_size;
4954                 dst_paddr  += head_size;
4955                 io_size    -= head_size;
4956
                     /*
                      * keep the local copy of the target address in step
                      * for the alignment check that follows the loop
                      */
4957                 iov_base   += head_size;
4958         }
4959         if ((u_int32_t)iov_base & mem_alignment_mask) {
4960                 /*
4961                  * request doesn't set up on a memory boundary
4962                  * the underlying DMA engine can handle...
4963                  * return an error instead of going through
4964                  * the slow copy path since the intent of this
4965                  * path is direct I/O to device memory
4966                  */
4967                 error = EINVAL;
4968                 goto wait_for_creads;
4969         }
4970
             /*
              * split off a trailing partial device block... it is read at
              * wait_for_creads, after the asynchronous reads have drained
              */
4971         tail_size = io_size & (devblocksize - 1);
4972
4973         io_size  -= tail_size;
4974
4975         while (io_size && error == 0) {
4976
4977                 if (io_size > MAX_IO_CONTIG_SIZE)
4978                         xsize = MAX_IO_CONTIG_SIZE;
4979                 else
4980                         xsize = io_size;
4981                 /*
4982                  * request asynchronously so that we can overlap
4983                  * the preparation of the next I/O... we'll do
4984                  * the commit after all the I/O has completed
4985                  * since its all issued against the same UPL
4986                  * if there are already too many outstanding reads
4987                  * wait until some have completed before issuing the next
4988                  */
4989                 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
4990
4991                 if (iostate.io_error) {
4992                         /*
4993                          * one of the earlier reads we issued ran into a hard error
4994                          * don't issue any more reads...
4995                          * go wait for any other reads to complete before
4996                          * returning the error to the caller
4997                          */
4998                         goto wait_for_creads;
4999                 }
5000                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5001                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5002                                    (buf_t)NULL, &iostate, callback, callback_arg);
5003                 /*
5004                  * The cluster_io read was issued successfully,
5005                  * update the uio structure
5006                  */
5007                 if (error == 0) {
5008                         uio_update(uio, (user_size_t)xsize);
5009
5010                         dst_paddr  += xsize;
5011                         upl_offset += xsize;
5012                         io_size    -= xsize;
5013                 }
5014         }
             /*
              * only look at the next vector if nothing has failed, this
              * vector ended on a device block boundary (no tail to read),
              * there's a free slot left in upl[] and we're still short of
              * the end of the file... in every other case that reaches this
              * point the caller is told to re-evaluate via IO_UNKNOWN
              */
5015         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5016
5017                 error = cluster_io_type(uio, read_type, read_length, 0);
5018
5019                 if (error == 0 && *read_type == IO_CONTIG) {
5020                         cur_upl++;
5021                         goto next_cread;
5022                 }
5023         } else
5024                 *read_type = IO_UNKNOWN;
5025
5026 wait_for_creads:
5027         /*
5028          * make sure all async reads that are part of this stream
5029          * have completed before we proceed
5030          */
5031         cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5032
5033         if (iostate.io_error)
5034                 error = iostate.io_error;
5035
5036         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5037
             /*
              * the trailing partial device block is only read if
              * everything that was issued ahead of it succeeded
              */
5038         if (error == 0 && tail_size)
5039                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5040
5041         for (n = 0; n < num_upl; n++)
5042                 /*
5043                  * just release our hold on each physically contiguous
5044                  * region without changing any state
5045                  */
5046                 ubc_upl_abort(upl[n], 0);
5047
5048         return (error);
5049 }
5050
5051
5052 static int
5053 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5054 {
5055         user_size_t      iov_len;
5056         user_addr_t      iov_base = 0;
5057         upl_t            upl;
5058         upl_size_t       upl_size;
5059         upl_control_flags_t upl_flags;
5060         int              retval = 0;
5061
5062         /*
5063          * skip over any emtpy vectors
5064          */
5065         uio_update(uio, (user_size_t)0);
5066
5067         iov_len = uio_curriovlen(uio);
5068
5069         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5070
5071         if (iov_len) {
5072                 iov_base = uio_curriovbase(uio);
5073                 /*
5074                  * make sure the size of the vector isn't too big...
5075                  * internally, we want to handle all of the I/O in
5076                  * chunk sizes that fit in a 32 bit int
5077                  */
5078                 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
5079                         upl_size = MAX_IO_REQUEST_SIZE;
5080                 else
5081                         upl_size = (u_int32_t)iov_len;
5082
5083                 upl_flags = UPL_QUERY_OBJECT_TYPE | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
5084
5085                 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5086                 if ((vm_map_get_upl(map,
5087                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5088                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
5089                         /*
5090                          * the user app must have passed in an invalid address
5091                          */
5092                         retval = EFAULT;
5093                 }
5094                 if (upl_size == 0)
5095                         retval = EFAULT;
5096
5097                 *io_length = upl_size;
5098
5099                 if (upl_flags & UPL_PHYS_CONTIG)
5100                         *io_type = IO_CONTIG;
5101                 else if (iov_len >= min_length)
5102                         *io_type = IO_DIRECT;
5103                 else
5104                         *io_type = IO_COPY;
5105         } else {
5106                 /*
5107                  * nothing left to do for this uio
5108                  */
5109                 *io_length = 0;
5110                 *io_type   = IO_UNKNOWN;
5111         }
5112         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5113
5114         return (retval);
5115 }
5116
5117
5118 /*
5119  * generate advisory I/O's in the largest chunks possible
5120  * the completed pages will be released into the VM cache
5121  */
5122 int
5123 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5124 {
5125         return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5126 }
5127
5128 int
5129 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5130 {
5131         upl_page_info_t *pl;
5132         upl_t            upl;
5133         vm_offset_t      upl_offset;
5134         int              upl_size;
5135         off_t            upl_f_offset;
5136         int              start_offset;
5137         int              start_pg;
5138         int              last_pg;
5139         int              pages_in_upl;
5140         off_t            max_size;
5141         int              io_size;
5142         kern_return_t    kret;
5143         int              retval = 0;
5144         int              issued_io;
5145         int              skip_range;
5146         uint32_t         max_io_size;
5147
5148
5149         if ( !UBCINFOEXISTS(vp))
5150                 return(EINVAL);
5151
5152         if (resid < 0)
5153                 return(EINVAL);
5154
5155         max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5156
5157         if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) {
5158                 if (max_io_size > speculative_prefetch_max_iosize)
5159                         max_io_size = speculative_prefetch_max_iosize;
5160         }
5161
5162         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5163                      (int)f_offset, resid, (int)filesize, 0, 0);
5164
5165         while (resid && f_offset < filesize && retval == 0) {
5166                 /*
5167                  * compute the size of the upl needed to encompass
5168                  * the requested read... limit each call to cluster_io
5169                  * to the maximum UPL size... cluster_io will clip if
5170                  * this exceeds the maximum io_size for the device,
5171                  * make sure to account for
5172                  * a starting offset that's not page aligned
5173                  */
5174                 start_offset = (int)(f_offset & PAGE_MASK_64);
5175                 upl_f_offset = f_offset - (off_t)start_offset;
5176                 max_size     = filesize - f_offset;
5177
5178                 if (resid < max_size)
5179                         io_size = resid;
5180                 else
5181                         io_size = max_size;
5182
5183                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5184                 if ((uint32_t)upl_size > max_io_size)
5185                         upl_size = max_io_size;
5186
5187                 skip_range = 0;
5188                 /*
5189                  * return the number of contiguously present pages in the cache
5190                  * starting at upl_f_offset within the file
5191                  */
5192                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5193
5194                 if (skip_range) {
5195                         /*
5196                          * skip over pages already present in the cache
5197                          */
5198                         io_size = skip_range - start_offset;
5199
5200                         f_offset += io_size;
5201                         resid    -= io_size;
5202
5203                         if (skip_range == upl_size)
5204                                 continue;
5205                         /*
5206                          * have to issue some real I/O
5207                          * at this point, we know it's starting on a page boundary
5208                          * because we've skipped over at least the first page in the request
5209                          */
5210                         start_offset = 0;
5211                         upl_f_offset += skip_range;
5212                         upl_size     -= skip_range;
5213                 }
5214                 pages_in_upl = upl_size / PAGE_SIZE;
5215
5216                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5217                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
5218
5219                 kret = ubc_create_upl(vp,
5220                                       upl_f_offset,
5221                                       upl_size,
5222                                       &upl,
5223                                       &pl,
5224                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
5225                 if (kret != KERN_SUCCESS)
5226                         return(retval);
5227                 issued_io = 0;
5228
5229                 /*
5230                  * before we start marching forward, we must make sure we end on
5231                  * a present page, otherwise we will be working with a freed
5232                  * upl
5233                  */
5234                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5235                         if (upl_page_present(pl, last_pg))
5236                                 break;
5237                 }
5238                 pages_in_upl = last_pg + 1;
5239
5240
5241                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5242                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
5243
5244
5245                 for (last_pg = 0; last_pg < pages_in_upl; ) {
5246                         /*
5247                          * scan from the beginning of the upl looking for the first
5248                          * page that is present.... this will become the first page in
5249                          * the request we're going to make to 'cluster_io'... if all
5250                          * of the pages are absent, we won't call through to 'cluster_io'
5251                          */
5252                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5253                                 if (upl_page_present(pl, start_pg))
5254                                         break;
5255                         }
5256
5257                         /*
5258                          * scan from the starting present page looking for an absent
5259                          * page before the end of the upl is reached, if we
5260                          * find one, then it will terminate the range of pages being
5261                          * presented to 'cluster_io'
5262                          */
5263                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5264                                 if (!upl_page_present(pl, last_pg))
5265                                         break;
5266                         }
5267
5268                         if (last_pg > start_pg) {
5269                                 /*
5270                                  * we found a range of pages that must be filled
5271                                  * if the last page in this range is the last page of the file
5272                                  * we may have to clip the size of it to keep from reading past
5273                                  * the end of the last physical block associated with the file
5274                                  */
5275                                 upl_offset = start_pg * PAGE_SIZE;
5276                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
5277
5278                                 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
5279                                         io_size = filesize - (upl_f_offset + upl_offset);
5280
5281                                 /*
5282                                  * issue an asynchronous read to cluster_io
5283                                  */
5284                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5285                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5286
5287                                 issued_io = 1;
5288                         }
5289                 }
5290                 if (issued_io == 0)
5291                         ubc_upl_abort(upl, 0);
5292
5293                 io_size = upl_size - start_offset;
5294
5295                 if (io_size > resid)
5296                         io_size = resid;
5297                 f_offset += io_size;
5298                 resid    -= io_size;
5299         }
5300
5301         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5302                      (int)f_offset, resid, retval, 0, 0);
5303
5304         return(retval);
5305 }
5306
5307
5308 int
5309 cluster_push(vnode_t vp, int flags)
5310 {
             /*
              * Convenience entry point: forwards to cluster_push_ext() with
              * no I/O callback and no callback argument, and returns whatever
              * cluster_push_ext() returns.
              */
5311         return cluster_push_ext(vp, flags, NULL, NULL);
5312 }
5313
5314
5315 int
5316 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5317 {
             /*
              * Push the dirty clusters recorded in vp's write-behind context.
              *
              * 'flags' is tested here for IO_DEFWRITE and IO_SYNC and is then
              * passed on to the push routines; 'callback' and 'callback_arg'
              * are handed through to them untouched.
              *
              * Returns 0 when there is nothing to do (no ubc info, a deferred
              * write on an MNT_DEFWRITE mount, no write-behind context, or a
              * non-IO_SYNC push with no clusters and no sparse map).  Otherwise
              * returns 1 after a sparse-map push, or the value returned by
              * cluster_try_push().
              */
5318         int     retval;
5319         int     my_sparse_wait = 0;     /* set when this call owns the cl_sparse_wait token */
5320         struct  cl_writebehind *wbp;
5321
5322         if ( !UBCINFOEXISTS(vp)) {
5323                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0);
5324                 return (0);
5325         }
5326         /* return if deferred write is set */
5327         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5328                 return (0);
5329         }
             /*
              * when a context is returned, wbp->cl_lockw is held from here on;
              * it is dropped on each of the exit paths below
              */
5330         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5331                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0);
5332                 return (0);
5333         }
5334         if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5335                 lck_mtx_unlock(&wbp->cl_lockw);
5336
5337                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0);
5338                 return(0);
5339         }
5340         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5341                      wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5342
5343         /*
5344          * if we have an fsync in progress, we don't want to allow any additional
5345          * sync/fsync/close(s) to occur until it finishes.
5346          * note that it's possible for writes to continue to occur to this file
5347          * while we're waiting and also once the fsync starts to clean if we're
5348          * in the sparse map case
5349          */
5350         while (wbp->cl_sparse_wait) {
5351                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0);
5352
5353                 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5354
5355                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0);
5356         }
5357         if (flags & IO_SYNC) {
                     /*
                      * take the serialization token; it is cleared again at the
                      * bottom of this function
                      */
5358                 my_sparse_wait = 1;
5359                 wbp->cl_sparse_wait = 1;
5360
5361                 /*
5362                  * this is an fsync (or equivalent)... we must wait for any existing async
5363                  * cleaning operations to complete before we evaluate the current state
5364                  * and finish cleaning... this ensures that all writes issued before this
5365                  * fsync actually get cleaned to the disk before this fsync returns
5366                  */
5367                 while (wbp->cl_sparse_pushes) {
5368                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0);
5369
5370                         msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5371
5372                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0);
5373                 }
5374         }
5375         if (wbp->cl_scmap) {
5376                 void    *scmap;
5377
5378                 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5379
                             /*
                              * detach the sparse map from the write-behind context
                              * and push it with cl_lockw dropped; cl_sparse_pushes
                              * counts the detached pushes currently in flight
                              */
5380                         scmap = wbp->cl_scmap;
5381                         wbp->cl_scmap = NULL;
5382
5383                         wbp->cl_sparse_pushes++;
5384
5385                         lck_mtx_unlock(&wbp->cl_lockw);
5386
5387                         sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
5388
5389                         lck_mtx_lock(&wbp->cl_lockw);
5390
5391                         wbp->cl_sparse_pushes--;
5392
                             /*
                              * an IO_SYNC pusher may be sleeping on
                              * &wbp->cl_sparse_pushes (see above) waiting
                              * for the count to drain to zero
                              */
5393                         if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
5394                                 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5395                 } else {
                             /*
                              * already at the limit for detached pushes: push the
                              * map in place, still holding cl_lockw
                              */
5396                         sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
5397                 }
5398                 retval = 1;
5399         } else  {
5400                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
5401         }
5402         lck_mtx_unlock(&wbp->cl_lockw);
5403
5404         if (flags & IO_SYNC)
5405                 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5406
5407         if (my_sparse_wait) {
5408                 /*
5409                  * I'm the owner of the serialization token
5410                  * clear it and wakeup anyone that is waiting
5411                  * for me to finish
5412                  */
5413                 lck_mtx_lock(&wbp->cl_lockw);
5414
5415                 wbp->cl_sparse_wait = 0;
5416                 wakeup((caddr_t)&wbp->cl_sparse_wait);
5417
5418                 lck_mtx_unlock(&wbp->cl_lockw);
5419         }
             /*
              * NOTE(review): wbp->cl_scmap and wbp->cl_number are read for the
              * trace below without cl_lockw held
              */
5420         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5421                      wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
5422
5423         return (retval);
5424 }
5425
5426
5427 __private_extern__ void
5428 cluster_release(struct ubc_info *ubc)
5429 {
             /*
              * Tear down the write-behind (cl_wbehind) and read-ahead
              * (cl_rahead) contexts hanging off 'ubc': each context's mutex
              * is destroyed, its storage is returned with FREE_ZONE, and both
              * pointers in 'ubc' are set to NULL.
              */
5430         struct cl_writebehind *wbp;
5431         struct cl_readahead   *rap;
5432
5433         if ((wbp = ubc->cl_wbehind)) {
5434
5435                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5436
                     /*
                      * presumably control op 0 releases the sparse cluster
                      * map -- confirm against vfs_drt_control
                      */
5437                 if (wbp->cl_scmap)
5438                         vfs_drt_control(&(wbp->cl_scmap), 0);
5439         } else {
5440                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5441         }
5442
             /* NOTE(review): redundant, rap is assigned again in the 'if' below */
5443         rap = ubc->cl_rahead;
5444
5445         if (wbp != NULL) {
5446                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5447                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
5448         }
5449         if ((rap = ubc->cl_rahead)) {
5450                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5451                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
5452         }
5453         ubc->cl_rahead  = NULL;
5454         ubc->cl_wbehind = NULL;
5455
             /* rap and wbp are passed to the trace as pointer values only */
5456         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5457 }
5458
5459
5460 static int
5461 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
5462 {
             /*
              * Push clusters held in the write-behind context 'wbp' via
              * cluster_push_now().
              *
              * push_flag bits tested here:
              *   PUSH_ALL   - push every cluster; otherwise stop after the first
              *   PUSH_DELAY - when all MAX_CLUSTERS slots are in use, push only
              *                if the clusters look like one sequential run
              *   PUSH_SYNC  - add IO_SYNC to the flags given to cluster_push_now()
              *
              * Only the IO_PASSIVE and IO_CLOSE bits of 'io_flags' are forwarded.
              * 'EOF', 'callback' and 'callback_arg' are passed through unchanged.
              *
              * Returns the number of free cluster slots left in 'wbp'
              * (MAX_CLUSTERS - wbp->cl_number).
              */
5463         int cl_index;
5464         int cl_index1;
5465         int min_index;
5466         int cl_len;
5467         int cl_pushed = 0;
5468         struct cl_wextent l_clusters[MAX_CLUSTERS];
5469         u_int  max_cluster_pgcount;
5470
5471
5472         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5473         /*
5474          * the write behind context exists and has
5475          * already been locked...
5476          */
5477         if (wbp->cl_number == 0)
5478                 /*
5479                  * no clusters to push
5480                  * return number of empty slots
5481                  */
5482                 return (MAX_CLUSTERS);
5483
5484         /*
5485          * make a local 'sorted' copy of the clusters
5486          * and clear wbp->cl_number so that new clusters can
5487          * be developed
5488          */
             /*
              * selection sort by b_addr: each outer pass picks the lowest
              * remaining non-empty cluster; a source entry is marked as
              * consumed by making it empty (b_addr == e_addr)
              */
5489         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5490                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5491                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
5492                                 continue;
5493                         if (min_index == -1)
5494                                 min_index = cl_index1;
5495                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
5496                                 min_index = cl_index1;
5497                 }
5498                 if (min_index == -1)
5499                         break;
5500
5501                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5502                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5503                 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5504
5505                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5506         }
5507         wbp->cl_number = 0;
5508
             /* number of non-empty clusters copied into l_clusters[] */
5509         cl_len = cl_index;
5510
5511         if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) {
5512                 int   i;
5513
5514                 /*
5515                  * determine if we appear to be writing the file sequentially
5516                  * if not, by returning without having pushed any clusters
5517                  * we will cause this vnode to be pushed into the sparse cluster mechanism
5518                  * used for managing more random I/O patterns
5519                  *
5520                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5521                  * that's why we're in try_push with PUSH_DELAY...
5522                  *
5523                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5524                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5525                  * so we can just make a simple pass through, up to, but not including the last one...
5526                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5527                  * are sequential
5528                  *
5529                  * we let the last one be partial as long as it was adjacent to the previous one...
5530                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5531                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5532                  */
                     /*
                      * on 'goto dont_try' nothing has been pushed and
                      * wbp->cl_number is still 0, so the merge code below copies
                      * every cluster back and this function returns 0
                      */
5533                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5534                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
5535                                 goto dont_try;
5536                         if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
5537                                 goto dont_try;
5538                 }
5539         }
5540         for (cl_index = 0; cl_index < cl_len; cl_index++) {
5541                 int     flags;
5542                 struct  cl_extent cl;
5543
5544                 flags = io_flags & (IO_PASSIVE|IO_CLOSE);
5545
5546                 /*
5547                  * try to push each cluster in turn...
5548                  */
5549                 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
5550                         flags |= IO_NOCACHE;
5551
5552                 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
5553                         flags |= IO_PASSIVE;
5554
5555                 if (push_flag & PUSH_SYNC)
5556                         flags |= IO_SYNC;
5557
5558                 cl.b_addr = l_clusters[cl_index].b_addr;
5559                 cl.e_addr = l_clusters[cl_index].e_addr;
5560
                     /*
                      * the return value of cluster_push_now() is not examined;
                      * the cluster is counted as pushed either way
                      */
5561                 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
5562
                     /*
                      * mark the local entry empty so the merge code
                      * below (which skips b_addr == e_addr) ignores it
                      */
5563                 l_clusters[cl_index].b_addr = 0;
5564                 l_clusters[cl_index].e_addr = 0;
5565
5566                 cl_pushed++;
5567
5568                 if ( !(push_flag & PUSH_ALL) )
5569                         break;
5570         }
5571 dont_try:
5572         if (cl_len > cl_pushed) {
5573                /*
5574                 * we didn't push all of the clusters, so
5575                 * lets try to merge them back in to the vnode
5576                 */
5577                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5578                         /*
5579                          * we picked up some new clusters while we were trying to
5580                          * push the old ones... this can happen because I've dropped
5581                          * the vnode lock... the sum of the
5582                          * leftovers plus the new cluster count exceeds our ability
5583                          * to represent them, so switch to the sparse cluster mechanism
5584                          *
5585                          * collect the active public clusters...
5586                          */
5587                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5588
                             /*
                              * sparse_cluster_switch() leaves wbp->cl_number at 0,
                              * so the leftovers can be staged in
                              * wbp->cl_clusters[] starting at slot 0
                              */
5589                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5590                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5591                                         continue;
5592                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5593                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5594                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5595
5596                                 cl_index1++;
5597                         }
5598                         /*
5599                          * update the cluster count
5600                          */
5601                         wbp->cl_number = cl_index1;
5602
5603                         /*
5604                          * and collect the original clusters that were moved into the
5605                          * local storage for sorting purposes
5606                          */
5607                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5608
5609                 } else {
5610                         /*
5611                          * we've got room to merge the leftovers back in
5612                          * just append them starting at the next 'hole'
5613                          * represented by wbp->cl_number
5614                          */
5615                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5616                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5617                                         continue;
5618
5619                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5620                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5621                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5622
5623                                 cl_index1++;
5624                         }
5625                         /*
5626                          * update the cluster count
5627                          */
5628                         wbp->cl_number = cl_index1;
5629                 }
5630         }
5631         return (MAX_CLUSTERS - wbp->cl_number);
5632 }
5633
5634
5635
5636 static int
5637 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5638 {
             /*
              * Write out the dirty pages of the page extent [cl->b_addr, cl->e_addr)
              * of 'vp', clipped to 'EOF'.
              *
              * A UPL covering the extent is created and each run of consecutive
              * dirty pages in it is handed to cluster_io() as one write.
              *
              * flags bits tested here: IO_PASSIVE, IO_SKIP_ENCRYPTION, IO_NOCACHE,
              * IO_SYNC (when clear, the I/O is issued with CL_ASYNC) and IO_CLOSE.
              *
              * Returns 0 if there was nothing to write, otherwise the first
              * non-zero value returned by cluster_io() (0 if every call
              * succeeded).  Panics if the UPL cannot be created.
              */
5639         upl_page_info_t *pl;
5640         upl_t            upl;
5641         vm_offset_t      upl_offset;
5642         int              upl_size;
5643         off_t            upl_f_offset;
5644         int              pages_in_upl;
5645         int              start_pg;
5646         int              last_pg;
5647         int              io_size;
5648         int              io_flags;
5649         int              upl_flags;
5650         int              bflag;
5651         int              size;
5652         int              error = 0;
5653         int              retval;
5654         kern_return_t    kret;
5655
5656         if (flags & IO_PASSIVE)
5657                 bflag = CL_PASSIVE;
5658         else
5659                 bflag = 0;
5660
5661         if (flags & IO_SKIP_ENCRYPTION)
5662                 bflag |= CL_ENCRYPTED;
5663
5664         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
5665                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
5666
             /* empty extent: nothing to push */
5667         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
5668                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
5669
5670                 return (0);
5671         }
5672         upl_size = pages_in_upl * PAGE_SIZE;
5673         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5674
5675         if (upl_f_offset + upl_size >= EOF) {
5676
5677                 if (upl_f_offset >= EOF) {
5678                         /*
5679                          * must have truncated the file and missed
5680                          * clearing a dangling cluster (i.e. it's completely
5681                          * beyond the new EOF)
5682                          */
5683                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
5684
5685                         return(0);
5686                 }
                     /*
                      * the extent straddles EOF: 'size' is the byte count up to
                      * EOF, and the UPL is shrunk to the page-rounded 'size'
                      */
5687                 size = EOF - upl_f_offset;
5688
5689                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5690                 pages_in_upl = upl_size / PAGE_SIZE;
5691         } else
5692                 size = upl_size;
5693
5694         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
5695
5696         /*
5697          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
5698          *
5699          * - only pages that are currently dirty are returned... these are the ones we need to clean
5700          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
5701          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
5702          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
5703          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
5704          *
5705          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
5706          */
5707
5708         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
5709                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
5710         else
5711                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
5712
5713         kret = ubc_create_upl(vp,
5714                                 upl_f_offset,
5715                                 upl_size,
5716                                 &upl,
5717                                 &pl,
5718                                 upl_flags);
             /* NOTE(review): the panic string names cluster_push, not cluster_push_now */
5719         if (kret != KERN_SUCCESS)
5720                 panic("cluster_push: failed to get pagelist");
5721
5722         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
5723
5724         /*
5725          * since we only asked for the dirty pages back
5726          * it's possible that we may only get a few or even none, so...
5727          * before we start marching forward, we must make sure we know
5728          * where the last present page is in the UPL, otherwise we could
5729          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
5730          * employed by commit_range and abort_range.
5731          */
5732         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5733                 if (upl_page_present(pl, last_pg))
5734                         break;
5735         }
             /* trim the page count so it ends at the last present page */
5736         pages_in_upl = last_pg + 1;
5737
5738         if (pages_in_upl == 0) {
                     /* no page is present in the UPL: abort it and return */
5739                 ubc_upl_abort(upl, 0);
5740
5741                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
5742                 return(0);
5743         }
5744
             /*
              * each pass issues one write for the run of dirty pages
              * [start_pg, last_pg); 'size' tracks the bytes remaining before
              * EOF so the final write can be clipped to it
              */
5745         for (last_pg = 0; last_pg < pages_in_upl; ) {
5746                 /*
5747                  * find the next dirty page in the UPL
5748                  * this will become the first page in the
5749                  * next I/O to generate
5750                  */
5751                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5752                         if (upl_dirty_page(pl, start_pg))
5753                                 break;
5754                         if (upl_page_present(pl, start_pg))
5755                                 /*
5756                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
5757                                  * just release these unchanged since we're not going
5758                                  * to steal them or change their state
5759                                  */
5760                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5761                 }
5762                 if (start_pg >= pages_in_upl)
5763                         /*
5764                          * done... no more dirty pages to push
5765                          */
5766                         break;
5767                 if (start_pg > last_pg)
5768                         /*
5769                          * skipped over some non-dirty pages
5770                          */
5771                         size -= ((start_pg - last_pg) * PAGE_SIZE);
5772
5773                 /*
5774                  * find a range of dirty pages to write
5775                  */
5776                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5777                         if (!upl_dirty_page(pl, last_pg))
5778                                 break;
5779                 }
5780                 upl_offset = start_pg * PAGE_SIZE;
5781
                     /* never write more than the bytes remaining before EOF */
5782                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
5783
5784                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
5785
5786                 if ( !(flags & IO_SYNC))
5787                         io_flags |= CL_ASYNC;
5788
5789                 if (flags & IO_CLOSE)
5790                         io_flags |= CL_CLOSE;
5791
5792                 if (flags & IO_NOCACHE)
5793                         io_flags |= CL_NOCACHE;
5794
5795                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5796                                     io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5797
                     /* remember only the first failure; keep issuing the remaining runs */
5798                 if (error == 0 && retval)
5799                         error = retval;
5800
5801                 size -= io_size;
5802         }
5803         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
5804
5805         return(error);
5806 }
5807
5808
5809 /*
5810  * sparse_cluster_switch is called with the write behind lock held
      *
      * Moves the clusters currently held in wbp->cl_clusters[] into the
      * sparse cluster map (wbp->cl_scmap): every page of every cluster is
      * looked up with ubc_page_op() and, if it is reported dirty
      * (UPL_POP_DIRTY), added to the map as a one-page extent via
      * sparse_cluster_add().  wbp->cl_number is reset to 0 on return.
      *
      * EOF, callback and callback_arg are only passed through to
      * sparse_cluster_add().
5811  */
5812 static void
5813 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5814 {
5815         int     cl_index;
5816
5817         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0);
5818
5819         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5820                 int       flags;
5821                 struct cl_extent cl;
5822
                     /* cl.b_addr doubles as the page cursor for this cluster */
5823                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
5824
                             /*
                              * presumably an ops argument of 0 only queries the
                              * page state -- confirm against ubc_page_op
                              */
5825                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
5826                                 if (flags & UPL_POP_DIRTY) {
5827                                         cl.e_addr = cl.b_addr + 1;
5828
5829                                         sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
5830                                 }
5831                         }
5832                 }
5833         }
5834         wbp->cl_number = 0;
5835
5836         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0);
5837 }
5838
5839
5840 /*
5841  * sparse_cluster_push must be called with the write-behind lock held if the scmap is
5842  * still associated with the write-behind context... however, if the scmap has been disassociated
5843  * from the write-behind context (the cluster_push case), the wb lock is not held
      *
      * Repeatedly pulls a cluster (byte offset + length) out of the sparse
      * map with vfs_drt_get_cluster() and writes it with cluster_push_now().
      * With PUSH_ALL set in push_flag the loop runs until
      * vfs_drt_get_cluster() fails; without it at most one cluster is pushed.
      *
      * Only the IO_PASSIVE and IO_CLOSE bits of io_flags are forwarded to
      * cluster_push_now(), and its return value is not examined.
5844  */
5845 static void
5846 sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
5847 {
5848         struct cl_extent cl;
5849         off_t           offset;
5850         u_int           length;
5851
5852         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0);
5853
             /*
              * NOTE(review): the meaning of control op 1 is not visible
              * here -- confirm against vfs_drt_control
              */
5854         if (push_flag & PUSH_ALL)
5855                 vfs_drt_control(scmap, 1);
5856
5857         for (;;) {
5858                 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
5859                         break;
5860
                     /* convert the byte range into a page extent (e_addr exclusive) */
5861                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
5862                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
5863
5864                 cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
5865
5866                 if ( !(push_flag & PUSH_ALL) )
5867                         break;
5868         }
5869         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
5870 }
5871
5872
5873 /*
5874  * sparse_cluster_add is called with the write behind lock held
      *
      * Records the page extent [cl->b_addr, cl->e_addr) in the sparse map
      * via vfs_drt_mark_pages().  Each time that call fails, one cluster is
      * pushed out of the map with sparse_cluster_push() (push_flag 0), the
      * range is advanced past the new_dirty pages reported by the failed
      * call, and the remainder is retried.
5875  */
5876 static void
5877 sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5878 {
5879         u_int   new_dirty;
5880         u_int   length;
5881         off_t   offset;
5882
5883         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
5884
             /* byte offset and byte length of the extent */
5885         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5886         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
5887
5888         while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
5889                 /*
5890                  * no room left in the map
5891                  * only a partial update was done
5892                  * push out some pages and try again
5893                  */
5894                 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
5895
                     /* skip the part of the range covered by new_dirty */
5896                 offset += (new_dirty * PAGE_SIZE_64);
5897                 length -= (new_dirty * PAGE_SIZE);
5898         }
5899         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
5900 }
5901
5902
5903 static int
5904 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5905 {
             /*
              * Copy 'xsize' bytes between the physical address 'usr_paddr' and
              * the file page containing uio->uio_offset, using a one-page UPL.
              *
              * With CL_READ set in 'flags' the copy goes from the file page to
              * usr_paddr; otherwise it goes from usr_paddr to the file page,
              * which is then written back with a synchronous cluster_io().
              *
              * Returns EINVAL if the UPL cannot be created, the cluster_io()
              * error if a read or write fails, otherwise 0.  The uio is advanced
              * by xsize only when no error occurred.
              *
              * assumes xsize does not run past the end of the page -- TODO
              * confirm with callers
              *
              * NOTE(review): 'flags' is tested against both IO_* bits
              * (IO_PASSIVE, IO_NOCACHE) and the CL_READ bit
              */
5906         upl_page_info_t  *pl;
5907         upl_t            upl;
5908         addr64_t         ubc_paddr;
5909         kern_return_t    kret;
5910         int              error = 0;
5911         int              did_read = 0;
5912         int              abort_flags;
5913         int              upl_flags;
5914         int              bflag;
5915
5916         if (flags & IO_PASSIVE)
5917                 bflag = CL_PASSIVE;
5918         else
5919                 bflag = 0;
5920
5921         if (flags & IO_NOCACHE)
5922                 bflag |= CL_NOCACHE;
5923
5924         upl_flags = UPL_SET_LITE;
5925
5926         if ( !(flags & CL_READ) ) {
5927                 /*
5928                  * "write" operation:  let the UPL subsystem know
5929                  * that we intend to modify the buffer cache pages
5930                  * we're gathering.
5931                  */
5932                 upl_flags |= UPL_WILL_MODIFY;
5933         } else {
5934                 /*
5935                  * indicate that there is no need to pull the
5936                  * mapping for this page... we're only going
5937                  * to read from it, not modify it.
5938                  */
5939                 upl_flags |= UPL_FILE_IO;
5940         }
             /* single page, starting at the page-aligned file offset */
5941         kret = ubc_create_upl(vp,
5942                               uio->uio_offset & ~PAGE_MASK_64,
5943                               PAGE_SIZE,
5944                               &upl,
5945                               &pl,
5946                               upl_flags);
5947
5948         if (kret != KERN_SUCCESS)
5949                 return(EINVAL);
5950
5951         if (!upl_valid_page(pl, 0)) {
5952                 /*
5953                  * issue a synchronous read to cluster_io
5954                  */
5955                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5956                                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5957                 if (error) {
5958                           ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
5959
5960                           return(error);
5961                 }
5962                 did_read = 1;
5963         }
             /* physical address of the byte at uio_offset within the UPL page */
5964         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
5965
5966 /*
5967  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
5968  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
5969  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
5970  *      way to do so without exporting them to kexts as well.
5971  */
5972         if (flags & CL_READ)
5973 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
5974                 copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);    /* Copy physical to physical and flush the destination */
5975         else
5976 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
5977                 copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);    /* Copy physical to physical and flush the source */
5978
5979         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
5980                 /*
5981                  * issue a synchronous write to cluster_io
5982                  */
5983                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5984                                    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5985         }
5986         if (error == 0)
5987                 uio_update(uio, (user_size_t)xsize);
5988
             /*
              * if this call performed the read, the page is released with
              * FREE_ON_EMPTY only; otherwise UPL_ABORT_DUMP_PAGES is added
              */
5989         if (did_read)
5990                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
5991         else
5992                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
5993
5994         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
5995
5996         return (error);
5997 }
5998
5999
6000
6001 int
6002 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6003 {
6004         int       pg_offset;
6005         int       pg_index;
6006         int       csize;
6007         int       segflg;
6008         int       retval = 0;
6009         int       xsize;
6010         upl_page_info_t *pl;
6011
6012         xsize = *io_resid;
6013
6014         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6015                      (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6016
6017         segflg = uio->uio_segflg;
6018
6019         switch(segflg) {
6020
6021           case UIO_USERSPACE32:
6022           case UIO_USERISPACE32:
6023                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6024                 break;
6025
6026           case UIO_USERSPACE:
6027           case UIO_USERISPACE:
6028                 uio->uio_segflg = UIO_PHYS_USERSPACE;
6029                 break;
6030
6031           case UIO_USERSPACE64:
6032           case UIO_USERISPACE64:
6033                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6034                 break;
6035
6036           case UIO_SYSSPACE:
6037                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6038                 break;
6039
6040         }
6041         pl = ubc_upl_pageinfo(upl);
6042
6043         pg_index  = upl_offset / PAGE_SIZE;
6044         pg_offset = upl_offset & PAGE_MASK;
6045         csize     = min(PAGE_SIZE - pg_offset, xsize);
6046
6047         while (xsize && retval == 0) {
6048                 addr64_t  paddr;
6049
6050                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6051
6052                 retval = uiomove64(paddr, csize, uio);
6053
6054                 pg_index += 1;
6055                 pg_offset = 0;
6056                 xsize    -= csize;
6057                 csize     = min(PAGE_SIZE, xsize);
6058         }
6059         *io_resid = xsize;
6060
6061         uio->uio_segflg = segflg;
6062
6063         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6064                      (int)uio->uio_offset, xsize, retval, segflg, 0);
6065
6066         return (retval);
6067 }
6068
6069
6070 int
6071 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6072 {
6073
6074         return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
6075 }
6076
6077
6078 static int
6079 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6080 {
6081         int       segflg;
6082         int       io_size;
6083         int       xsize;
6084         int       start_offset;
6085         int       retval = 0;
6086         memory_object_control_t  control;
6087
6088         io_size = *io_resid;
6089
6090         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6091                      (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6092
6093         control = ubc_getobject(vp, UBC_FLAGS_NONE);
6094
6095         if (control == MEMORY_OBJECT_CONTROL_NULL) {
6096                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6097                              (int)uio->uio_offset, io_size, retval, 3, 0);
6098
6099                 return(0);
6100         }
6101         segflg = uio->uio_segflg;
6102
6103         switch(segflg) {
6104
6105           case UIO_USERSPACE32:
6106           case UIO_USERISPACE32:
6107                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6108                 break;
6109
6110           case UIO_USERSPACE64:
6111           case UIO_USERISPACE64:
6112                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6113                 break;
6114
6115           case UIO_USERSPACE:
6116           case UIO_USERISPACE:
6117                 uio->uio_segflg = UIO_PHYS_USERSPACE;
6118                 break;
6119
6120           case UIO_SYSSPACE:
6121                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6122                 break;
6123         }
6124
6125         if ( (io_size = *io_resid) ) {
6126                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6127                 xsize = uio_resid(uio);
6128
6129                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6130                                                        start_offset, io_size, mark_dirty, take_reference);
6131                 xsize -= uio_resid(uio);
6132                 io_size -= xsize;
6133         }
6134         uio->uio_segflg = segflg;
6135         *io_resid       = io_size;
6136
6137         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6138                      (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6139
6140         return(retval);
6141 }
6142
6143
6144 int
6145 is_file_clean(vnode_t vp, off_t filesize)
6146 {
6147         off_t f_offset;
6148         int   flags;
6149         int   total_dirty = 0;
6150
6151         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6152                 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6153                         if (flags & UPL_POP_DIRTY) {
6154                                 total_dirty++;
6155                         }
6156                 }
6157         }
6158         if (total_dirty)
6159                 return(EINVAL);
6160
6161         return (0);
6162 }
6163
6164
6165
6166 /*
6167  * Dirty region tracking/clustering mechanism.
6168  *
6169  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6170  * dirty regions within a larger space (file).  It is primarily intended to
6171  * support clustering in large files with many dirty areas.
6172  *
6173  * The implementation assumes that the dirty regions are pages.
6174  *
6175  * To represent dirty pages within the file, we store bit vectors in a
6176  * variable-size circular hash.
6177  */
6178
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry (1MiB worth of pages).  Each hashtable entry
 * is aligned to this size within the file.
 */
#define DRT_BITVECTOR_PAGES		((1024 * 1024) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK clears the offset bits that select a byte within the
 * span covered by one hashtable entry (DRT_BITVECTOR_PAGES * PAGE_SIZE
 * bytes), so it depends on DRT_BITVECTOR_PAGES.
 *
 * The span is widened to 64 bits before it is inverted so that the mask
 * always preserves the upper bits of a 64-bit file offset, whatever the
 * integer type PAGE_SIZE happens to have.
 */
#define DRT_ADDRESS_MASK		(~((u_int64_t)(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
6194
/*
 * Hashtable address field handling.
 *
 * Each entry's dhe_control word holds the entry's aligned file offset in
 * the bits selected by DRT_ADDRESS_MASK.  The low-order bits, which are
 * always zero in an aligned offset, are reused to conserve space: they
 * hold the entry's page count.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  That reserved
 * value is DRT_HASH_COUNT_MASK itself (see DRT_HASH_VACATE/VACANT).
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)                                                                                          \
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =	0;								\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/*
 * Copy entry 'oi' of map 'oscm' (control word and bitvector) into entry
 * 'i' of map 'scm'.  Like the other statement macros here it expands to a
 * single do { } while (0) with no trailing semicolon, so the caller's own
 * ';' completes the statement and the macro is safe in an unbraced
 * if/else.
 */
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
6229
6230
6231 /*
6232  * Hash table moduli.
6233  *
6234  * Since the hashtable entry's size is dependent on the size of
6235  * the bitvector, and since the hashtable size is constrained to
6236  * both being prime and fitting within the desired allocation
6237  * size, these values need to be manually determined.
6238  *
6239  * With DRT_BITVECTOR_PAGES = 256 (4K pages), the entry size is 40 bytes.
6240  *
6241  * The small hashtable allocation is 1024 bytes, so the modulus is 23.
6242  * The large hashtable allocation is 16384 bytes, so the modulus is 401.
6243  */
6244 #define DRT_HASH_SMALL_MODULUS  23
6245 #define DRT_HASH_LARGE_MODULUS  401
6246
6247 /*
6248  * Physical memory required before the large hash modulus is permitted.
6249  *
6250  * On small memory systems, the large hash modulus can lead to physical
6251  * memory starvation, so we avoid using it there.
6252  */
6253 #define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
6254
6255 #define DRT_SMALL_ALLOCATION    1024    /* 104 bytes spare after 23 entries of 40 bytes */
6256 #define DRT_LARGE_ALLOCATION    16384   /* 344 bytes spare after 401 entries of 40 bytes */
6257
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long: page 'bit' of an entry lives in word
 * (bit / 32) at position (bit % 32).
 *
 * The single-bit masks are built from an unsigned 1 so that selecting
 * position 31 does not shift into the sign bit of a signed int.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

/* Non-zero when the bit is set; the value is the masked word, not 0/1. */
#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* Zero the (DRT_BITVECTOR_PAGES / 32) words in use for entry 'i'. */
#define DRT_BITVECTOR_CLEAR(scm, i) 				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* Copy the in-use bitvector words of entry 'oi' in 'oscm' to entry 'i' in 'scm'. */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6282
6283
6284
/*
 * Hashtable entry.
 *
 * dhe_control packs the entry's aligned file offset together with its
 * page count (see the DRT_HASH_* accessors); dhe_bitvector has one bit
 * per page covered by the entry.
 */
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
/*
* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE)
* Since PAGE_SIZE is only known at boot time,
*	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
*	-declare dhe_bitvector array for largest possible length
*
* The replacement list is fully parenthesized so the macro keeps its
* value (256) in any surrounding expression.
*/
#define MAX_DRT_BITVECTOR_PAGES	((1024 * 1024) / (4 * 1024))
	u_int32_t	dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
6300
6301 /*
6302  * Dirty Region Tracking structure.
6303  *
6304  * The hashtable is allocated entirely inside the DRT structure: the
6305  * scm_modulus entries of scm_hashtable[] follow this header directly.
6306  * The hash is a simple circular prime modulus arrangement, the structure
6307  * is resized from small to large if it overflows.
6308  */
6309
6310 struct vfs_drt_clustermap {
6311         u_int32_t               scm_magic;      /* sanity/detection; set to DRT_SCM_MAGIC by vfs_drt_alloc_map() */
6312 #define DRT_SCM_MAGIC           0x12020003
6313         u_int32_t               scm_modulus;    /* current ring size: DRT_HASH_SMALL_MODULUS or DRT_HASH_LARGE_MODULUS */
6314         u_int32_t               scm_buckets;    /* bumped each time vfs_drt_get_index() claims a slot */
6315         u_int32_t               scm_lastclean;  /* last entry we cleaned */
6316         u_int32_t               scm_iskips;     /* slot-skip statistic accumulated by vfs_drt_get_index() */
6317
6318         struct vfs_drt_hashentry scm_hashtable[0];      /* scm_modulus entries, stored inline */
6319 };
6320
6321
6322 #define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)   /* starting slot for a file offset */
6323 #define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)     /* slot after 'addr' (a slot index here), wrapping around the ring */
6324
6325 /*
6326  * Trace codes passed to vfs_drt_trace(), with the arguments logged for each.
6327  */
6328 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
6329 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
6330 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
6331 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
6332 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
6333                                                             * dirty */
6334                                                            /* 0, setcount */
6335                                                            /* 1 (clean, no map) */
6336                                                            /* 2 (map alloc fail) */
6337                                                            /* 3, resid (partial) */
6338 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
6339 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
6340                                                             * lastclean, iskips */
6341
6342
6343 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);   /* allocate a new map, or resize/compact *cmapp */
6344 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);      /* release a map's kernel allocation */
6345 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,   /* look up the existing slot for an offset */
6346         u_int64_t offset, int *indexp);
6347 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,    /* look up or claim a slot; may replace *cmapp */
6348         u_int64_t offset,
6349         int *indexp,
6350         int recursed);
6351 static kern_return_t    vfs_drt_do_mark_pages(  /* set (dirty != 0) or clear page bits for a range */
6352         void            **cmapp,        /* named 'private' in the definition */
6353         u_int64_t       offset,
6354         u_int           length,
6355         u_int           *setcountp,
6356         int             dirty);
6357 static void             vfs_drt_trace(  /* emit one of the DRT_DEBUG_* codes with up to four arguments */
6358         struct vfs_drt_clustermap *cmap,
6359         int code,
6360         int arg1,
6361         int arg2,
6362         int arg3,
6363         int arg4);
6364
6365
6366 /*
6367  * Allocate and initialise a sparse cluster map.
6368  *
6369  * Will allocate a new map, resize or compact an existing map.
6370  *
6371  * XXX we should probably have at least one intermediate map size,
6372  * as the 1:16 ratio seems a bit drastic.
6373  */
6374 static kern_return_t
6375 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6376 {
6377         struct vfs_drt_clustermap *cmap, *ocmap;
6378         kern_return_t   kret;
6379         u_int64_t       offset;
6380         u_int32_t       i;
6381         int             nsize, active_buckets, index, copycount;
6382
6383         ocmap = NULL;
6384         if (cmapp != NULL)
6385                 ocmap = *cmapp;
6386
6387         /*
6388          * Decide on the size of the new map.
6389          */
6390         if (ocmap == NULL) {
6391                 nsize = DRT_HASH_SMALL_MODULUS;
6392         } else {
6393                 /* count the number of active buckets in the old map */
6394                 active_buckets = 0;
6395                 for (i = 0; i < ocmap->scm_modulus; i++) {
6396                         if (!DRT_HASH_VACANT(ocmap, i) &&
6397                             (DRT_HASH_GET_COUNT(ocmap, i) != 0))
6398                                 active_buckets++;
6399                 }
6400                 /*
6401                  * If we're currently using the small allocation, check to
6402                  * see whether we should grow to the large one.
6403                  */
6404                 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6405                         /*
6406                          * If the ring is nearly full and we are allowed to
6407                          * use the large modulus, upgrade.
6408                          */
6409                         if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6410                             (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
6411                                 nsize = DRT_HASH_LARGE_MODULUS;
6412                         } else {
6413                                 nsize = DRT_HASH_SMALL_MODULUS;
6414                         }
6415                 } else {
6416                         /* already using the large modulus */
6417                         nsize = DRT_HASH_LARGE_MODULUS;
6418                         /*
6419                          * If the ring is completely full, there's
6420                          * nothing useful for us to do.  Behave as
6421                          * though we had compacted into the new
6422                          * array and return.
6423                          */
6424                         if (active_buckets >= DRT_HASH_LARGE_MODULUS)
6425                                 return(KERN_SUCCESS);
6426                 }
6427         }
6428
6429         /*
6430          * Allocate and initialise the new map.
6431          */
6432
6433         kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
6434             (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
6435         if (kret != KERN_SUCCESS)
6436                 return(kret);
6437         cmap->scm_magic = DRT_SCM_MAGIC;
6438         cmap->scm_modulus = nsize;
6439         cmap->scm_buckets = 0;
6440         cmap->scm_lastclean = 0;
6441         cmap->scm_iskips = 0;
6442         for (i = 0; i < cmap->scm_modulus; i++) {
6443                 DRT_HASH_CLEAR(cmap, i);
6444                 DRT_HASH_VACATE(cmap, i);
6445                 DRT_BITVECTOR_CLEAR(cmap, i);
6446         }
6447
6448         /*
6449          * If there's an old map, re-hash entries from it into the new map.
6450          */
6451         copycount = 0;
6452         if (ocmap != NULL) {
6453                 for (i = 0; i < ocmap->scm_modulus; i++) {
6454                         /* skip empty buckets */
6455                         if (DRT_HASH_VACANT(ocmap, i) ||
6456                             (DRT_HASH_GET_COUNT(ocmap, i) == 0))
6457                                 continue;
6458                         /* get new index */
6459                         offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6460                         kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6461                         if (kret != KERN_SUCCESS) {
6462                                 /* XXX need to bail out gracefully here */
6463                                 panic("vfs_drt: new cluster map mysteriously too small");
6464                                 index = 0;
6465                         }
6466                         /* copy */
6467                         DRT_HASH_COPY(ocmap, i, cmap, index);
6468                         copycount++;
6469                 }
6470         }
6471
6472         /* log what we've done */
6473         vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
6474
6475         /*
6476          * It's important to ensure that *cmapp always points to
6477          * a valid map, so we must overwrite it before freeing
6478          * the old map.
6479          */
6480         *cmapp = cmap;
6481         if (ocmap != NULL) {
6482                 /* emit stats into trace buffer */
6483                 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
6484                               ocmap->scm_modulus,
6485                               ocmap->scm_buckets,
6486                               ocmap->scm_lastclean,
6487                               ocmap->scm_iskips);
6488
6489                 vfs_drt_free_map(ocmap);
6490         }
6491         return(KERN_SUCCESS);
6492 }
6493
6494
6495 /*
6496  * Free a sparse cluster map.
6497  */
6498 static kern_return_t
6499 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6500 {
6501         kmem_free(kernel_map, (vm_offset_t)cmap,
6502                   (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
6503         return(KERN_SUCCESS);
6504 }
6505
6506
6507 /*
6508  * Find the hashtable slot currently occupied by an entry for the supplied offset.
6509  */
6510 static kern_return_t
6511 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
6512 {
6513         int             index;
6514         u_int32_t       i;
6515
6516         offset = DRT_ALIGN_ADDRESS(offset);
6517         index = DRT_HASH(cmap, offset);
6518
6519         /* traverse the hashtable */
6520         for (i = 0; i < cmap->scm_modulus; i++) {
6521
6522                 /*
6523                  * If the slot is vacant, we can stop.
6524                  */
6525                 if (DRT_HASH_VACANT(cmap, index))
6526                         break;
6527
6528                 /*
6529                  * If the address matches our offset, we have success.
6530                  */
6531                 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
6532                         *indexp = index;
6533                         return(KERN_SUCCESS);
6534                 }
6535
6536                 /*
6537                  * Move to the next slot, try again.
6538                  */
6539                 index = DRT_HASH_NEXT(cmap, index);
6540         }
6541         /*
6542          * It's not there.
6543          */
6544         return(KERN_FAILURE);
6545 }
6546
6547 /*
6548  * Find the hashtable slot for the supplied offset.  If we haven't allocated
6549  * one yet, allocate one and populate the address field.  Note that it will
6550  * not have a nonzero page count and thus will still technically be free, so
6551  * in the case where we are called to clean pages, the slot will remain free.
6552  */
6553 static kern_return_t
6554 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
6555 {
6556         struct vfs_drt_clustermap *cmap;
6557         kern_return_t   kret;
6558         u_int32_t       index;
6559         u_int32_t       i;
6560
6561         cmap = *cmapp;
6562
6563         /* look for an existing entry */
6564         kret = vfs_drt_search_index(cmap, offset, indexp);
6565         if (kret == KERN_SUCCESS)
6566                 return(kret);
6567
6568         /* need to allocate an entry */
6569         offset = DRT_ALIGN_ADDRESS(offset);
6570         index = DRT_HASH(cmap, offset);
6571
6572         /* scan from the index forwards looking for a vacant slot */
6573         for (i = 0; i < cmap->scm_modulus; i++) {
6574                 /* slot vacant? */
6575                 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
6576                         cmap->scm_buckets++;
6577                         if (index < cmap->scm_lastclean)
6578                                 cmap->scm_lastclean = index;
6579                         DRT_HASH_SET_ADDRESS(cmap, index, offset);
6580                         DRT_HASH_SET_COUNT(cmap, index, 0);
6581                         DRT_BITVECTOR_CLEAR(cmap, index);
6582                         *indexp = index;
6583                         vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
6584                         return(KERN_SUCCESS);
6585                 }
6586                 cmap->scm_iskips += i;
6587                 index = DRT_HASH_NEXT(cmap, index);
6588         }
6589
6590         /*
6591          * We haven't found a vacant slot, so the map is full.  If we're not
6592          * already recursed, try reallocating/compacting it.
6593          */
6594         if (recursed)
6595                 return(KERN_FAILURE);
6596         kret = vfs_drt_alloc_map(cmapp);
6597         if (kret == KERN_SUCCESS) {
6598                 /* now try to insert again */
6599                 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
6600         }
6601         return(kret);
6602 }
6603
6604 /*
6605  * Implementation of set dirty/clean.
6606  *
6607  * In the 'clean' case, not finding a map is OK.
6608  */
6609 static kern_return_t
6610 vfs_drt_do_mark_pages(
6611         void            **private,
6612         u_int64_t       offset,
6613         u_int           length,
6614         u_int           *setcountp,
6615         int             dirty)
6616 {
6617         struct vfs_drt_clustermap *cmap, **cmapp;
6618         kern_return_t   kret;
6619         int             i, index, pgoff, pgcount, setcount, ecount;
6620
6621         cmapp = (struct vfs_drt_clustermap **)private;
6622         cmap = *cmapp;
6623
6624         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
6625
6626         if (setcountp != NULL)
6627                 *setcountp = 0;
6628
6629         /* allocate a cluster map if we don't already have one */
6630         if (cmap == NULL) {
6631                 /* no cluster map, nothing to clean */
6632                 if (!dirty) {
6633                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
6634                         return(KERN_SUCCESS);
6635                 }
6636                 kret = vfs_drt_alloc_map(cmapp);
6637                 if (kret != KERN_SUCCESS) {
6638                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
6639                         return(kret);
6640                 }
6641         }
6642         setcount = 0;
6643
6644         /*
6645          * Iterate over the length of the region.
6646          */
6647         while (length > 0) {
6648                 /*
6649                  * Get the hashtable index for this offset.
6650                  *
6651                  * XXX this will add blank entries if we are clearing a range
6652                  * that hasn't been dirtied.
6653                  */
6654                 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
6655                 cmap = *cmapp;  /* may have changed! */
6656                 /* this may be a partial-success return */
6657                 if (kret != KERN_SUCCESS) {
6658                         if (setcountp != NULL)
6659                                 *setcountp = setcount;
6660                         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
6661
6662                         return(kret);
6663                 }
6664
6665                 /*
6666                  * Work out how many pages we're modifying in this
6667                  * hashtable entry.
6668                  */
6669                 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
6670                 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
6671
6672                 /*
6673                  * Iterate over pages, dirty/clearing as we go.
6674                  */
6675                 ecount = DRT_HASH_GET_COUNT(cmap, index);
6676                 for (i = 0; i < pgcount; i++) {
6677                         if (dirty) {
6678                                 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6679                                         DRT_HASH_SET_BIT(cmap, index, pgoff + i);
6680                                         ecount++;
6681                                         setcount++;
6682                                 }
6683                         } else {
6684                                 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6685                                         DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
6686                                         ecount--;
6687                                         setcount++;
6688                                 }
6689                         }
6690                 }
6691                 DRT_HASH_SET_COUNT(cmap, index, ecount);
6692
6693                 offset += pgcount * PAGE_SIZE;
6694                 length -= pgcount * PAGE_SIZE;
6695         }
6696         if (setcountp != NULL)
6697                 *setcountp = setcount;
6698
6699         vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
6700
6701         return(KERN_SUCCESS);
6702 }
6703
6704 /*
6705  * Mark a set of pages as dirty/clean.
6706  *
6707  * This is a public interface.
6708  *
6709  * cmapp
6710  *      Pointer to storage suitable for holding a pointer.  Note that
6711  *      this must either be NULL or a value set by this function.
6712  *
6713  * size
6714  *      Current file size in bytes.
6715  *
6716  * offset
6717  *      Offset of the first page to be marked as dirty, in bytes.  Must be
6718  *      page-aligned.
6719  *
6720  * length
6721  *      Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
6722  *
6723  * setcountp
6724  *      Number of pages newly marked dirty by this call (optional).
6725  *
6726  * Returns KERN_SUCCESS if all the pages were successfully marked.
6727  */
6728 static kern_return_t
6729 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
6730 {
6731         /* XXX size unused, drop from interface */
6732         return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
6733 }
6734
6735 #if 0   /* compiled out: clean-marking counterpart of vfs_drt_mark_pages() */
6736 static kern_return_t
6737 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
6738 {
6739         return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));  /* dirty = 0, no set count wanted */
6740 }
6741 #endif /* 0 */
6742
6743 /*
6744  * Get a cluster of dirty pages.
6745  *
6746  * This is a public interface.
6747  *
6748  * cmapp
6749  *      Pointer to storage managed by drt_mark_pages.  Note that this must
6750  *      be NULL or a value set by drt_mark_pages.
6751  *
6752  * offsetp
6753  *      Returns the byte offset into the file of the first page in the cluster.
6754  *
6755  * lengthp
6756  *      Returns the length in bytes of the cluster of dirty pages.
6757  *
6758  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
6759  * are no dirty pages meeting the minmum size criteria.  Private storage will
6760  * be released if there are no more dirty pages left in the map
6761  *
6762  */
6763 static kern_return_t
6764 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
6765 {
6766         struct vfs_drt_clustermap *cmap;
6767         u_int64_t       offset;
6768         u_int           length;
6769         u_int32_t       j;
6770         int             index, i, fs, ls;
6771
6772         /* sanity */
6773         if ((cmapp == NULL) || (*cmapp == NULL))
6774                 return(KERN_FAILURE);
6775         cmap = *cmapp;
6776
6777         /* walk the hashtable */
6778         for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
6779                 index = DRT_HASH(cmap, offset);
6780
6781                 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
6782                         continue;
6783
6784                 /* scan the bitfield for a string of bits */
6785                 fs = -1;
6786
6787                 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
6788                         if (DRT_HASH_TEST_BIT(cmap, index, i)) {
6789                                 fs = i;
6790                                 break;
6791                         }
6792                 }
6793                 if (fs == -1) {
6794                         /*  didn't find any bits set */
6795                         panic("vfs_drt: entry summary count > 0 but no bits set in map");
6796                 }
6797                 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
6798                         if (!DRT_HASH_TEST_BIT(cmap, index, i))
6799                                 break;
6800                 }
6801
6802                 /* compute offset and length, mark pages clean */
6803                 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
6804                 length = ls * PAGE_SIZE;
6805                 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
6806                 cmap->scm_lastclean = index;
6807
6808                 /* return successful */
6809                 *offsetp = (off_t)offset;
6810                 *lengthp = length;
6811
6812                 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
6813                 return(KERN_SUCCESS);
6814         }
6815         /*
6816          * We didn't find anything... hashtable is empty
6817          * emit stats into trace buffer and
6818          * then free it
6819          */
6820         vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
6821                       cmap->scm_modulus,
6822                       cmap->scm_buckets,
6823                       cmap->scm_lastclean,
6824                       cmap->scm_iskips);
6825
6826         vfs_drt_free_map(cmap);
6827         *cmapp = NULL;
6828
6829         return(KERN_FAILURE);
6830 }
6831
6832
6833 static kern_return_t
6834 vfs_drt_control(void **cmapp, int op_type)
6835 {
6836         struct vfs_drt_clustermap *cmap;
6837
6838         /* sanity */
6839         if ((cmapp == NULL) || (*cmapp == NULL))
6840                 return(KERN_FAILURE);
6841         cmap = *cmapp;
6842
6843         switch (op_type) {
6844         case 0:
6845                 /* emit stats into trace buffer */
6846                 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
6847                               cmap->scm_modulus,
6848                               cmap->scm_buckets,
6849                               cmap->scm_lastclean,
6850                               cmap->scm_iskips);
6851
6852                 vfs_drt_free_map(cmap);
6853                 *cmapp = NULL;
6854                 break;
6855
6856         case 1:
6857                 cmap->scm_lastclean = 0;
6858                 break;
6859         }
6860         return(KERN_SUCCESS);
6861 }
6862
6863
6864
6865 /*
6866  * Emit a summary of the state of the clustermap into the trace buffer
6867  * along with some caller-provided data.
6868  */
6869 #if KDEBUG
6870 static void
6871 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
6872 {
6873         KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
6874 }
6875 #else
6876 static void
6877 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
6878                           __unused int arg1, __unused int arg2, __unused int arg3,
6879                           __unused int arg4)
6880 {
6881 }
6882 #endif
6883
#if 0
/*
 * Compiled out.  Consistency check over the whole hashtable: for every
 * entry that is not vacant, count the bits set in its bit vector and
 * panic if that total differs from the entry's summary count.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
        int index, i;
        int bits_on;

        index = 0;
        while (index < cmap->scm_modulus) {
                if (!DRT_HASH_VACANT(cmap, index)) {
                        /* tally the set bits of this entry */
                        bits_on = 0;
                        i = 0;
                        while (i < DRT_BITVECTOR_PAGES) {
                                if (DRT_HASH_TEST_BIT(cmap, index, i))
                                        bits_on++;
                                i++;
                        }

                        if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
                                panic("bits_on = %d,  index = %d\n", bits_on, index);
                }
                index++;
        }
}
#endif