/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <kern/task.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_WRITE        0x02
#define CL_ASYNC        0x04
#define CL_COMMIT       0x08
#define CL_PAGEOUT      0x10
#define CL_NOZERO       0x40
#define CL_PAGEIN       0x80
#define CL_DEV_MEMORY   0x100
#define CL_PRESERVE     0x200
#define CL_THROTTLE     0x400
#define CL_KEEPCACHED   0x800
#define CL_DIRECT_IO    0x1000
#define CL_PASSIVE      0x2000
#define CL_IOSTREAMING  0x4000
#define CL_CLOSE        0x8000
#define CL_ENCRYPTED    0x10000
#define CL_RAW_ENCRYPTED        0x20000
#define CL_NOCACHE      0x40000

#define MAX_VECTOR_UPL_ELEMENTS 8
#define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE) * PAGE_SIZE
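/*
 * The CL_* values above are OR'd together to form the 'flags' argument of
 * cluster_io() and related routines: direction (CL_READ / CL_WRITE),
 * completion behavior (CL_ASYNC, CL_COMMIT), the originating path
 * (CL_PAGEIN, CL_PAGEOUT, CL_DIRECT_IO) and policy hints such as
 * CL_THROTTLE or CL_NOCACHE.  An illustrative (hypothetical) call:
 *
 *      cluster_io(vp, upl, 0, f_offset, size,
 *                 CL_READ | CL_ASYNC | CL_COMMIT, NULL, NULL, callback, callback_arg);
 */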
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
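/*
 * The vector_upl_* routines are VM-layer interfaces (declared extern here
 * rather than pulled in from a header) that let the cluster layer bundle
 * several sub-UPLs into a single "vector" UPL, bounded by
 * MAX_VECTOR_UPL_ELEMENTS and MAX_VECTOR_UPL_SIZE above, typically so that a
 * large direct I/O can be issued as one transaction.
 */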
struct clios {
        u_int  io_completed;       /* amount of io that has currently completed */
        u_int  io_issued;          /* amount of io that was successfully issued */
        int    io_error;           /* error code of first error encountered */
        int    io_wanted;          /* someone is sleeping waiting for a change in state */
        lck_mtx_t io_mtxp;         /* protects the state above (taken as &iostate->io_mtxp below) */
};
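/*
 * The difference (io_issued - io_completed) is the number of bytes still in
 * flight for a stream of I/Os; cluster_iostate_wait() (below) sets io_wanted
 * and sleeps until cluster_iodone() advances io_completed past the caller's
 * target and issues the matching wakeup.
 */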
static lck_grp_t        *cl_mtx_grp;
static lck_attr_t       *cl_mtx_attr;
static lck_grp_attr_t   *cl_mtx_grp_attr;
static lck_mtx_t        *cl_transaction_mtxp;
#define PUSH_DELAY      0x01
#define PUSH_ALL        0x02
#define PUSH_SYNC       0x04
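/*
 * The PUSH_* values are passed to the cluster push routines as 'push_flag':
 * PUSH_ALL pushes every delayed-write cluster on the vnode, PUSH_SYNC waits
 * for those pushes to complete, and PUSH_DELAY (as the name suggests) allows
 * the push to be deferred.  cluster_syncup(), for example, pushes with
 * PUSH_ALL | PUSH_SYNC.
 */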
static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
                      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
static int cluster_hard_throttle_on(vnode_t vp, uint32_t);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
                             int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
                               int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
                               int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
                              off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
                                int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
                                int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);

static void     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void     sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void     sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
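/*
 * The vfs_drt_* routines above implement the "dirty region tracking" map used
 * by the sparse cluster machinery: vfs_drt_mark_pages() records dirtied file
 * ranges in the opaque map hung off 'cmapp', vfs_drt_get_cluster() hands back
 * one contiguous dirty extent at a time for pushing, and vfs_drt_control()
 * manages the map itself.
 */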
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE      (MAX_UPL_SIZE * PAGE_SIZE)
#define MIN_DIRECT_WRITE_SIZE   (4 * PAGE_SIZE)

#define WRITE_THROTTLE          6
#define WRITE_THROTTLE_SSD      2
#define WRITE_BEHIND            1
#define WRITE_BEHIND_SSD        1

#if CONFIG_EMBEDDED
#define PREFETCH_SSD            1
uint32_t speculative_prefetch_max = 512;        /* maximum number of pages to use for a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use for a speculative read-ahead */
#else
#define PREFETCH_SSD            1
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use for a speculative read-ahead on SSDs*/
#endif
#define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))

int     ignore_is_ssd = 0;
int     speculative_reads_disabled = 0;
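/*
 * Worked example of the macros above (illustrative numbers): with
 * mnt_ioscale == 1 on a non-SSD (so PREFETCH rather than PREFETCH_SSD is
 * selected), MAX_PREFETCH(vp, size, 0) evaluates to size * (1 * PREFETCH),
 * i.e. the read-ahead window is the base transfer size scaled by the
 * PREFETCH constant and by the mount's advertised ioscale.
 */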
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT    0
#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024)
#define LEGACY_HARD_THROTTLE_MAX_IOSIZE (512 * 1024)

extern int32_t throttle_legacy_process_count;
int hard_throttle_on_root = 0;
uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE;
uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE;
struct timeval priority_IO_timestamp_for_root;

#if CONFIG_EMBEDDED
#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize)
#else
#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? hard_throttle_max_iosize : legacy_hard_throttle_max_iosize)
#endif
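/*
 * THROTTLE_MAX_IOSIZE is the largest single I/O the cluster layer will issue
 * once cluster_hard_throttle_on() reports that a vnode's I/O should be
 * throttled; cluster_io() clamps max_iosize to it and
 * cluster_hard_throttle_limit() reports it back to callers.
 */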
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, "");
void
cluster_init(void)
{
    /*
     * allocate lock group attribute and group
     */
    cl_mtx_grp_attr = lck_grp_attr_alloc_init();
    cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

    /*
     * allocate the lock attribute
     */
    cl_mtx_attr = lck_attr_alloc_init();

    cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

    if (cl_transaction_mtxp == NULL)
        panic("cluster_init: failed to allocate cl_transaction_mtxp");
}
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
    uint32_t    max_io_size;
    uint32_t    segcnt;
    uint32_t    maxcnt;

    switch (type) {

    case CL_READ:
        segcnt = mp->mnt_segreadcnt;
        maxcnt = mp->mnt_maxreadcnt;
        break;
    case CL_WRITE:
        segcnt = mp->mnt_segwritecnt;
        maxcnt = mp->mnt_maxwritecnt;
        break;
    default:
        segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
        maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
        break;
    }
    if (segcnt > MAX_UPL_SIZE) {
        /*
         * don't allow a size beyond the max UPL size we can create
         */
        segcnt = MAX_UPL_SIZE;
    }
    max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

    if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
        /*
         * don't allow a size smaller than the old fixed limit
         */
        max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
    } else {
        /*
         * make sure the size specified is a multiple of PAGE_SIZE
         */
        max_io_size &= ~PAGE_MASK;
    }
    return (max_io_size);
}
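/*
 * Example of the sizing logic above (illustrative numbers): a mount reporting
 * mnt_segreadcnt = 256 and mnt_maxreadcnt = 1MB yields
 * min(256 * PAGE_SIZE, 1MB) = 1MB with 4K pages, which is then floored at
 * MAX_UPL_TRANSFER * PAGE_SIZE and otherwise rounded down to a PAGE_SIZE
 * multiple.
 */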
#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
#define CLW_IONOCACHE           0x04
#define CLW_IOPASSIVE   0x08
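/*
 * The CLW_* values are the 'flags' argument to cluster_get_wbp():
 * CLW_ALLOCATE creates the write-behind context if the vnode doesn't have one
 * yet, CLW_RETURNLOCKED returns with cl_lockw held, while CLW_IONOCACHE and
 * CLW_IOPASSIVE carry the corresponding I/O hints through to the push path.
 */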
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
    struct ubc_info         *ubc;
    struct cl_readahead     *rap;

    ubc = vp->v_ubcinfo;

    if ((rap = ubc->cl_rahead) == NULL) {
        MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

        bzero(rap, sizeof *rap);
        lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

        vnode_lock(vp);

        if (ubc->cl_rahead == NULL)
            ubc->cl_rahead = rap;
        else {
            lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
            FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
            rap = ubc->cl_rahead;
        }
        vnode_unlock(vp);
    }
    if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
        return(rap);

    return ((struct cl_readahead *)NULL);
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
    struct ubc_info *ubc;
    struct cl_writebehind *wbp;

    ubc = vp->v_ubcinfo;

    if ((wbp = ubc->cl_wbehind) == NULL) {

        if ( !(flags & CLW_ALLOCATE))
            return ((struct cl_writebehind *)NULL);

        MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

        bzero(wbp, sizeof *wbp);
        lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

        vnode_lock(vp);

        if (ubc->cl_wbehind == NULL)
            ubc->cl_wbehind = wbp;
        else {
            lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
            FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
            wbp = ubc->cl_wbehind;
        }
        vnode_unlock(vp);
    }
    if (flags & CLW_RETURNLOCKED)
        lck_mtx_lock(&wbp->cl_lockw);

    return (wbp);
}
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
{
    struct cl_writebehind *wbp;

    if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

        if (wbp->cl_number) {
            lck_mtx_lock(&wbp->cl_lockw);

            cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg);

            lck_mtx_unlock(&wbp->cl_lockw);
        }
    }
}
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
    daddr64_t blkno;
    size_t    io_size;
    int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

    if (bootcache_check_fn) {
        if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
            return(0);

        if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
            return(1);
    }
    return(0);
}
static int
cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle)
{
    int throttle_type = 0;

    if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) )
        return(throttle_type);

    if (hard_throttle && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
        static struct timeval hard_throttle_maxelapsed = { 0, 100000 };
        struct timeval elapsed;

        if (hard_throttle_on_root)
            return(1);

        microuptime(&elapsed);
        timevalsub(&elapsed, &priority_IO_timestamp_for_root);

        if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
            return(1);
    }
    return(0);
}
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
    lck_mtx_lock(&iostate->io_mtxp);

    while ((iostate->io_issued - iostate->io_completed) > target) {

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
                     iostate->io_issued, iostate->io_completed, target, 0, 0);

        iostate->io_wanted = 1;
        msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
                     iostate->io_issued, iostate->io_completed, target, 0, 0);
    }
    lck_mtx_unlock(&iostate->io_mtxp);
}
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags)
{
    int upl_abort_code = 0;
    int page_in  = 0;
    int page_out = 0;

    if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
        /*
         * direct write of any flavor, or a direct read that wasn't aligned
         */
        ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
    else {
        if (io_flags & B_PAGEIO) {
            if (io_flags & B_READ)
                page_in  = 1;
            else
                page_out = 1;
        }
        if (io_flags & B_CACHE)
            /*
             * leave pages in the cache unchanged on error
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        else if (page_out && (error != ENXIO))
            /*
             * transient error... leave pages unchanged
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        else if (page_in)
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
        else
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

        ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
    }
    return (upl_abort_code);
}
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
    int     transaction_size = 0;
    struct  clios *iostate;
    boolean_t       transaction_complete = FALSE;

    cbp_head = (buf_t)(bp->b_trans_head);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
                 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
        boolean_t       need_wakeup = FALSE;

        lck_mtx_lock_spin(cl_transaction_mtxp);

        bp->b_flags |= B_TDONE;

        if (bp->b_flags & B_TWANTED) {
            CLR(bp->b_flags, B_TWANTED);
            need_wakeup = TRUE;
        }
        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
            /*
             * all I/O requests that are part of this transaction
             * have to complete before we can process it
             */
            if ( !(cbp->b_flags & B_TDONE)) {

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                             cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

                lck_mtx_unlock(cl_transaction_mtxp);

                if (need_wakeup == TRUE)
                    wakeup(bp);

            if (cbp->b_flags & B_EOT)
                transaction_complete = TRUE;

        lck_mtx_unlock(cl_transaction_mtxp);

        if (need_wakeup == TRUE)
            wakeup(bp);

        if (transaction_complete == FALSE) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         cbp_head, 0, 0, 0, 0);

    upl_offset = cbp->b_uploffset;
    b_flags    = cbp->b_flags;
    real_bp    = cbp->b_real_bp;
    zero_offset= cbp->b_validend;
    iostate    = (struct clios *)cbp->b_iostate;

        real_bp->b_dev = cbp->b_dev;

        if ((cbp->b_flags & B_ERROR) && error == 0)
            error = cbp->b_error;

        total_resid += cbp->b_resid;
        total_size  += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;

        if (cbp_next == NULL)
            /*
             * compute the overall size of the transaction
             * in case we created one that has 'holes' in it
             * 'total_size' represents the amount of I/O we
             * did, not the span of the transaction w/r to the UPL
             */
            transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

    if (error == 0 && total_resid)

        int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

        if (cliodone_func != NULL) {
            cbp_head->b_bcount = transaction_size;

            error = (*cliodone_func)(cbp_head, callback_arg);

        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

    free_io_buf(cbp_head);

        /*
         * someone has issued multiple I/Os asynchronously
         * and is waiting for them to complete (streaming)
         */
        lck_mtx_lock_spin(&iostate->io_mtxp);

        if (error && iostate->io_error == 0)
            iostate->io_error = error;

        iostate->io_completed += total_size;

        if (iostate->io_wanted) {
            /*
             * someone is waiting for the state of
             * this io stream to change
             */
            iostate->io_wanted = 0;

        lck_mtx_unlock(&iostate->io_mtxp);

            wakeup((caddr_t)&iostate->io_wanted);

    if (b_flags & B_COMMIT_UPL) {

        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

            upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags);

            upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if ((b_flags & B_PHYS) && (b_flags & B_READ))
                upl_flags |= UPL_COMMIT_SET_DIRTY;

                upl_flags |= UPL_COMMIT_INACTIVATE;

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);

            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;

        real_bp->b_resid = total_resid;

        buf_biodone(real_bp);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle)
{
    if (cluster_hard_throttle_on(vp, hard_throttle)) {
        *limit = THROTTLE_MAX_IOSIZE;
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
                 upl_offset, size, bp, 0, 0);

    if (bp == NULL || bp->b_datap == 0) {

        pl = ubc_upl_pageinfo(upl);

        if (upl_device_page(pl) == TRUE) {
            zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;

            bzero_phys_nc(zero_addr, size);

                page_index  = upl_offset / PAGE_SIZE;
                page_offset = upl_offset & PAGE_MASK;

                zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
                zero_cnt  = min(PAGE_SIZE - page_offset, size);

                bzero_phys(zero_addr, zero_cnt);

                upl_offset += zero_cnt;

        bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
                 upl_offset, size, 0, 0, 0);
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
    cbp_head->b_validend = zero_offset;
    cbp_tail->b_flags |= B_EOT;
}
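/*
 * B_EOT on the tail buffer marks the end of a transaction: cluster_iodone()
 * only tears the chain down (commit/abort of the UPL, wakeups, etc.) once it
 * has seen B_TDONE on every buf in the chain and B_EOT on the last one, so a
 * chain that hasn't been EOT'd yet can keep growing safely.
 */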
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
    if (async) {
        /*
         * async callback completion will not normally
         * generate a wakeup upon I/O completion...
         * by setting B_TWANTED, we will force a wakeup
         * to occur as any outstanding I/Os complete...
         * I/Os already completed will have B_TDONE already
         * set and won't cause us to block...
         * note that we're actually waiting for the bp to have
         * completed the callback function... only then
         * can we safely take back ownership of the bp
         */
        lck_mtx_lock_spin(cl_transaction_mtxp);

        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
              cbp->b_flags |= B_TWANTED;

        lck_mtx_unlock(cl_transaction_mtxp);
    }
    for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

            while (!ISSET(cbp->b_flags, B_TDONE)) {

                lck_mtx_lock_spin(cl_transaction_mtxp);

                if (!ISSET(cbp->b_flags, B_TDONE)) {
                    DTRACE_IO1(wait__start, buf_t, cbp);
                    (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
                    DTRACE_IO1(wait__done, buf_t, cbp);
                } else
                    lck_mtx_unlock(cl_transaction_mtxp);
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
    buf_t   cbp;
    int     error;

    /*
     * cluster_complete_transaction will
     * only be called if we've issued a complete chain in synchronous mode
     * or, we've already done a cluster_wait_IO on an incomplete chain
     */
    if (needwait) {
        for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
            buf_biowait(cbp);
    }
    /*
     * we've already waited on all of the I/Os in this transaction,
     * so mark all of the buf_t's in this transaction as B_TDONE
     * so that cluster_iodone sees the transaction as completed
     */
    for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
        cbp->b_flags |= B_TDONE;

    error = cluster_iodone(*cbp_head, callback_arg);

    if ( !(flags & CL_ASYNC) && error && *retval == 0) {
        if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
            *retval = error;
    }
    *cbp_head = (buf_t)NULL;
}
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
           int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
    buf_t   cbp_head = NULL;
    buf_t   cbp_tail = NULL;
    int     async_throttle = 0;
    vm_offset_t upl_end_offset;
    boolean_t   need_EOT = FALSE;

    /*
     * we currently don't support buffers larger than a page
     */
    if (real_bp && non_rounded_size > PAGE_SIZE)
        panic("%s(): Called with real buffer of size %d bytes which "
                "is greater than the maximum allowed size of "
                "%d bytes (the system PAGE_SIZE).\n",
                __FUNCTION__, non_rounded_size, PAGE_SIZE);

    /*
     * we don't want to do any funny rounding of the size for IO requests
     * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
     * belong to us... we can't extend (nor do we need to) the I/O to fill
     * out a full page.
     */
    if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
        /*
         * round the requested size up so that this I/O ends on a
         * page boundary in case this is a 'write'... if the filesystem
         * has blocks allocated to back the page beyond the EOF, we want to
         * make sure to write out the zero's that are sitting beyond the EOF
         * so that in case the filesystem doesn't explicitly zero this area
         * if a hole is created via a lseek/write beyond the current EOF,
         * it will return zeros when it's read back from the disk.  If the
         * physical allocation doesn't extend for the whole page, we'll
         * only write/read from the disk up to the end of this allocation
         * via the extent info returned from the VNOP_BLOCKMAP call.
         */
        pg_offset = upl_offset & PAGE_MASK;

        size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
    } else {
        /*
         * anyone advertising a blocksize of 1 byte probably
         * can't deal with us rounding up the request size
         * AFP is one such filesystem/device
         */
        size = non_rounded_size;
    }
    upl_end_offset = upl_offset + size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
    /*
     * Set the maximum transaction size to the maximum desired number of
     * buffers.
     */
    if (flags & CL_DEV_MEMORY)
        max_trans_count = 16;

    if (flags & CL_READ) {
        bmap_flags = VNODE_READ;

        max_iosize  = mp->mnt_maxreadcnt;
        max_vectors = mp->mnt_segreadcnt;
    } else {
        bmap_flags = VNODE_WRITE;

        max_iosize  = mp->mnt_maxwritecnt;
        max_vectors = mp->mnt_segwritecnt;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

    /*
     * make sure the maximum iosize is a
     * multiple of the page size
     */
    max_iosize  &= ~PAGE_MASK;

    /*
     * Ensure the maximum iosize is sensible.
     */
    if (!max_iosize)
        max_iosize = PAGE_SIZE;

    if (flags & CL_THROTTLE) {
        if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) {
            if (max_iosize > THROTTLE_MAX_IOSIZE)
                max_iosize = THROTTLE_MAX_IOSIZE;
            async_throttle = HARD_THROTTLE_MAXCNT;
        } else {
            if ( (flags & CL_DEV_MEMORY) )
                async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
            else {
                u_int max_cluster_size;

                max_cluster_size = MAX_CLUSTER_SIZE(vp);

                if (max_iosize > max_cluster_size)
                    max_cluster = max_cluster_size;
                else
                    max_cluster = max_iosize;

                if (size < max_cluster)
                    max_cluster = size;

                if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
                    scale = WRITE_THROTTLE_SSD;
                else
                    scale = WRITE_THROTTLE;

                if (flags & CL_CLOSE)
                    scale += MAX_CLUSTERS;

                async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
            }
        }
    }
    if (flags & (CL_PAGEIN | CL_PAGEOUT))
        io_flags |= B_PAGEIO;
    if (flags & (CL_IOSTREAMING))
        io_flags |= B_IOSTREAMING;
    if (flags & CL_COMMIT)
        io_flags |= B_COMMIT_UPL;
    if (flags & CL_DIRECT_IO)
        io_flags |= B_PHYS;
    if (flags & (CL_PRESERVE | CL_KEEPCACHED))
        io_flags |= B_CACHE;
    if (flags & CL_PASSIVE)
        io_flags |= B_PASSIVE;
    if (flags & CL_ENCRYPTED)
        io_flags |= B_ENCRYPTED_IO;
    if (vp->v_flag & VSYSTEM)
        io_flags |= B_META;

    if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
        /*
         * then we are going to end up
         * with a page that we can't complete (the file size wasn't a multiple
         * of PAGE_SIZE and we're trying to read to the end of the file
         * so we'll go ahead and zero out the portion of the page we can't
         * read in from the file
         */
        zero_offset = upl_offset + non_rounded_size;
    }
    while (size) {
        u_int   io_size_wanted;
        size_t  io_size_tmp;

        if (size > max_iosize)
            io_size = max_iosize;
        else
            io_size = size;

        io_size_wanted = io_size;
        io_size_tmp = (size_t)io_size;

        if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
            break;

        if (io_size_tmp > io_size_wanted)
            io_size = io_size_wanted;
        else
            io_size = (u_int)io_size_tmp;

        if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
            real_bp->b_blkno = blkno;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
                     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

        /*
         * vnop_blockmap didn't return an error... however, it did
         * return an extent size of 0 which means we can't
         * make forward progress on this I/O... a hole in the
         * file would be returned as a blkno of -1 with a non-zero io_size
         * a real extent is returned with a blkno != -1 and a non-zero io_size
         */

        if ( !(flags & CL_READ) && blkno == -1) {
            off_t   e_offset;
            int     pageout_flags;

            if (upl_get_internal_vectorupl(upl))
                panic("Vector UPLs should not take this code-path\n");
            /*
             * we're writing into a 'hole'
             */
            if (flags & CL_PAGEOUT) {
                /*
                 * if we got here via cluster_pageout
                 * then just error the request and return
                 * the 'hole' should already have been covered
                 */
            /*
             * we can get here if the cluster code happens to
             * pick up a page that was dirtied via mmap vs
             * a 'write' and the page targets a 'hole'...
             * i.e. the writes to the cluster were sparse
             * and the file was being written for the first time
             *
             * we can also get here if the filesystem supports
             * 'holes' that are less than PAGE_SIZE.... because
             * we can't know if the range in the page that covers
             * the 'hole' has been dirtied via an mmap or not,
             * we have to assume the worst and try to push the
             * entire page to storage.
             *
             * Try paging out the page individually before
             * giving up entirely and dumping it (the pageout
             * path will ensure that the zero extent accounting
             * has been taken care of before we get back into cluster_io)
             *
             * go direct to vnode_pageout so that we don't have to
             * unbusy the page from the UPL... we used to do this
             * so that we could call ubc_sync_range, but that results
             * in a potential deadlock if someone else races us to acquire
             * that page and wins and in addition needs one of the pages
             * we're continuing to hold in the UPL
             */
            pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

            if ( !(flags & CL_ASYNC))
                pageout_flags |= UPL_IOSYNC;
            if ( !(flags & CL_COMMIT))
                pageout_flags |= UPL_NOCOMMIT;

                /*
                 * first we have to wait for the current outstanding I/Os
                 * to complete... EOT hasn't been set yet on this transaction
                 * so the pages won't be released just because all of the current
                 * I/O linked to this transaction has completed...
                 */
                cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

                /*
                 * we've got a transaction that
                 * includes the page we're about to push out through vnode_pageout...
                 * find the last bp in the list which will be the one that
                 * includes the head of this page and round its iosize down
                 * to a page boundary...
                 */
                for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
                    last_cbp = cbp;

                cbp->b_bcount &= ~PAGE_MASK;

                if (cbp->b_bcount == 0) {
                    /*
                     * this buf no longer has any I/O associated with it
                     */
                    if (cbp == cbp_head) {
                        /*
                         * the buf we just freed was the only buf in
                         * this transaction... so there's no I/O to do
                         */
                    } else {
                        /*
                         * remove the buf we just freed from
                         * the transaction list
                         */
                        last_cbp->b_trans_next = NULL;
                        cbp_tail = last_cbp;
                    }
                }
                    /*
                     * there was more to the current transaction
                     * than just the page we are pushing out via vnode_pageout...
                     * mark it as finished and complete it... we've already
                     * waited for the I/Os to complete above in the call to cluster_wait_IO
                     */
                    cluster_EOT(cbp_head, cbp_tail, 0);

                    cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

            if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {

            e_offset = round_page_64(f_offset + 1);
            io_size = e_offset - f_offset;

            f_offset   += io_size;
            upl_offset += io_size;

            if (size >= io_size)
                size -= io_size;
            else
                size = 0;
            /*
             * keep track of how much of the original request
             * that we've actually completed... non_rounded_size
             * may go negative due to us rounding the request
             * to a page size multiple (i.e.  size > non_rounded_size)
             */
            non_rounded_size -= io_size;

            if (non_rounded_size <= 0) {
                /*
                 * we've transferred all of the data in the original
                 * request, but we were unable to complete the tail
                 * of the last page because the file didn't have
                 * an allocation to back that portion... this is ok.
                 */

                    flags &= ~CL_COMMIT;
= (daddr64_t
)(f_offset 
/ PAGE_SIZE_64
); 
1267                  * we have now figured out how much I/O we can do - this is in 'io_size' 
1268                  * pg_offset is the starting point in the first page for the I/O 
1269                  * pg_count is the number of full and partial pages that 'io_size' encompasses 
1271                 pg_offset 
= upl_offset 
& PAGE_MASK
; 
1273                 if (flags 
& CL_DEV_MEMORY
) { 
1275                          * treat physical requests as one 'giant' page 
1279                         pg_count  
= (io_size 
+ pg_offset 
+ (PAGE_SIZE 
- 1)) / PAGE_SIZE
; 
1281                 if ((flags 
& CL_READ
) && blkno 
== -1) { 
1282                         vm_offset_t  commit_offset
; 
1284                         int complete_transaction_now 
= 0; 
1287                          * if we're reading and blkno == -1, then we've got a 
1288                          * 'hole' in the file that we need to deal with by zeroing 
1289                          * out the affected area in the upl 
1291                         if (io_size 
>= (u_int
)non_rounded_size
) { 
1293                                  * if this upl contains the EOF and it is not a multiple of PAGE_SIZE 
1294                                  * than 'zero_offset' will be non-zero 
1295                                  * if the 'hole' returned by vnop_blockmap extends all the way to the eof 
1296                                  * (indicated by the io_size finishing off the I/O request for this UPL) 
1297                                  * than we're not going to issue an I/O for the 
1298                                  * last page in this upl... we need to zero both the hole and the tail 
1299                                  * of the page beyond the EOF, since the delayed zero-fill won't kick in  
1301                                 bytes_to_zero 
= non_rounded_size
; 
1302                                 if (!(flags 
& CL_NOZERO
)) 
1303                                         bytes_to_zero 
= (((upl_offset 
+ io_size
) + (PAGE_SIZE 
- 1)) & ~PAGE_MASK
) - upl_offset
; 
1307                                 bytes_to_zero 
= io_size
; 
1311                         cluster_zero(upl
, upl_offset
, bytes_to_zero
, real_bp
); 
1317                                  * if there is a current I/O chain pending 
1318                                  * then the first page of the group we just zero'd 
1319                                  * will be handled by the I/O completion if the zero 
1320                                  * fill started in the middle of the page 
1322                                 commit_offset 
= (upl_offset 
+ (PAGE_SIZE 
- 1)) & ~PAGE_MASK
; 
1324                                 pg_resid 
= commit_offset 
- upl_offset
; 
1326                                 if (bytes_to_zero 
>= pg_resid
) { 
1328                                          * the last page of the current I/O  
1329                                          * has been completed... 
1330                                          * compute the number of fully zero'd  
1331                                          * pages that are beyond it 
1332                                          * plus the last page if its partial 
1333                                          * and we have no more I/O to issue... 
1334                                          * otherwise a partial page is left 
1335                                          * to begin the next I/O 
1337                                         if ((int)io_size 
>= non_rounded_size
) 
1338                                                 pg_count 
= (bytes_to_zero 
- pg_resid 
+ (PAGE_SIZE 
- 1)) / PAGE_SIZE
; 
1340                                                 pg_count 
= (bytes_to_zero 
- pg_resid
) / PAGE_SIZE
; 
1342                                         complete_transaction_now 
= 1; 
1346                                  * no pending I/O to deal with 
1347                                  * so, commit all of the fully zero'd pages 
1348                                  * plus the last page if its partial 
1349                                  * and we have no more I/O to issue... 
1350                                  * otherwise a partial page is left 
1351                                  * to begin the next I/O 
1353                                 if ((int)io_size 
>= non_rounded_size
) 
1354                                         pg_count 
= (pg_offset 
+ bytes_to_zero 
+ (PAGE_SIZE 
- 1)) / PAGE_SIZE
; 
1356                                         pg_count 
= (pg_offset 
+ bytes_to_zero
) / PAGE_SIZE
; 
1358                                 commit_offset 
= upl_offset 
& ~PAGE_MASK
; 
1360                         if ( (flags 
& CL_COMMIT
) && pg_count
) { 
1361                                 ubc_upl_commit_range(upl
, commit_offset
, pg_count 
* PAGE_SIZE
, 
1362                                                      UPL_COMMIT_CLEAR_DIRTY 
| UPL_COMMIT_FREE_ON_EMPTY
); 
1364                         upl_offset 
+= io_size
; 
1365                         f_offset   
+= io_size
; 
1369                          * keep track of how much of the original request 
1370                          * that we've actually completed... non_rounded_size 
1371                          * may go negative due to us rounding the request 
1372                          * to a page size multiple (i.e.  size > non_rounded_size) 
1374                         non_rounded_size 
-= io_size
; 
1376                         if (non_rounded_size 
<= 0) { 
1378                                  * we've transferred all of the data in the original 
1379                                  * request, but we were unable to complete the tail 
1380                                  * of the last page because the file didn't have 
1381                                  * an allocation to back that portion... this is ok. 
1385                         if (cbp_head 
&& (complete_transaction_now 
|| size 
== 0))  { 
1386                                 cluster_wait_IO(cbp_head
, (flags 
& CL_ASYNC
)); 
1388                                 cluster_EOT(cbp_head
, cbp_tail
, size 
== 0 ? zero_offset 
: 0); 
1390                                 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 0); 
        if (pg_count > max_vectors) {
            if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
                io_size = PAGE_SIZE - pg_offset;
            } else {
                io_size -= (pg_count - max_vectors) * PAGE_SIZE;
                pg_count = max_vectors;
            }
        }
        /*
         * If the transaction is going to reach the maximum number of
         * desired elements, truncate the i/o to the nearest page so
         * that the actual i/o is initiated after this buffer is
         * created and added to the i/o chain.
         *
         * I/O directed to physically contiguous memory
         * doesn't have a requirement to make sure we 'fill' a page
         */
        if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
                ((upl_offset + io_size) & PAGE_MASK)) {
            vm_offset_t aligned_ofs;

            aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
            /*
             * If the io_size does not actually finish off even a
             * single page we have to keep adding buffers to the
             * transaction despite having reached the desired limit.
             *
             * Eventually we get here with the page being finished
             * off (and exceeded) and then we truncate the size of
             * this i/o request so that it is page aligned so that
             * we can finally issue the i/o on the transaction.
             */
            if (aligned_ofs > upl_offset) {
                io_size = aligned_ofs - upl_offset;
            }
        }
        if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
            /*
             * if we're not targeting a virtual device i.e. a disk image
             * it's safe to dip into the reserve pool since real devices
             * can complete this I/O request without requiring additional
             * bufs from the alloc_io_buf pool
             */
            priv = 1;
        else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
            /*
             * Throttle the speculative IO
             */
            priv = 0;
        else
            priv = 1;

        cbp = alloc_io_buf(vp, priv);

        if (flags & CL_PAGEOUT) {

            for (i = 0; i < pg_count; i++) {
                if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
                    panic("BUSY bp found in cluster_io");
            }
        }
        if (flags & CL_ASYNC) {
            if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
                panic("buf_setcallback failed\n");
        }
        cbp->b_cliodone = (void *)callback;
        cbp->b_flags |= io_flags;
        if (flags & CL_NOCACHE)
            cbp->b_attr.ba_flags |= BA_NOCACHE;

        cbp->b_lblkno = lblkno;
        cbp->b_blkno  = blkno;
        cbp->b_bcount = io_size;

        if (buf_setupl(cbp, upl, upl_offset))
            panic("buf_setupl failed\n");

        cbp->b_trans_next = (buf_t)NULL;

        if ((cbp->b_iostate = (void *)iostate))
            /*
             * caller wants to track the state of this
             * io... bump the amount issued against this stream
             */
            iostate->io_issued += io_size;

        if (flags & CL_READ) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                         (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
        } else {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
                         (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
        }

            cbp_tail->b_trans_next = cbp;

            if ( (cbp_head->b_real_bp = real_bp) )
                real_bp = (buf_t)NULL;

        *(buf_t *)(&cbp->b_trans_head) = cbp_head;
        upl_offset += io_size;
        f_offset   += io_size;

        /*
         * keep track of how much of the original request
         * that we've actually completed... non_rounded_size
         * may go negative due to us rounding the request
         * to a page size multiple (i.e.  size > non_rounded_size)
         */
        non_rounded_size -= io_size;

        if (non_rounded_size <= 0) {
            /*
             * we've transferred all of the data in the original
             * request, but we were unable to complete the tail
             * of the last page because the file didn't have
             * an allocation to back that portion... this is ok.
             */

            /*
             * we have no more I/O to issue, so go
             * finish the final transaction
             */
        } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
                    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
            /*
             * I/O directed to physically contiguous memory...
             * which doesn't have a requirement to make sure we 'fill' a page
             *
             * the current I/O we've prepared fully
             * completes the last page in this request
             *
             * it's either an ASYNC request or
             * we've already accumulated more than 8 I/O's into
             * this transaction so mark it as complete so that
             * it can finish asynchronously or via the cluster_complete_transaction
             * below if the request is synchronous
             */

        if (need_EOT == TRUE)
            cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

        if (flags & CL_THROTTLE)
            (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

        if ( !(io_flags & B_READ))
            vnode_startwrite(vp);

        if (flags & CL_RAW_ENCRYPTED) {
            /*
             * User requested raw encrypted bytes.
             * Twiddle the bit in the ba_flags for the buffer
             */
            cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;

        (void) VNOP_STRATEGY(cbp);

        if (need_EOT == TRUE) {
            if ( !(flags & CL_ASYNC))
                cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

            /*
             * first wait until all of the outstanding I/O
             * for this partial transaction has completed
             */
            cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

            /*
             * Rewind the upl offset to the beginning of the
             * transaction.
             */
            upl_offset = cbp_head->b_uploffset;

            for (cbp = cbp_head; cbp;) {

                size       += cbp->b_bcount;
                io_size    += cbp->b_bcount;

                cbp_next = cbp->b_trans_next;

            int need_wakeup = 0;

            /*
             * update the error condition for this stream
             * since we never really issued the io
             */
1615                          * just go ahead and adjust it back 
1617                         lck_mtx_lock_spin(&iostate
->io_mtxp
); 
1619                         if (iostate
->io_error 
== 0) 
1620                                 iostate
->io_error 
= error
; 
1621                         iostate
->io_issued 
-= io_size
; 
1623                         if (iostate
->io_wanted
) { 
1625                                  * someone is waiting for the state of 
1626                                  * this io stream to change 
1628                                 iostate
->io_wanted 
= 0; 
1631                         lck_mtx_unlock(&iostate
->io_mtxp
); 
1634                                 wakeup((caddr_t
)&iostate
->io_wanted
); 
1636                 if (flags 
& CL_COMMIT
) { 
1639                         pg_offset  
= upl_offset 
& PAGE_MASK
; 
1640                         abort_size 
= (upl_end_offset 
- upl_offset 
+ PAGE_MASK
) & ~PAGE_MASK
; 
1642                         upl_flags 
= cluster_ioerror(upl
, upl_offset 
- pg_offset
, abort_size
, error
, io_flags
); 
1644                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 28)) | DBG_FUNC_NONE
, 
1645                                      upl
, upl_offset 
- pg_offset
, abort_size
, (error 
<< 24) | upl_flags
, 0); 
1649         } else if (cbp_head
) 
1650                         panic("%s(): cbp_head is not NULL.\n", __FUNCTION__
); 
1654                  * can get here if we either encountered an error 
1655                  * or we completely zero-filled the request and 
1659                         real_bp
->b_flags 
|= B_ERROR
; 
1660                         real_bp
->b_error 
= error
; 
1662                 buf_biodone(real_bp
); 
1664         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_END
, (int)f_offset
, size
, upl_offset
, retval
, 0); 
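/*
 * [Illustrative sketch -- not part of the original source file.]
 * cluster_io strings the buffers of one transaction together through
 * b_trans_next and, on the error path above, walks that chain to total up the
 * bytes that were never issued (see the loop at lines 1598-1604).  A minimal
 * standalone model of that walk, using a hypothetical sketch_buf type rather
 * than the kernel's buf_t:
 */
struct sketch_buf {
        struct sketch_buf *trans_next;  /* next buffer in the same transaction */
        unsigned int       bcount;      /* bytes covered by this buffer */
};

static unsigned int
sketch_transaction_bytes(struct sketch_buf *head)
{
        unsigned int total = 0;
        struct sketch_buf *bp;

        for (bp = head; bp != 0; bp = bp->trans_next)
                total += bp->bcount;

        return (total);
}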
1669 #define reset_vector_run_state()                                                                                \ 
1670         issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;         
1673 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1674            int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1676         vector_upl_set_pagelist(vector_upl);
1678         if(io_flag & CL_READ) {
1679                 if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
1680                         io_flag &= ~CL_PRESERVE; /*don't zero fill*/
1682                         io_flag |= CL_PRESERVE; /*zero fill*/
1684         return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
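/*
 * [Illustrative sketch -- not part of the original source file.]
 * For a vectored read, CL_PRESERVE (zero-fill of the partial pages) is only
 * needed when the vector does not start at offset 0 of the UPL or its total
 * size is not a multiple of the page size.  A standalone model of that flag
 * decision; the flag value used here is a placeholder, not the kernel's:
 */
static int
sketch_vector_read_flags(int io_flag, unsigned long vector_upl_offset,
                         int vector_upl_iosize, unsigned int page_mask)
{
        int sketch_cl_preserve = 0x1;   /* placeholder for CL_PRESERVE */

        if (vector_upl_offset == 0 && (vector_upl_iosize & page_mask) == 0)
                io_flag &= ~sketch_cl_preserve;         /* no zero fill needed */
        else
                io_flag |= sketch_cl_preserve;          /* zero fill partial pages */

        return (io_flag);
}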
1689 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1691         int           pages_in_prefetch;
1693         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1694                      (int)f_offset, size, (int)filesize, 0, 0);
1696         if (f_offset >= filesize) {
1697                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1698                              (int)f_offset, 0, 0, 0, 0);
1701         if ((off_t)size > (filesize - f_offset))
1702                 size = filesize - f_offset;
1703         pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1705         advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1707         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1708                      (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1710         return (pages_in_prefetch);
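/*
 * [Illustrative sketch -- not part of the original source file.]
 * cluster_read_prefetch clamps the requested size to what remains before EOF
 * and then converts it to a page count by rounding up.  A standalone model of
 * that calculation:
 */
static unsigned int
sketch_pages_to_prefetch(long long f_offset, unsigned int size,
                         long long filesize, unsigned int page_size)
{
        if (f_offset >= filesize)
                return (0);                             /* nothing left to read ahead */

        if ((long long)size > (filesize - f_offset))
                size = (unsigned int)(filesize - f_offset);

        return ((size + (page_size - 1)) / page_size);  /* round up to whole pages */
}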
1716 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1721         int             size_of_prefetch;
1725         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1726                      (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1728         if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1729                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1730                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1733         if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1737                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1738                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1742         max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));
1744         if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max)
1745                 max_prefetch = (speculative_prefetch_max * PAGE_SIZE);
1747         if (max_prefetch <= PAGE_SIZE) {
1748                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1749                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
1752         if (extent->e_addr < rap->cl_maxra) {
1753                 if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {
1755                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1756                                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1760         r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1761         f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1763         size_of_prefetch = 0;
1765         ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1767         if (size_of_prefetch) {
1768                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1769                              rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1772         if (f_offset < filesize) {
1773                 daddr64_t read_size;
1775                 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
1777                 read_size = (extent->e_addr + 1) - extent->b_addr;
1779                 if (read_size > rap->cl_ralen) {
1780                         if (read_size > max_prefetch / PAGE_SIZE)
1781                                 rap->cl_ralen = max_prefetch / PAGE_SIZE;
1783                                 rap->cl_ralen = read_size;
1785                 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
1787                 if (size_of_prefetch)
1788                         rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1790         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1791                      rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
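/*
 * [Illustrative sketch -- not part of the original source file.]
 * The read-ahead length starts at one page and doubles on each sequential hit,
 * capped at max_prefetch worth of pages; if the current request already spans
 * more pages than that, the window is bumped to the request size (again
 * capped).  A standalone model of that growth policy:
 */
static unsigned int
sketch_grow_readahead(unsigned int cl_ralen, unsigned int request_pages,
                      unsigned int max_prefetch_pages)
{
        /* double the window (or seed it at one page), never past the cap */
        if (cl_ralen == 0)
                cl_ralen = 1;
        else if ((cl_ralen << 1) < max_prefetch_pages)
                cl_ralen <<= 1;
        else
                cl_ralen = max_prefetch_pages;

        /* a large sequential request grows the window immediately */
        if (request_pages > cl_ralen)
                cl_ralen = (request_pages > max_prefetch_pages) ?
                    max_prefetch_pages : request_pages;

        return (cl_ralen);
}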
1796 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1797                 int size, off_t filesize, int flags)
1799         return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1805 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1806                 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1813         local_flags = CL_PAGEOUT | CL_THROTTLE;
1815         if ((flags & UPL_IOSYNC) == 0)
1816                 local_flags |= CL_ASYNC;
1817         if ((flags & UPL_NOCOMMIT) == 0)
1818                 local_flags |= CL_COMMIT;
1819         if ((flags & UPL_KEEPCACHED))
1820                 local_flags |= CL_KEEPCACHED;
1821         if (flags & UPL_PAGING_ENCRYPTED)
1822                 local_flags |= CL_ENCRYPTED;
1825         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1826                      (int)f_offset, size, (int)filesize, local_flags, 0);
1829          * If they didn't specify any I/O, then we are done...
1830          * we can't issue an abort because we don't know how
1831          * big the upl really is
1836         if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1837                 if (local_flags & CL_COMMIT)
1838                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1842          * can't page-out from a negative offset
1843          * or if we're starting beyond the EOF
1844          * or if the file offset isn't page aligned
1845          * or the size requested isn't a multiple of PAGE_SIZE
1847         if (f_offset < 0 || f_offset >= filesize ||
1848            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1849                 if (local_flags & CL_COMMIT)
1850                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1853         max_size = filesize - f_offset;
1855         if (size < max_size)
1860         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1862         if (size > rounded_size) {
1863                 if (local_flags & CL_COMMIT)
1864                         ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1865                                         UPL_ABORT_FREE_ON_EMPTY);
1867         return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1868                            local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
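/*
 * [Illustrative sketch -- not part of the original source file.]
 * cluster_pageout_ext translates the pager's UPL_* request flags into the
 * cluster layer's CL_* flags before calling cluster_io.  A standalone model of
 * that translation; all flag values below are placeholders, not the kernel's:
 */
enum {
        SK_UPL_IOSYNC     = 0x01, SK_UPL_NOCOMMIT  = 0x02,
        SK_UPL_KEEPCACHED = 0x04, SK_UPL_ENCRYPTED = 0x08,

        SK_CL_PAGEOUT     = 0x010, SK_CL_THROTTLE  = 0x020,
        SK_CL_ASYNC       = 0x040, SK_CL_COMMIT    = 0x080,
        SK_CL_KEEPCACHED  = 0x100, SK_CL_ENCRYPTED = 0x200
};

static int
sketch_pageout_flags(int upl_flags)
{
        int local_flags = SK_CL_PAGEOUT | SK_CL_THROTTLE;

        if ((upl_flags & SK_UPL_IOSYNC) == 0)           /* default is asynchronous */
                local_flags |= SK_CL_ASYNC;
        if ((upl_flags & SK_UPL_NOCOMMIT) == 0)         /* default is to commit the UPL */
                local_flags |= SK_CL_COMMIT;
        if (upl_flags & SK_UPL_KEEPCACHED)
                local_flags |= SK_CL_KEEPCACHED;
        if (upl_flags & SK_UPL_ENCRYPTED)
                local_flags |= SK_CL_ENCRYPTED;

        return (local_flags);
}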
1873 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1874                int size, off_t filesize, int flags)
1876         return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
1881 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
1882                int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1888         int           local_flags = 0;
1890         if (upl == NULL || size < 0)
1891                 panic("cluster_pagein: NULL upl passed in");
1893         if ((flags & UPL_IOSYNC) == 0)
1894                 local_flags |= CL_ASYNC;
1895         if ((flags & UPL_NOCOMMIT) == 0)
1896                 local_flags |= CL_COMMIT;
1897         if (flags & UPL_IOSTREAMING)
1898                 local_flags |= CL_IOSTREAMING;
1899         if (flags & UPL_PAGING_ENCRYPTED)
1900                 local_flags |= CL_ENCRYPTED;
1903         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1904                      (int)f_offset, size, (int)filesize, local_flags, 0);
1907          * can't page-in from a negative offset
1908          * or if we're starting beyond the EOF
1909          * or if the file offset isn't page aligned
1910          * or the size requested isn't a multiple of PAGE_SIZE
1912         if (f_offset < 0 || f_offset >= filesize ||
1913            (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1914                 if (local_flags & CL_COMMIT)
1915                         ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1918         max_size = filesize - f_offset;
1920         if (size < max_size)
1925         rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1927         if (size > rounded_size && (local_flags & CL_COMMIT))
1928                 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1929                                     size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1931         retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1932                             local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
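/*
 * [Illustrative sketch -- not part of the original source file.]
 * Both pagein and pageout round the usable I/O size up to whole pages and, if
 * the caller's UPL covers more than that, abort (release) the trailing pages
 * that will never be read or written.  A standalone model of computing that
 * trailing range, assuming a power-of-2 page size:
 */
struct sketch_trailing_range {
        unsigned int offset;    /* offset of the unused tail within the UPL */
        unsigned int length;    /* number of bytes to abort (0 if none) */
};

static struct sketch_trailing_range
sketch_upl_trailing_range(unsigned int upl_offset, unsigned int size,
                          unsigned int io_size, unsigned int page_size)
{
        struct sketch_trailing_range r;
        unsigned int rounded_size = (io_size + (page_size - 1)) & ~(page_size - 1);

        if (size > rounded_size) {
                r.offset = upl_offset + rounded_size;
                r.length = size - rounded_size;
        } else {
                r.offset = 0;
                r.length = 0;
        }
        return (r);
}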
1939 cluster_bp(buf_t bp)
1941        return cluster_bp_ext(bp, NULL, NULL);
1946 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
1951         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1952                      bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1954         if (bp->b_flags & B_READ)
1955                 flags = CL_ASYNC | CL_READ;
1958         if (bp->b_flags & B_PASSIVE)
1959                 flags |= CL_PASSIVE;
1961         f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1963         return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
1969 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1971         return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
1976 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
1977                   int xflags, int (*callback)(buf_t, void *), void *callback_arg)
1979         user_ssize_t    cur_resid;
1984         int             write_type = IO_COPY;
1985         u_int32_t       write_length;
1989         if (flags & IO_PASSIVE)
1994         if (vp->v_flag & VNOCACHE_DATA){
1995                 flags |= IO_NOCACHE;
1996                 bflag |= CL_NOCACHE;
2001                  * this call is being made to zero-fill some range in the file
2003                 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2008          * do a write through the cache if one of the following is true....
2009          *   NOCACHE is not true or NODIRECT is true
2010          *   the uio request doesn't target USERSPACE
2011          * otherwise, find out if we want the direct or contig variant for
2012          * the first vector in the uio request
2014         if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
2015                 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2017         if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
2019                  * must go through the cached variant in this case
2021                 write_type = IO_COPY;
2023         while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2025                 switch (write_type) {
2029                          * make sure the uio_resid isn't too big...
2030                          * internally, we want to handle all of the I/O in
2031                          * chunk sizes that fit in a 32 bit int
2033                         if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2035                                  * we're going to have to call cluster_write_copy
2038                                  * only want the last call to cluster_write_copy to
2039                                  * have the IO_TAILZEROFILL flag set and only the
2040                                  * first call should have IO_HEADZEROFILL
2042                                 zflags = flags & ~IO_TAILZEROFILL;
2043                                 flags &= ~IO_HEADZEROFILL;
2045                                 write_length = MAX_IO_REQUEST_SIZE;
2048                                  * last call to cluster_write_copy
2052                                 write_length = (u_int32_t)cur_resid;
2054                         retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2058                         zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2060                         if (flags & IO_HEADZEROFILL) {
2062                                  * only do this once per request
2064                                 flags &= ~IO_HEADZEROFILL;
2066                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2067                                                             headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2071                         retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2073                         if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2075                                  * we're done with the data from the user specified buffer(s)
2076                                  * and we've been requested to zero fill at the tail
2077                                  * treat this as an IO_HEADZEROFILL which doesn't require a uio
2078                                  * by rearranging the args and passing in IO_HEADZEROFILL
2080                                 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2081                                                             (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2087                          * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2089                         retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2093                         retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2097                  * in case we end up calling cluster_write_copy (from cluster_write_direct)
2098                  * multiple times to service a multi-vector request that is not aligned properly
2099                  * we need to update the oldEOF so that we
2100                  * don't zero-fill the head of a page if we've successfully written
2101                  * data to that area... 'cluster_write_copy' will zero-fill the head of a
2102                  * page that is beyond the oldEOF if the write is unaligned... we only
2103                  * want that to happen for the very first page of the cluster_write,
2104                  * NOT the first page of each vector making up a multi-vector write.
2106                 if (uio->uio_offset > oldEOF)
2107                         oldEOF = uio->uio_offset;
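/*
 * [Illustrative sketch -- not part of the original source file.]
 * In the IO_COPY case above, a request larger than MAX_IO_REQUEST_SIZE is fed
 * to cluster_write_copy in chunks, with IO_HEADZEROFILL honored only on the
 * first chunk and IO_TAILZEROFILL only on the last.  A standalone model of
 * that per-chunk flag/length selection; the flag bits used here are
 * placeholders, not the kernel's values:
 */
enum { SK_IO_HEADZEROFILL = 0x1, SK_IO_TAILZEROFILL = 0x2 };

static unsigned int
sketch_next_copy_chunk(long long resid, unsigned int max_chunk,
                       int *flags /* in/out */, int *chunk_flags /* out */)
{
        if (resid > (long long)max_chunk) {
                /* more chunks follow: no tail zero-fill yet */
                *chunk_flags = *flags & ~SK_IO_TAILZEROFILL;
                /* head zero-fill only applies to the very first chunk */
                *flags &= ~SK_IO_HEADZEROFILL;
                return (max_chunk);
        }
        /* last chunk: whatever head/tail flags remain apply here */
        *chunk_flags = *flags;
        return ((unsigned int)resid);
}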
2114 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2115                      int flags, int (*callback)(buf_t, void *), void *callback_arg)
2118         upl_page_info_t  *pl;
2119         vm_offset_t      upl_offset;
2120         vm_offset_t      vector_upl_offset = 0;
2121         u_int32_t        io_req_size;
2122         u_int32_t        offset_in_file;
2123         u_int32_t        offset_in_iovbase;
2126         upl_size_t       upl_size, vector_upl_size = 0;
2127         vm_size_t        upl_needed_size;
2128         mach_msg_type_number_t  pages_in_pl;
2131         mach_msg_type_number_t  i;
2132         int              force_data_sync;
2135         struct clios     iostate;
2136         user_addr_t      iov_base;
2137         u_int32_t        mem_alignment_mask;
2138         u_int32_t        devblocksize;
2139         u_int32_t        max_io_size;
2140         u_int32_t        max_upl_size;
2141         u_int32_t        max_vector_size;
2142         boolean_t        io_throttled = FALSE;
2144         u_int32_t        vector_upl_iosize = 0;
2145         int              issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
2146         off_t            v_upl_uio_offset = 0;
2147         int              vector_upl_index=0;
2148         upl_t            vector_upl = NULL;
2152          * When we enter this routine, we know
2153          *  -- the resid will not exceed iov_len
2155         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2156                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2158         max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2160         io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2162         if (flags & IO_PASSIVE)
2163                 io_flag |= CL_PASSIVE;
2165         if (flags & IO_NOCACHE)
2166                 io_flag |= CL_NOCACHE;
2168         iostate.io_completed = 0;
2169         iostate.io_issued = 0;
2170         iostate.io_error = 0;
2171         iostate.io_wanted = 0;
2173         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2175         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2176         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2178         if (devblocksize == 1) {
2180                 * the AFP client advertises a devblocksize of 1
2181                 * however, its BLOCKMAP routine maps to physical
2182                 * blocks that are PAGE_SIZE in size...
2183                 * therefore we can't ask for I/Os that aren't page aligned
2184                 * or aren't multiples of PAGE_SIZE in size
2185                 * by setting devblocksize to PAGE_SIZE, we re-instate
2186                 * the old behavior we had before the mem_alignment_mask
2187                 * changes went in...
2189                devblocksize = PAGE_SIZE;
2193         io_req_size = *write_length;
2194         iov_base = uio_curriovbase(uio);
2196         offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2197         offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2199         if (offset_in_file || offset_in_iovbase) {
2201                  * one of the 2 important offsets is misaligned
2202                  * so fire an I/O through the cache for this entire vector
2204                 goto wait_for_dwrites;
2206         if (iov_base & (devblocksize - 1)) {
2208                  * the offset in memory must be on a device block boundary
2209                  * so that we can guarantee that we can generate an
2210                  * I/O that ends on a page boundary in cluster_io
2212                 goto wait_for_dwrites;
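/*
 * [Illustrative sketch -- not part of the original source file.]
 * The checks above gate the direct path: the file offset must be page aligned,
 * the user buffer must satisfy the mount's alignment mask, and the buffer must
 * also sit on a device-block boundary so the resulting I/O can end on a page
 * boundary inside cluster_io.  A standalone model of that gate (returns nonzero
 * when the direct path may be used; devblocksize assumed to be a power of 2):
 */
static int
sketch_direct_write_ok(long long uio_offset, unsigned long long iov_base,
                       unsigned int page_mask, unsigned int mem_alignment_mask,
                       unsigned int devblocksize)
{
        if (uio_offset & page_mask)                     /* file offset misaligned */
                return (0);
        if (iov_base & mem_alignment_mask)              /* buffer misaligned for DMA */
                return (0);
        if (iov_base & (devblocksize - 1))              /* not on a device block boundary */
                return (0);
        return (1);
}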
2215         while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2218                 if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) {
2220                          * we're in the throttle window, at the very least
2221                          * we want to limit the size of the I/O we're about
2224                         if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) {
2226                                  * we're in the throttle window and at least 1 I/O
2227                                  * has already been issued by a throttleable thread
2228                                  * in this window, so return with EAGAIN to indicate
2229                                  * to the FS issuing the cluster_write call that it
2230                                  * should now throttle after dropping any locks
2232                                 throttle_info_update_by_mount(vp->v_mount);
2234                                 io_throttled = TRUE;
2235                                 goto wait_for_dwrites;
2237                         max_vector_size = THROTTLE_MAX_IOSIZE;
2238                         max_io_size = THROTTLE_MAX_IOSIZE;
2240                         max_vector_size = MAX_VECTOR_UPL_SIZE;
2241                         max_io_size = max_upl_size;
2245                         cluster_syncup(vp, newEOF, callback, callback_arg);
2248                 io_size  = io_req_size & ~PAGE_MASK;
2249                 iov_base = uio_curriovbase(uio);
2251                 if (io_size > max_io_size)
2252                         io_size = max_io_size;
2254                 if(useVectorUPL && (iov_base & PAGE_MASK)) {
2256                          * We have an iov_base that's not page-aligned.
2257                          * Issue all I/O's that have been collected within
2258                          * this Vectored UPL.
2260                         if(vector_upl_index) {
2261                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2262                                 reset_vector_run_state();
2266                         * After this point, if we are using the Vector UPL path and the base is
2267                         * not page-aligned then the UPL with that base will be the first in the vector UPL.
2271                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2272                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2274                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2275                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2277                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2279                         upl_size = upl_needed_size;
2280                         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2281                                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2283                         kret = vm_map_get_upl(current_map(),
2284                                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2292                         if (kret != KERN_SUCCESS) {
2293                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2296                                  * failed to get pagelist
2298                                  * we may have already spun some portion of this request
2299                                  * off as async requests... we need to wait for the I/O
2300                                  * to complete before returning
2302                                 goto wait_for_dwrites;
2304                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2305                         pages_in_pl = upl_size / PAGE_SIZE;
2307                         for (i = 0; i < pages_in_pl; i++) {
2308                                 if (!upl_valid_page(pl, i))
2311                         if (i == pages_in_pl)
2315                          * didn't get all the pages back that we
2316                          * needed... release this upl and try again
2318                         ubc_upl_abort(upl, 0);
2320                 if (force_data_sync >= 3) {
2321                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2322                                      i, pages_in_pl, upl_size, kret, 0);
2324                          * for some reason, we couldn't acquire a hold on all
2325                          * the pages needed in the user's address space
2327                          * we may have already spun some portion of this request
2328                          * off as async requests... we need to wait for the I/O
2329                          * to complete before returning
2331                         goto wait_for_dwrites;
2335                  * Consider the possibility that upl_size wasn't satisfied.
2337                 if (upl_size < upl_needed_size) {
2338                         if (upl_size && upl_offset == 0)
2343                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2344                              (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2347                         ubc_upl_abort(upl, 0);
2349                          * we may have already spun some portion of this request
2350                          * off as async requests... we need to wait for the I/O
2351                          * to complete before returning
2353                         goto wait_for_dwrites;
2357                         vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2361                          * After this point, if we are using a vector UPL, then
2362                          * either all the UPL elements end on a page boundary OR
2363                          * this UPL is the last element because it does not end
2364                          * on a page boundary.
2369                  * Now look for pages already in the cache
2370                  * and throw them away.
2371                  * uio->uio_offset is page aligned within the file
2372                  * io_size is a multiple of PAGE_SIZE
2374                 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
2377                  * we want to push out these writes asynchronously so that we can overlap
2378                  * the preparation of the next I/O
2379                  * if there are already too many outstanding writes
2380                  * wait until some complete before issuing the next
2382                 if (iostate.io_issued > iostate.io_completed)
2383                         cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct");
2385                 if (iostate.io_error) {
2387                          * one of the earlier writes we issued ran into a hard error
2388                          * don't issue any more writes, cleanup the UPL
2389                          * that was just created but not used, then
2390                          * go wait for all writes that are part of this stream
2391                          * to complete before returning the error to the caller
2393                         ubc_upl_abort(upl, 0);
2395                         goto wait_for_dwrites;
2398                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2399                              (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2402                         retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2403                                    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2406                         if(!vector_upl_index) {
2407                                 vector_upl = vector_upl_create(upl_offset);
2408                                 v_upl_uio_offset = uio->uio_offset;
2409                                 vector_upl_offset = upl_offset;
2412                         vector_upl_set_subupl(vector_upl,upl,upl_size);
2413                         vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2415                         vector_upl_iosize += io_size;
2416                         vector_upl_size += upl_size;
2418                         if(issueVectorUPL || vector_upl_index ==  MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2419                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2420                                 reset_vector_run_state();
2425                  * update the uio structure to
2426                  * reflect the I/O that we just issued
2428                 uio_update(uio, (user_size_t)io_size);
2431                  * in case we end up calling through to cluster_write_copy to finish
2432                  * the tail of this request, we need to update the oldEOF so that we
2433                  * don't zero-fill the head of a page if we've successfully written
2434                  * data to that area... 'cluster_write_copy' will zero-fill the head of a
2435                  * page that is beyond the oldEOF if the write is unaligned... we only
2436                  * want that to happen for the very first page of the cluster_write,
2437                  * NOT the first page of each vector making up a multi-vector write.
2439                 if (uio->uio_offset > oldEOF)
2440                         oldEOF = uio->uio_offset;
2442                 io_req_size -= io_size;
2444                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2445                              (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2449         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2451                 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2453                 if (retval == 0 && *write_type == IO_DIRECT) {
2455                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2456                                      (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2464         if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2465                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2466                 reset_vector_run_state();
2469         if (iostate.io_issued > iostate.io_completed) {
2471                  * make sure all async writes issued as part of this stream
2472                  * have completed before we return
2474                 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2476         if (iostate.io_error)
2477                 retval = iostate.io_error;
2479         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2481         if (io_throttled == TRUE && retval == 0)
2484         if (io_req_size && retval == 0) {
2486                  * we couldn't handle the tail of this request in DIRECT mode
2487                  * so fire it through the copy path
2489                  * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2490                  * so we can just pass 0 in for the headOff and tailOff
2492                 if (uio->uio_offset > oldEOF)
2493                         oldEOF = uio->uio_offset;
2495                 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2497                 *write_type = IO_UNKNOWN;
2499         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2500                      (int)uio->uio_offset, io_req_size, retval, 4, 0);
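/*
 * [Illustrative sketch -- not part of the original source file.]
 * On the vectored path above, each per-iovec UPL is appended to the running
 * vector UPL and the whole vector is issued once the element count or the
 * accumulated byte count reaches its limit (or a non page-aligned base forces
 * an early flush).  A standalone model of that accumulate-then-flush decision;
 * sketch_vector_run is a hypothetical stand-in for the real bookkeeping:
 */
struct sketch_vector_run {
        unsigned int elements;          /* sub-UPLs accumulated so far */
        unsigned int bytes;             /* total bytes accumulated so far */
};

static int      /* returns nonzero when the accumulated vector should be issued */
sketch_vector_add(struct sketch_vector_run *run, unsigned int upl_bytes,
                  unsigned int max_elements, unsigned int max_bytes)
{
        run->elements += 1;
        run->bytes    += upl_bytes;

        return (run->elements == max_elements || run->bytes >= max_bytes);
}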
2507 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2508                      int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2510         upl_page_info_t *pl;
2511         addr64_t         src_paddr = 0;
2512         upl_t            upl[MAX_VECTS];
2513         vm_offset_t      upl_offset;
2514         u_int32_t        tail_size = 0;
2517         upl_size_t       upl_size;
2518         vm_size_t        upl_needed_size;
2519         mach_msg_type_number_t  pages_in_pl;
2522         struct clios     iostate;
2527         user_addr_t      iov_base;
2528         u_int32_t        devblocksize;
2529         u_int32_t        mem_alignment_mask;
2532          * When we enter this routine, we know
2533          *  -- the io_req_size will not exceed iov_len
2534          *  -- the target address is physically contiguous
2536         cluster_syncup(vp, newEOF, callback, callback_arg);
2538         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2539         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2541         iostate.io_completed = 0;
2542         iostate.io_issued = 0;
2543         iostate.io_error = 0;
2544         iostate.io_wanted = 0;
2546         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2549         io_size = *write_length;
2551         iov_base = uio_curriovbase(uio);
2553         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2554         upl_needed_size = upl_offset + io_size;
2557         upl_size = upl_needed_size;
2558         upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2559                     UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2561         kret = vm_map_get_upl(current_map(),
2562                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2563                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
2565         if (kret != KERN_SUCCESS) {
2567                  * failed to get pagelist
2570                 goto wait_for_cwrites;
2575          * Consider the possibility that upl_size wasn't satisfied.
2577         if (upl_size < upl_needed_size) {
2579                  * This is a failure in the physical memory case.
2582                 goto wait_for_cwrites;
2584         pl = ubc_upl_pageinfo(upl[cur_upl]);
2586         src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
2588         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2589                 u_int32_t   head_size;
2591                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2593                 if (head_size > io_size)
2594                         head_size = io_size;
2596                 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2599                         goto wait_for_cwrites;
2601                 upl_offset += head_size;
2602                 src_paddr  += head_size;
2603                 io_size    -= head_size;
2605                 iov_base   += head_size;
2607         if ((u_int32_t)iov_base & mem_alignment_mask) {
2609                  * request isn't set up on a memory boundary
2610                  * the underlying DMA engine can handle...
2611                  * return an error instead of going through
2612                  * the slow copy path since the intent of this
2613                  * path is direct I/O from device memory
2616                 goto wait_for_cwrites;
2619         tail_size = io_size & (devblocksize - 1);
2620         io_size  -= tail_size;
2622         while (io_size && error == 0) {
2624                 if (io_size > MAX_IO_CONTIG_SIZE)
2625                         xsize = MAX_IO_CONTIG_SIZE;
2629                  * request asynchronously so that we can overlap
2630                  * the preparation of the next I/O... we'll do
2631                  * the commit after all the I/O has completed
2632                  * since it's all issued against the same UPL
2633                  * if there are already too many outstanding writes
2634                  * wait until some have completed before issuing the next
2636                 if (iostate.io_issued > iostate.io_completed)
2637                         cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2639                 if (iostate.io_error) {
2641                          * one of the earlier writes we issued ran into a hard error
2642                          * don't issue any more writes...
2643                          * go wait for all writes that are part of this stream
2644                          * to complete before returning the error to the caller
2646                         goto wait_for_cwrites;
2649                  * issue an asynchronous write to cluster_io
2651                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2652                                    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2656                          * The cluster_io write completed successfully,
2657                          * update the uio structure
2659                         uio_update(uio, (user_size_t)xsize);
2661                         upl_offset += xsize;
2666         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
2668                 error = cluster_io_type(uio, write_type, write_length, 0);
2670                 if (error == 0 && *write_type == IO_CONTIG) {
2675                 *write_type = IO_UNKNOWN;
2679          * make sure all async writes that are part of this stream
2680          * have completed before we proceed
2682         if (iostate.io_issued > iostate.io_completed)
2683                 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
2685         if (iostate.io_error)
2686                 error = iostate.io_error;
2688         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2690         if (error == 0 && tail_size)
2691                 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2693         for (n = 0; n < num_upl; n++)
2695                  * just release our hold on each physically contiguous
2696                  * region without changing any state
2698                 ubc_upl_abort(upl[n], 0);
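/*
 * [Illustrative sketch -- not part of the original source file.]
 * cluster_write_contig peels off a sub-device-block "head" so the main
 * transfer starts on a device block boundary, and later a "tail" for the
 * sub-block remainder; both ends go through cluster_align_phys_io while the
 * middle is issued directly.  A standalone model of that split, assuming
 * devblocksize is a power of 2:
 */
struct sketch_contig_split {
        unsigned int head;      /* bytes before the first device block boundary */
        unsigned int middle;    /* whole-device-block bytes issued directly */
        unsigned int tail;      /* sub-block remainder handled at the end */
};

static struct sketch_contig_split
sketch_split_contig_write(long long uio_offset, unsigned int io_size,
                          unsigned int devblocksize)
{
        struct sketch_contig_split s;
        unsigned int misalign = (unsigned int)(uio_offset & (devblocksize - 1));

        s.head = misalign ? (devblocksize - misalign) : 0;
        if (s.head > io_size)
                s.head = io_size;

        io_size -= s.head;
        s.tail   = io_size & (devblocksize - 1);
        s.middle = io_size - s.tail;

        return (s);
}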
2705  * need to avoid a race between an msync of a range of pages dirtied via mmap
2706  * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
2707  * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
2709  * we should never force-zero-fill pages that are already valid in the cache...
2710  * the entire page contains valid data (either from disk, zero-filled or dirtied
2711  * via an mmap) so we can only do damage by trying to zero-fill
2715 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
2718         boolean_t need_cluster_zero = TRUE;
2720         if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2722                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2723                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2725                 if (upl_valid_page(pl, zero_pg_index)) {
2727                          * never force zero valid pages - dirty or clean
2728                          * we'll leave these in the UPL for cluster_write_copy to deal with
2730                         need_cluster_zero = FALSE;
2733         if (need_cluster_zero == TRUE)
2734                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2736         return (bytes_to_zero);
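/*
 * [Illustrative sketch -- not part of the original source file.]
 * cluster_zero_range clips the zero-fill to the remainder of the current page,
 * locates that page within the UPL, and skips the zeroing entirely when the
 * page is already valid in the cache.  A standalone model of the clip and the
 * page-index calculation, assuming a 4096-byte page:
 */
static int
sketch_zero_range_page_index(long long zero_off, long long upl_f_offset,
                             int *bytes_to_zero /* in/out */)
{
        const long long page_size = 4096;
        int in_page = (int)(page_size - (zero_off & (page_size - 1)));

        if (*bytes_to_zero > in_page)
                *bytes_to_zero = in_page;               /* never cross into the next page */

        /* index of the page being zeroed, relative to the start of the UPL */
        return ((int)((zero_off - upl_f_offset) / page_size));
}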
2741 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
2742                    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2744         upl_page_info_t *pl;
2746         vm_offset_t      upl_offset = 0;
2759         long long        total_size;
2762         long long        zero_cnt1;
2764         off_t            write_off = 0;
2766         boolean_t        first_pass = FALSE;
2767         struct cl_extent cl;
2768         struct cl_writebehind *wbp;
2770         u_int            max_cluster_pgcount;
2774                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2775                              (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
2777                 io_resid = io_req_size;
2779                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
2780                              0, 0, (int)oldEOF, (int)newEOF, 0);
2784         if (flags & IO_PASSIVE)
2788         if (flags & IO_NOCACHE)
2789                 bflag |= CL_NOCACHE;
2796         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
2797         max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2799         if (flags & IO_HEADZEROFILL) {
2801                  * some filesystems (HFS is one) don't support unallocated holes within a file...
2802                  * so we zero fill the intervening space between the old EOF and the offset
2803                  * where the next chunk of real data begins.... ftruncate will also use this
2804                  * routine to zero fill to the new EOF when growing a file... in this case, the
2805                  * uio structure will not be provided
2808                         if (headOff < uio->uio_offset) {
2809                                 zero_cnt = uio->uio_offset - headOff;
2812                 } else if (headOff < newEOF) {
2813                         zero_cnt = newEOF - headOff;
2817                 if (uio && uio->uio_offset > oldEOF) {
2818                         zero_off = uio->uio_offset & ~PAGE_MASK_64;
2820                         if (zero_off >= oldEOF) {
2821                                 zero_cnt = uio->uio_offset - zero_off;
2823                                 flags |= IO_HEADZEROFILL;
2827         if (flags & IO_TAILZEROFILL) {
2829                         zero_off1 = uio->uio_offset + io_req_size;
2831                         if (zero_off1 < tailOff)
2832                                 zero_cnt1 = tailOff - zero_off1;
2835                 if (uio && newEOF > oldEOF) {
2836                         zero_off1 = uio->uio_offset + io_req_size;
2838                         if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
2839                                 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
2841                                 flags |= IO_TAILZEROFILL;
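/*
 * [Illustrative sketch -- not part of the original source file.]
 * The setup above derives two zero-fill ranges: a head range covering the gap
 * between the old EOF (or headOff) and the start of the new data, and a tail
 * range covering either the caller-supplied tailOff gap or, when the write
 * grows the file and ends mid-page, the rest of that final page.  A standalone
 * model of the implicit tail case, assuming a 4096-byte page:
 */
static long long
sketch_tail_zero_count(long long write_end, long long old_eof, long long new_eof)
{
        const long long page_mask = 4096 - 1;

        /* only when the file is growing and the write ends at the new EOF mid-page */
        if (new_eof > old_eof && write_end == new_eof && (write_end & page_mask))
                return ((page_mask + 1) - (write_end & page_mask));

        return (0);
}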
2845         if (zero_cnt 
== 0 && uio 
== (struct uio 
*) 0) { 
2846                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
, 
2847                              retval
, 0, 0, 0, 0); 
2851                 write_off 
= uio
->uio_offset
; 
2852                 write_cnt 
= uio_resid(uio
); 
2854                  * delay updating the sequential write info 
2855                  * in the control block until we've obtained 
2860         while ((total_size 
= (io_resid 
+ zero_cnt 
+ zero_cnt1
)) && retval 
== 0) { 
2862                  * for this iteration of the loop, figure out where our starting point is 
2865                         start_offset 
= (int)(zero_off 
& PAGE_MASK_64
); 
2866                         upl_f_offset 
= zero_off 
- start_offset
; 
2867                 } else if (io_resid
) { 
2868                         start_offset 
= (int)(uio
->uio_offset 
& PAGE_MASK_64
); 
2869                         upl_f_offset 
= uio
->uio_offset 
- start_offset
; 
2871                         start_offset 
= (int)(zero_off1 
& PAGE_MASK_64
); 
2872                         upl_f_offset 
= zero_off1 
- start_offset
; 
2874                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
, 
2875                              (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0); 
2877                 if (total_size 
> max_io_size
) 
2878                         total_size 
= max_io_size
; 
2880                 cl
.b_addr 
= (daddr64_t
)(upl_f_offset 
/ PAGE_SIZE_64
); 
2882                 if (uio 
&& ((flags 
& (IO_SYNC 
| IO_HEADZEROFILL 
| IO_TAILZEROFILL
)) == 0)) { 
2884                          * assumption... total_size <= io_resid 
2885                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set 
2887                         if ((start_offset 
+ total_size
) > max_io_size
) 
2888                                 total_size 
= max_io_size 
- start_offset
; 
2889                         xfer_resid 
= total_size
; 
2891                         retval 
= cluster_copy_ubc_data_internal(vp
, uio
, &xfer_resid
, 1, 1); 
2896                         io_resid    
-= (total_size 
- xfer_resid
); 
2897                         total_size   
= xfer_resid
; 
2898                         start_offset 
= (int)(uio
->uio_offset 
& PAGE_MASK_64
); 
2899                         upl_f_offset 
= uio
->uio_offset 
- start_offset
; 
2901                         if (total_size 
== 0) { 
2904                                          * the write did not finish on a page boundary 
2905                                          * which will leave upl_f_offset pointing to the 
2906                                          * beginning of the last page written instead of 
2907                                          * the page beyond it... bump it in this case 
2908                                          * so that the cluster code records the last page 
2911                                         upl_f_offset 
+= PAGE_SIZE_64
; 
2919                  * compute the size of the upl needed to encompass 
2920                  * the requested write... limit each call to cluster_io 
2921                  * to the maximum UPL size... cluster_io will clip if 
2922                  * this exceeds the maximum io_size for the device, 
2923                  * make sure to account for  
2924                  * a starting offset that's not page aligned 
2926                 upl_size 
= (start_offset 
+ total_size 
+ (PAGE_SIZE 
- 1)) & ~PAGE_MASK
; 
2928                 if (upl_size 
> max_io_size
) 
2929                         upl_size 
= max_io_size
; 
2931                 pages_in_upl 
= upl_size 
/ PAGE_SIZE
; 
2932                 io_size      
= upl_size 
- start_offset
; 
2934                 if ((long long)io_size 
> total_size
) 
2935                         io_size 
= total_size
; 
2937                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0); 
2941                  * Gather the pages from the buffer cache. 
2942                  * The UPL_WILL_MODIFY flag lets the UPL subsystem know 
2943                  * that we intend to modify these pages. 
2945                 kret 
= ubc_create_upl(vp
,  
2950                                       UPL_SET_LITE 
| (( uio
!=NULL 
&& (uio
->uio_flags 
& UIO_FLAGS_IS_COMPRESSED_FILE
)) ? 0 : UPL_WILL_MODIFY
)); 
2951                 if (kret 
!= KERN_SUCCESS
) 
2952                         panic("cluster_write_copy: failed to get pagelist"); 
2954                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, 
2955                         upl
, (int)upl_f_offset
, start_offset
, 0, 0); 
2957                 if (start_offset 
&& upl_f_offset 
< oldEOF 
&& !upl_valid_page(pl
, 0)) { 
2961                          * we're starting in the middle of the first page of the upl 
2962                          * and the page isn't currently valid, so we're going to have 
2963                          * to read it in first... this is a synchronous operation 
2965                         read_size 
= PAGE_SIZE
; 
2967                         if ((upl_f_offset 
+ read_size
) > oldEOF
) 
2968                                 read_size 
= oldEOF 
- upl_f_offset
; 
2970                         retval 
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
, 
2971                                             CL_READ 
| bflag
, (buf_t
)NULL
, (struct clios 
*)NULL
, callback
, callback_arg
); 
2974                                  * we had an error during the read which causes us to abort 
2975                                  * the current cluster_write request... before we do, we need 
2976                                  * to release the rest of the pages in the upl without modifying 
2977                                  * there state and mark the failed page in error 
2979                                 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
|UPL_ABORT_FREE_ON_EMPTY
); 
2981                                 if (upl_size 
> PAGE_SIZE
) 
2982                                         ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
); 
2984                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
, 
2985                                              upl
, 0, 0, retval
, 0); 
		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

				read_size = PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
					read_size = oldEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
						    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
				if (retval) {
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);

					if (upl_size > PAGE_SIZE)
						ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						     upl, 0, 0, retval, 0);
					break;
				}
			}
		}
		xfer_resid = io_size;
		io_offset = start_offset;

		while (zero_cnt && xfer_resid) {

			if (zero_cnt < (long long)xfer_resid)
				bytes_to_zero = zero_cnt;
			else
				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt   -= bytes_to_zero;
			zero_off   += bytes_to_zero;
			io_offset  += bytes_to_zero;
		}
		if (xfer_resid && io_resid) {
			u_int32_t  io_requested;

			bytes_to_move = min(io_resid, xfer_resid);
			io_requested = bytes_to_move;

			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

			if (retval) {
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     upl, 0, 0, retval, 0);
			} else {
				io_resid   -= bytes_to_move;
				xfer_resid -= bytes_to_move;
				io_offset  += bytes_to_move;
			}
		}
		while (xfer_resid && zero_cnt1 && retval == 0) {

			if (zero_cnt1 < (long long)xfer_resid)
				bytes_to_zero = zero_cnt1;
			else
				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt1  -= bytes_to_zero;
			zero_off1  += bytes_to_zero;
			io_offset  += bytes_to_zero;
		}
		if (retval == 0) {
			int	cl_index;
			int	ret_cluster_try_push;

			io_size += start_offset;

			if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
				/*
				 * if we're extending the file with this write
				 * we'll zero fill the rest of the page so that
				 * if the file gets extended again in such a way as to leave a
				 * hole starting at this EOF, we'll have zeros in the correct spot
				 */
				cluster_zero(upl, io_size, upl_size - io_size, NULL);
			}
			/*
			 * release the upl now if we hold one since...
			 * 1) pages in it may be present in the sparse cluster map
			 *    and may span 2 separate buckets there... if they do and
			 *    we happen to have to flush a bucket to make room and it intersects
			 *    this upl, a deadlock may result on page BUSY
			 * 2) we're delaying the I/O... from this point forward we're just updating
			 *    the cluster state... no need to hold the pages, so commit them
			 * 3) IO_SYNC is set...
			 *    because we had to ask for a UPL that provides currently non-present pages, the
			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
			 *    upon committing it... this is not the behavior we want since it's possible for
			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
			 *    we'll pick these pages back up later with the correct behavior specified.
			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
			 *    we hold since the flushing context is holding the cluster lock.
			 */
			ubc_upl_commit_range(upl, 0, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
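			/*
			 * Illustrative note (not in the original source): committing with
			 * UPL_COMMIT_SET_DIRTY releases the busy pages right away while
			 * leaving them dirty in the cache, so the data written above is
			 * preserved until the clustering code below (or a later flush)
			 * actually pushes it to disk.
			 */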
			/*
			 * calculate the last logical block number
			 * that this delayed I/O encompassed
			 */
			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

			if (flags & IO_SYNC) {
				/*
				 * if the IO_SYNC flag is set then we need to
				 * bypass any clusters and immediately issue
				 * the I/O
				 */
				goto issue_io;
			}
			/*
			 * take the lock to protect our accesses
			 * of the writebehind and sparse cluster state
			 */
			wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

			if (wbp->cl_scmap) {

				if ( !(flags & IO_NOCACHE)) {
					/*
					 * we've fallen into the sparse
					 * cluster method of delaying dirty pages
					 */
					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

					lck_mtx_unlock(&wbp->cl_lockw);

					continue;
				}
				/*
				 * must have done cached writes that fell into
				 * the sparse cluster mechanism... we've switched
				 * to uncached writes on the file, so go ahead
				 * and push whatever's in the sparse map
				 * and switch back to normal clustering
				 */
				sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
				/*
				 * no clusters of either type present at this point
				 * so just go directly to start_new_cluster since
				 * we know we need to delay this I/O since we've
				 * already released the pages back into the cache
				 * to avoid the deadlock with sparse_cluster_push
				 */
				goto start_new_cluster;
			} else {
				if (write_off == wbp->cl_last_write)
					wbp->cl_seq_written += write_cnt;
				else
					wbp->cl_seq_written = write_cnt;

				wbp->cl_last_write = write_off + write_cnt;
			}

			if (wbp->cl_number == 0)
				/*
				 * no clusters currently present
				 */
				goto start_new_cluster;
			for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
				/*
				 * check each cluster that we currently hold
				 * try to merge some or all of this write into
				 * one or more of the existing clusters... if
				 * any portion of the write remains, start a
				 * new cluster
				 */
				if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
					/*
					 * the current write starts at or after the current cluster
					 */
					if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that fits entirely
						 * within the existing cluster limits
						 */
						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
							/*
							 * update our idea of where the cluster ends
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
						break;
					}
					if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that starts in the middle of the current cluster
						 * but extends beyond the cluster's limit... we know this because
						 * of the previous checks
						 * we'll extend the current cluster to the max
						 * and update the b_addr for the current write to reflect that
						 * the head of it was absorbed into this cluster...
						 * note that we'll always have a leftover tail in this case since
						 * full absorption would have occurred in the clause above
						 */
						wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

						cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
					}
					/*
					 * we come here for the case where the current write starts
					 * beyond the limit of the existing cluster or we have a leftover
					 * tail after a partial absorption
					 *
					 * in either case, we'll check the remaining clusters before
					 * starting a new one
					 */
				} else {
					/*
					 * the current write starts in front of the cluster we're currently considering
					 */
					if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
						/*
						 * we can just merge the new request into
						 * this cluster and leave it in the cache
						 * since the resulting cluster is still
						 * less than the maximum allowable size
						 */
						wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
							/*
							 * the current write completely
							 * envelops the existing cluster and since
							 * each write is limited to at most max_cluster_pgcount pages
							 * we can just use the start and last blocknos of the write
							 * to generate the cluster limits
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
						}
						break;
					}
					/*
					 * if we were to combine this write with the current cluster
					 * we would exceed the cluster size limit.... so,
					 * let's see if there's any overlap of the new I/O with
					 * the cluster we're currently considering... in fact, we'll
					 * stretch the cluster out to its full limit and see if we
					 * get an intersection with the current write
					 */
					if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
						/*
						 * the current write extends into the proposed cluster
						 * clip the length of the current write after first combining its
						 * tail with the newly shaped cluster
						 */
						wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

						cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
					}
					/*
					 * if we get here, there was no way to merge
					 * any portion of this write with this cluster
					 * or we could only merge part of it which
					 * will leave a tail...
					 * we'll check the remaining clusters before starting a new one
					 */
				}
			}
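			/*
			 * Worked example (not in the original source): with
			 * max_cluster_pgcount == 32, an existing cluster covering pages
			 * [100, 120) and a new write covering [110, 140), the write starts
			 * inside the cluster, so the cluster is stretched to its limit
			 * [100, 132), cl.b_addr is bumped to 132, and the leftover tail
			 * [132, 140) is matched against the remaining clusters or used to
			 * start a new cluster of its own.
			 */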
			if (cl_index < wbp->cl_number)
				/*
				 * we found an existing cluster(s) that we
				 * could entirely merge this I/O into
				 */
				goto delay_io;

			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
			    wbp->cl_number == MAX_CLUSTERS &&
			    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
				uint32_t	n;

				if (vp->v_mount->mnt_kern_flag & MNTK_SSD)
					n = WRITE_BEHIND_SSD;
				else
					n = WRITE_BEHIND;

				while (n--)
					cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg);
			}
			if (wbp->cl_number < MAX_CLUSTERS) {
				/*
				 * we didn't find an existing cluster to
				 * merge into, but there's room to start
				 * a new one
				 */
				goto start_new_cluster;
			}
			/*
			 * no existing cluster to merge with and no
			 * room to start a new one... we'll try
			 * pushing one of the existing ones... if none of
			 * them are able to be pushed, we'll switch
			 * to the sparse cluster mechanism
			 * cluster_try_push updates cl_number to the
			 * number of remaining clusters... and
			 * returns the number of currently unused clusters
			 */
			ret_cluster_try_push = 0;

			/*
			 * if writes are not deferred, call cluster push immediately
			 */
			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {

				ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg);
			}
			/*
			 * execute following regardless of writes being deferred or not
			 */
			if (ret_cluster_try_push == 0) {
				/*
				 * no more room in the normal cluster mechanism
				 * so let's switch to the more expansive but expensive
				 * sparse mechanism....
				 */
				sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
				sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

				lck_mtx_unlock(&wbp->cl_lockw);

				continue;
			}
start_new_cluster:
			wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
			wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

			wbp->cl_clusters[wbp->cl_number].io_flags = 0;

			if (flags & IO_NOCACHE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;

			if (bflag & CL_PASSIVE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;

			wbp->cl_number++;
delay_io:
			lck_mtx_unlock(&wbp->cl_lockw);

			continue;
issue_io:
			/*
			 * we don't hold the lock at this point
			 *
			 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
			 * so that we correctly deal with a change in state of the hardware modify bit...
			 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
			 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
			 * responsible for generating the correct sized I/O(s)
			 */
			retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);

	return (retval);
}


int
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
}
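/*
 * Hedged usage sketch (not part of the original source): a filesystem's
 * VNOP_READ handler typically funnels into this module roughly as shown
 * below.  The example function, its arguments and the way the file size is
 * obtained are assumptions made for illustration only.
 */
#if 0	/* illustrative only, not compiled */
static int
example_vnop_read(vnode_t vp, struct uio *uio, int ioflag, off_t filesize)
{
	/* cached vs. uncached behavior is selected via the IO_* flags */
	return cluster_read_ext(vp, uio, filesize, ioflag, NULL, NULL);
}
#endif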
int
cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int		retval = 0;
	int		flags;
	u_int32_t	io_size;
	user_ssize_t	cur_resid;
	u_int32_t	read_length = 0;
	int		read_type = IO_COPY;

	flags = xflags;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;
	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
		flags |= IO_RAOFF;

	/*
	 * If we're doing an encrypted IO, then first check to see
	 * if the IO requested was page aligned.  If not, then bail
	 * out.
	 */
	if (flags & IO_ENCRYPTED) {
		if (read_length & PAGE_MASK) {
			retval = EINVAL;
			return retval;
		}
	}
	/*
	 * do a read through the cache if one of the following is true....
	 *   NOCACHE is not true
	 *   the uio request doesn't target USERSPACE
	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	 * Reading encrypted data from a CP filesystem should never result in the data touching
	 * the UBC.
	 *
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		retval = cluster_io_type(uio, &read_type, &read_length, 0);
	}
	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {

		switch (read_type) {

		case IO_COPY:
			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
				io_size = MAX_IO_REQUEST_SIZE;
			else
				io_size = (u_int32_t)cur_resid;

			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
			break;

		case IO_DIRECT:
			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
			break;

		case IO_CONTIG:
			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
			break;

		case IO_UNKNOWN:
			retval = cluster_io_type(uio, &read_type, &read_length, 0);
			break;
		}
	}
	return retval;
}
static void
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
{
	int range;
	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

	if ((range = last_pg - start_pg)) {
		if (take_reference)
			abort_flags |= UPL_ABORT_REFERENCE;

		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
	}
}
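/*
 * Illustrative note (not in the original source): when take_reference is set,
 * the pages are released with UPL_ABORT_REFERENCE, which is intended to leave
 * them looking recently used so that data already in the cache ages out more
 * slowly; callers that want the pages treated as cold pass take_reference == 0.
 */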
static int
cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	vm_offset_t	 upl_offset;
	off_t		 last_ioread_offset;
	off_t		 last_request_offset;
	u_int32_t	 size_of_prefetch;
	u_int32_t	 max_rd_size;
	u_int32_t	 max_io_size;
	u_int32_t	 max_prefetch;
	u_int		 rd_ahead_enabled = 1;
	u_int		 prefetch_enabled = 1;
	struct cl_readahead *	rap;
	struct clios		iostate;
	struct cl_extent	extent;
	int		 bflag;
	int		 take_reference = 1;
	int		 policy = IOPOL_DEFAULT;
	boolean_t	 iolock_inited = FALSE;
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
		     (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);

	if (flags & IO_ENCRYPTED) {
		panic ("encrypted blocks will hit UBC!");
	}

	policy = proc_get_task_selfdiskacc();

	if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE))
		take_reference = 0;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD));
	max_rd_size = max_prefetch;

	last_request_offset = uio->uio_offset + io_req_size;

	if (last_request_offset > filesize)
		last_request_offset = filesize;
	if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
		rd_ahead_enabled = 0;
		rap = NULL;
	} else {
		if (cluster_hard_throttle_on(vp, 1)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = THROTTLE_MAX_IOSIZE;
		}
		if ((rap = cluster_get_rap(vp)) == NULL)
			rd_ahead_enabled = 0;
		else {
			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
		}
	}
	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up its extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset)
			last_ioread_offset = (off_t)0;
		else if (last_ioread_offset > last_request_offset)
			last_ioread_offset = last_request_offset;
	} else
		last_ioread_offset = (off_t)0;
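	/*
	 * Worked example (not in the original source): if the previous read
	 * system call left rap->cl_maxra at page 99, then with 4KB pages
	 * last_ioread_offset starts at (99 * 4096) + 4096 = 409600, the first
	 * byte past the data already in flight; if that lands before the
	 * current request it is treated as no pipeline (reset to 0), and if it
	 * lands past the request it is clipped to last_request_offset.
	 */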
	while (io_req_size && uio->uio_offset < filesize && retval == 0) {

		max_size = filesize - uio->uio_offset;

		if ((off_t)(io_req_size) < max_size)
			io_size = io_req_size;
		else
			io_size = max_size;

		if (!(flags & IO_NOCACHE)) {
			while (io_size) {
				u_int32_t io_resid;
				u_int32_t io_requested;

				/*
				 * if we keep finding the pages we need already in the cache, then
				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
				 * to determine that we have all the pages we need... once we miss in
				 * the cache and have issued an I/O, then we'll assume that we're likely
				 * to continue to miss in the cache and it's to our advantage to try and prefetch
				 */
				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
						/*
						 * we've already issued I/O for this request and
						 * there's still work to do and
						 * our prefetch stream is running dry, so issue a
						 * pre-fetch I/O... the I/O latency will overlap
						 * with the copying of the data
						 */
						if (size_of_prefetch > max_rd_size)
							size_of_prefetch = max_rd_size;

						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

						if (last_ioread_offset > last_request_offset)
							last_ioread_offset = last_request_offset;
					}
				}
				/*
				 * limit the size of the copy we're about to do so that
				 * we can notice that our I/O pipe is running dry and
				 * get the next I/O issued before it does go dry
				 */
				if (last_ioread_offset && io_size > (max_io_size / 4))
					io_resid = (max_io_size / 4);
				else
					io_resid = io_size;

				io_requested = io_resid;

				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);

				xsize = io_requested - io_resid;

				io_size     -= xsize;
				io_req_size -= xsize;

				if (retval || io_resid)
					/*
					 * if we run into a real error or
					 * a page that is not in the cache
					 * we need to leave streaming mode
					 */
					break;
				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
					/*
					 * we're already finished the I/O for this read request
					 * let's see if we should do a read-ahead
					 */
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}
			}
			if (retval)
				break;

			if (io_size == 0) {
				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr)
						rap->cl_maxra = 0;
					rap->cl_lastr = extent.e_addr;
				}
				break;
			}
			/*
			 * recompute max_size since cluster_copy_ubc_data_internal
			 * may have advanced uio->uio_offset
			 */
			max_size = filesize - uio->uio_offset;
		}
		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_hard_throttle_on(vp, 0) == 2) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					retval = EAGAIN;
					break;
				}
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size)
			io_size = max_rd_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size)
				upl_size = max_io_size;
		} else {
			if (upl_size > max_io_size / 4)
				upl_size = max_io_size / 4;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);
		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_FILE_IO | UPL_SET_LITE);
		if (kret != KERN_SUCCESS)
			panic("cluster_read_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);
		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
			if (rap) {
				if (extent.e_addr < rap->cl_maxra) {
					/*
					 * we've just issued a read for a block that should have been
					 * in the cache courtesy of the read-ahead engine... something
					 * has gone wrong with the pipeline, so reset the read-ahead
					 * logic which will cause us to restart from scratch
					 */
					rap->cl_maxra = 0;
				}
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, then copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
				if (!upl_valid_page(pl, uio_last))
					break;
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 * unchanged now, so that any prefetch/readahead can
				 * pick them up later
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size)
				val_size = max_size;

			if (val_size > io_req_size)
				val_size = io_req_size;

			if ((uio->uio_offset + val_size) > last_ioread_offset)
				last_ioread_offset = uio->uio_offset + val_size;
			if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {

				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}
			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled)
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);

				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr)
						rap->cl_maxra = 0;
					rap->cl_lastr = extent.e_addr;
				}
			}
			if (iostate.io_issued > iostate.io_completed)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

			if (iostate.io_error)
				error = iostate.io_error;
			else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			if (iostate.io_issued > iostate.io_completed)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else {
				int	commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference)
					commit_flags |= UPL_COMMIT_INACTIVATE;
				else
					commit_flags |= UPL_COMMIT_SPECULATE;

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error)
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
			else {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
					     upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (retval == 0)
			retval = error;

		if (io_req_size) {
			if (cluster_hard_throttle_on(vp, 1)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;

				max_rd_size = THROTTLE_MAX_IOSIZE;
			} else {
				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
					/*
					 * coming out of throttled state
					 */
					if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) {
						if (rap != NULL)
							rd_ahead_enabled = 1;
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		if (iostate.io_issued > iostate.io_completed) {
			/*
			 * cluster_io returned an error after it
			 * had already issued some I/O.  we need
			 * to wait for that I/O to complete before
			 * we can destroy the iostate mutex...
			 * 'retval' already contains the early error
			 * so no need to pick it up from iostate.io_error
			 */
			cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return (retval);
}
static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_t            upl;
	upl_page_info_t  *pl;
	vm_offset_t      upl_offset, vector_upl_offset = 0;
	upl_size_t       upl_size, vector_upl_size = 0;
	vm_size_t        upl_needed_size;
	unsigned int     pages_in_pl;
	int              upl_flags;
	kern_return_t    kret;
	unsigned int     i;
	int              force_data_sync;
	int              retval = 0;
	int              no_zero_fill = 0;
	int              io_flag = 0;
	int              misaligned = 0;
	struct clios     iostate;
	user_addr_t      iov_base;
	u_int32_t        io_req_size;
	u_int32_t        offset_in_file;
	u_int32_t        offset_in_iovbase;
	u_int32_t        io_size;
	u_int32_t        io_min;
	u_int32_t        io_start;
	u_int32_t        xsize;
	off_t            max_io_size;
	u_int32_t        devblocksize;
	u_int32_t        mem_alignment_mask;
	u_int32_t        max_upl_size;
	u_int32_t        max_rd_size;
	u_int32_t        max_rd_ahead;
	u_int32_t        max_vector_size;
	boolean_t        strict_uncached_IO = FALSE;
	boolean_t        io_throttled = FALSE;

	u_int32_t        vector_upl_iosize = 0;
	int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t            v_upl_uio_offset = 0;
	int              vector_upl_index = 0;
	upl_t            vector_upl = NULL;
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);

	max_rd_size = max_upl_size;
	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);

	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_ENCRYPTED) {
		io_flag |= CL_RAW_ENCRYPTED;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
		     (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

	strict_uncached_IO = ubc_strict_uncached_IO(vp);
	io_req_size = *read_length;
	iov_base = uio_curriovbase(uio);

	max_io_size = filesize - uio->uio_offset;

	if ((off_t)io_req_size > max_io_size)
		io_req_size = max_io_size;

	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		misaligned = 1;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		misaligned = 1;
	}

	/*
	 * The user must request IO in aligned chunks.  If the
	 * offset into the file is bad, or the userland pointer
	 * is non-aligned, then we cannot service the encrypted IO request.
	 */
	if ((flags & IO_ENCRYPTED) && (misaligned)) {
		retval = EINVAL;
	}
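	/*
	 * Worked example (not in the original source): with devblocksize = 512
	 * and mem_alignment_mask = 3, a read at file offset 1000 gives
	 * offset_in_file = 1000 & 511 = 488, so the request is flagged as
	 * misaligned and serviced through the cache instead of the direct
	 * path; an iov_base of 0x1004 would pass the mem_alignment_mask test
	 * but still fail the devblocksize test above.
	 */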
	/*
	 * When we get to this point, we know...
	 *  -- the offset into the file is on a devblocksize boundary
	 */

	while (io_req_size && retval == 0) {

		if (cluster_hard_throttle_on(vp, 1)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			max_rd_size  = THROTTLE_MAX_IOSIZE;
			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
			max_vector_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_rd_size  = max_upl_size;
			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
			max_vector_size = MAX_VECTOR_UPL_SIZE;
		}
		io_start = io_size = io_req_size;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.  But only do this
		 * check if we are not retrieving encrypted data directly
		 * from the filesystem;  those blocks should never
		 * be in the UBC.
		 *
		 * cluster_copy_ubc_data returns the resid
		 * in io_size
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
		}
		/*
		 * calculate the number of bytes actually copied
		 * starting size - residual
		 */
		xsize = io_start - io_size;

		io_req_size -= xsize;
		if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
			/*
			 * We found something in the cache or we have an iov_base that's not
			 * page-aligned.
			 *
			 * Issue all I/O's that have been collected within this Vectored UPL.
			 */
			if(vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			if(retval)
				break;

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}
		/*
		 * check to see if we are finished with this request.
		 *
		 * If we satisfied this IO already, then io_req_size will be 0.
		 * Otherwise, see if the IO was mis-aligned and needs to go through
		 * the UBC to deal with the 'tail'.
		 *
		 */
		if (io_req_size == 0 || (misaligned)) {
			/*
			 * see if there's another uio vector to
			 * process that's of type IO_DIRECT
			 *
			 * break out of while loop to get there
			 */
			break;
		}
		/*
		 * assume the request ends on a device block boundary
		 */
		io_min = devblocksize;
		/*
		 * we can handle I/O's in multiples of the device block size
		 * however, if io_size isn't a multiple of devblocksize we
		 * want to clip it back to the nearest page boundary since
		 * we are going to have to go through cluster_read_copy to
		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
		 * multiple, we avoid asking the drive for the same physical
		 * blocks twice.. once for the partial page at the end of the
		 * request and a 2nd time for the page we read into the cache
		 * (which overlaps the end of the direct read) in order to
		 * get at the overhang bytes
		 */
		if (io_size & (devblocksize - 1)) {
			if (flags & IO_ENCRYPTED) {
				/*
				 * Normally, we'd round down to the previous page boundary to
				 * let the UBC manage the zero-filling of the file past the EOF.
				 * But if we're doing encrypted IO, we can't let any of
				 * the data hit the UBC.  This means we have to do the full
				 * IO to the upper block boundary of the device block that
				 * contains the EOF. The user will be responsible for not
				 * interpreting data PAST the EOF in its buffer.
				 *
				 * So just bump the IO back up to a multiple of devblocksize
				 */
				io_size = ((io_size + devblocksize) & ~(devblocksize - 1));
			} else {
				/*
				 * Clip the request to the previous page size boundary
				 * since request does NOT end on a device block boundary
				 */
				io_size &= ~PAGE_MASK;
			}
		}
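		/*
		 * Worked example (not in the original source): with
		 * devblocksize = 4096 and io_size = 9000, the encrypted path
		 * computes (9000 + 4096) & ~4095 = 12288, growing the I/O to
		 * the next device block boundary, while the non-encrypted path
		 * clips 9000 down to 8192 (9000 & ~PAGE_MASK with 4KB pages).
		 */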
		if (retval || io_size < io_min) {
			/*
			 * either an error or we only have the tail left to
			 * complete via the copy path...
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dreads;
		}
		/*
		 * Don't re-check the UBC data if we are looking for uncached IO
		 * or asking for encrypted blocks.
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {

			if ((xsize = io_size) > max_rd_size)
				xsize = max_rd_size;

			io_size = 0;

			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);

			if (io_size == 0) {
				/*
				 * a page must have just come into the cache
				 * since the first page in this range is no
				 * longer absent, go back and re-evaluate
				 */
				continue;
			}
		}
4297                 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
4298                         if (cluster_hard_throttle_on(vp, 0) == 2) {
4299                                 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
4301                                          * we're in the throttle window and at least 1 I/O
4302                                          * has already been issued by a throttleable thread
4303                                          * in this window, so return with EAGAIN to indicate
4304                                          * to the FS issuing the cluster_read call that it
4305                                          * should now throttle after dropping any locks
4307                                         throttle_info_update_by_mount(vp->v_mount);
4309                                         io_throttled = TRUE;
4310                                         goto wait_for_dreads;
4314                 if (io_size > max_rd_size)
4315                         io_size = max_rd_size;
4317                 iov_base = uio_curriovbase(uio);
4319                 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4320                 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4322                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4323                              (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4325                 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
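		/*
		 * Illustrative note (not in the original source): upl_offset is the offset of
		 * the user buffer within its first page, and upl_needed_size rounds the wired
		 * region up to whole pages.  With 4KB pages, iov_base == 0x1200 and
		 * io_size == 0x2000 give upl_offset == 0x200 and upl_needed_size ==
		 * (0x200 + 0x2000 + 0xfff) & ~0xfff == 0x3000 (three pages).
		 */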
4330                 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4332                         upl_size = upl_needed_size;
4333                         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4336                                 upl_flags |= UPL_NOZEROFILL;
4337                         if (force_data_sync)
4338                                 upl_flags |= UPL_FORCE_DATA_SYNC;
4340                         kret = vm_map_create_upl(current_map(),
4341                                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4342                                                  &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
4344                         if (kret != KERN_SUCCESS) {
4345                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4346                                              (int)upl_offset, upl_size, io_size, kret, 0);
4348                                  * failed to get pagelist
4350                                  * we may have already spun some portion of this request
4351                                  * off as async requests... we need to wait for the I/O
4352                                  * to complete before returning
4354                                 goto wait_for_dreads;
4356                         pages_in_pl = upl_size / PAGE_SIZE;
4357                         pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4359                         for (i = 0; i < pages_in_pl; i++) {
4360                                 if (!upl_page_present(pl, i))
4363                         if (i == pages_in_pl)
4366                         ubc_upl_abort(upl, 0);
4368                 if (force_data_sync >= 3) {
4369                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4370                                      (int)upl_offset, upl_size, io_size, kret, 0);
4372                         goto wait_for_dreads;
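		/*
		 * Note (editorial, not in the original source): the loop above makes up to
		 * three attempts to build a UPL in which every page of the user buffer is
		 * present, escalating to UPL_FORCE_DATA_SYNC on the retries; if that still
		 * fails, the remainder of the request falls through wait_for_dreads and is
		 * eventually completed via the copy path.
		 */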
4375                  * Consider the possibility that upl_size wasn't satisfied.
4377                 if (upl_size < upl_needed_size) {
4378                         if (upl_size && upl_offset == 0)
4384                         ubc_upl_abort(upl, 0);
4385                         goto wait_for_dreads;
4387                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4388                              (int)upl_offset, upl_size, io_size, kret, 0);
4391                         vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4395                          * After this point, if we are using a vector UPL, then
4396                          * either all the UPL elements end on a page boundary OR
4397                          * this UPL is the last element because it does not end
4398                          * on a page boundary.
4403                  * request asynchronously so that we can overlap
4404                  * the preparation of the next I/O
4405                  * if there are already too many outstanding reads
4406                  * wait until some have completed before issuing the next read
4408                 if (iostate.io_issued > iostate.io_completed)
4409                         cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4411                 if (iostate.io_error) {
4413                          * one of the earlier reads we issued ran into a hard error
4414                          * don't issue any more reads, cleanup the UPL
4415                          * that was just created but not used, then
4416                          * go wait for any other reads to complete before
4417                          * returning the error to the caller
4419                         ubc_upl_abort(upl, 0);
4421                         goto wait_for_dreads;
4423                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4424                              upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4429                                 io_flag &= ~CL_PRESERVE;
4431                                 io_flag |= CL_PRESERVE;
4433                         retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4437                         if (!vector_upl_index) {
4438                                 vector_upl = vector_upl_create(upl_offset);
4439                                 v_upl_uio_offset = uio->uio_offset;
4440                                 vector_upl_offset = upl_offset;
4443                         vector_upl_set_subupl(vector_upl, upl, upl_size);
4444                         vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4446                         vector_upl_size += upl_size;
4447                         vector_upl_iosize += io_size;
4449                         if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4450                                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4451                                 reset_vector_run_state();
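			/*
			 * Note (editorial, not in the original source): in the vector-UPL case the
			 * sub-UPLs are batched, and a single vector_cluster_io is issued once the
			 * element count reaches MAX_VECTOR_UPL_ELEMENTS, the accumulated size
			 * reaches max_vector_size, or a sub-UPL ends off a page boundary
			 * (issueVectorUPL); reset_vector_run_state then starts a new batch.
			 */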
4455                  * update the uio structure
4457                 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
4458                         uio_update(uio, (user_size_t)max_io_size);
4461                         uio_update(uio, (user_size_t)io_size);
4464                  * Under normal circumstances, the io_size should not be
4465                  * bigger than the io_req_size, but we may have had to round up
4466                  * to the end of the page in the encrypted IO case.  In that case only,
4467                  * ensure that we only decrement io_req_size to 0.
4469                 if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) {
4473                         io_req_size -= io_size;
4476                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
4477                              upl, (int)uio->uio_offset, io_req_size, retval, 0);
4481         if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
4483                 retval = cluster_io_type(uio, read_type, read_length, 0);
4485                 if (retval == 0 && *read_type == IO_DIRECT) {
4487                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4488                                      (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4496         if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
4497                 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4498                 reset_vector_run_state();
4501          * make sure all async reads that are part of this stream
4502          * have completed before we return
4504         if (iostate.io_issued > iostate.io_completed)
4505                 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
4507         if (iostate.io_error)
4508                 retval = iostate.io_error;
4510         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4512         if (io_throttled == TRUE && retval == 0)
4515         if (io_req_size && retval == 0) {
4517                  * we couldn't handle the tail of this request in DIRECT mode
4518                  * so fire it through the copy path
4520                 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
4522                 *read_type = IO_UNKNOWN;
4524         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
4525                      (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
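	/*
	 * Note (editorial, not in the original source): any portion of the request that
	 * could not be handled in DIRECT mode (io_req_size still non-zero once the async
	 * reads have drained) is completed through cluster_read_copy, and *read_type is
	 * reset to IO_UNKNOWN so the uio is re-classified on the next pass.
	 */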
4532 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4533                     int (*callback)(buf_t, void *), void *callback_arg, int flags)
4535         upl_page_info_t *pl;
4536         upl_t            upl[MAX_VECTS];
4537         vm_offset_t      upl_offset;
4538         addr64_t         dst_paddr = 0;
4539         user_addr_t      iov_base;
4541         upl_size_t       upl_size;
4542         vm_size_t        upl_needed_size;
4543         mach_msg_type_number_t  pages_in_pl;
4546         struct clios     iostate;
4553         u_int32_t        devblocksize;
4554         u_int32_t        mem_alignment_mask;
4555         u_int32_t        tail_size = 0;
4558         if (flags & IO_PASSIVE)
4563         if (flags & IO_NOCACHE)
4564                 bflag |= CL_NOCACHE;
4567          * When we enter this routine, we know
4568          *  -- the read_length will not exceed the current iov_len
4569          *  -- the target address is physically contiguous for read_length
4571         cluster_syncup(vp, filesize, callback, callback_arg);
4573         devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4574         mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4576         iostate.io_completed = 0;
4577         iostate.io_issued = 0;
4578         iostate.io_error = 0;
4579         iostate.io_wanted = 0;
4581         lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4584         io_size = *read_length;
4586         max_size = filesize - uio->uio_offset;
4588         if (io_size > max_size)
4591         iov_base = uio_curriovbase(uio);
4593         upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4594         upl_needed_size = upl_offset + io_size;
4597         upl_size = upl_needed_size;
4598         upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4601         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
4602                      (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
4604         kret = vm_map_get_upl(current_map(),
4605                               (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4606                               &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
4608         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
4609                      (int)upl_offset, upl_size, io_size, kret, 0);
4611         if (kret != KERN_SUCCESS) {
4613                  * failed to get pagelist
4616                 goto wait_for_creads;
4620         if (upl_size < upl_needed_size) {
4622                  * The upl_size wasn't satisfied.
4625                 goto wait_for_creads;
4627         pl = ubc_upl_pageinfo(upl[cur_upl]);
4629         dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
4631         while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
4632                 u_int32_t   head_size;
4634                 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
4636                 if (head_size > io_size)
4637                         head_size = io_size;
4639                 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
4642                         goto wait_for_creads;
4644                 upl_offset += head_size;
4645                 dst_paddr  += head_size;
4646                 io_size    -= head_size;
4648                 iov_base   += head_size;
4650         if ((u_int32_t)iov_base & mem_alignment_mask) {
4652                  * request doesn't set up on a memory boundary
4653                  * the underlying DMA engine can handle...
4654                  * return an error instead of going through
4655                  * the slow copy path since the intent of this
4656                  * path is direct I/O to device memory
4659                 goto wait_for_creads;
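	/*
	 * Note (editorial, not in the original source): the loop above uses
	 * cluster_align_phys_io to bounce the sub-devblocksize 'head' of the request
	 * through a single UBC page; the main loop below then issues CL_DEV_MEMORY
	 * reads in MAX_IO_CONTIG_SIZE chunks, and any sub-block 'tail' is handled the
	 * same way once the async reads have drained.
	 */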
4662         tail_size = io_size & (devblocksize - 1);
4664         io_size  -= tail_size;
4666         while (io_size && error == 0) {
4668                 if (io_size > MAX_IO_CONTIG_SIZE)
4669                         xsize = MAX_IO_CONTIG_SIZE;
4673                  * request asynchronously so that we can overlap
4674                  * the preparation of the next I/O... we'll do
4675                  * the commit after all the I/O has completed
4676                  * since it's all issued against the same UPL
4677                  * if there are already too many outstanding reads
4678                  * wait until some have completed before issuing the next
4680                 if (iostate.io_issued > iostate.io_completed)
4681                         cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
4683                 if (iostate.io_error) {
4685                          * one of the earlier reads we issued ran into a hard error
4686                          * don't issue any more reads...
4687                          * go wait for any other reads to complete before
4688                          * returning the error to the caller
4690                         goto wait_for_creads;
4692                 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
4693                                    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
4694                                    (buf_t)NULL, &iostate, callback, callback_arg);
4696                  * The cluster_io read was issued successfully,
4697                  * update the uio structure
4700                         uio_update(uio, (user_size_t)xsize);
4703                         upl_offset += xsize;
4707         if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
4709                 error = cluster_io_type(uio, read_type, read_length, 0);
4711                 if (error == 0 && *read_type == IO_CONTIG) {
4716                 *read_type = IO_UNKNOWN;
4720          * make sure all async reads that are part of this stream
4721          * have completed before we proceed
4723         if (iostate.io_issued > iostate.io_completed)
4724                 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
4726         if (iostate.io_error)
4727                 error = iostate.io_error;
4729         lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4731         if (error == 0 && tail_size)
4732                 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
4734         for (n = 0; n < num_upl; n++)
4736                  * just release our hold on each physically contiguous
4737                  * region without changing any state
4739                 ubc_upl_abort(upl[n], 0);
4746 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
4748         user_size_t      iov_len;
4749         user_addr_t      iov_base = 0;
4751         upl_size_t       upl_size;
4756          * skip over any empty vectors
4758         uio_update(uio, (user_size_t)0);
4760         iov_len = uio_curriovlen(uio);
4762         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
4765                 iov_base = uio_curriovbase(uio);
4767                  * make sure the size of the vector isn't too big...
4768                  * internally, we want to handle all of the I/O in
4769                  * chunk sizes that fit in a 32 bit int
4771                 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
4772                         upl_size = MAX_IO_REQUEST_SIZE;
4774                         upl_size = (u_int32_t)iov_len;
4776                 upl_flags = UPL_QUERY_OBJECT_TYPE;
4778                 if ((vm_map_get_upl(current_map(),
4779                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4780                                     &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
4782                          * the user app must have passed in an invalid address
4789                 *io_length = upl_size;
4791                 if (upl_flags & UPL_PHYS_CONTIG)
4792                         *io_type = IO_CONTIG;
4793                 else if (iov_len >= min_length)
4794                         *io_type = IO_DIRECT;
4799                  * nothing left to do for this uio
4802                 *io_type   = IO_UNKNOWN;
4804         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
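/*
 * Note (editorial, not in the original source): cluster_io_type classifies the
 * current iovec by querying the VM map with UPL_QUERY_OBJECT_TYPE: physically
 * contiguous memory yields IO_CONTIG, anything else of at least min_length yields
 * IO_DIRECT, and an exhausted uio yields IO_UNKNOWN.
 */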
4811  * generate advisory I/O's in the largest chunks possible
4812  * the completed pages will be released into the VM cache
4815 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
4817         return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
4821 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
4823         upl_page_info_t *pl;
4825         vm_offset_t      upl_offset;
4838         uint32_t         max_io_size;
4841         if ( !UBCINFOEXISTS(vp))
4847         max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4850         if (max_io_size > speculative_prefetch_max_iosize)
4851                 max_io_size = speculative_prefetch_max_iosize;
4853         if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) {
4854                 if (max_io_size > speculative_prefetch_max_iosize)
4855                         max_io_size = speculative_prefetch_max_iosize;
4859         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
4860                      (int)f_offset, resid, (int)filesize, 0, 0);
4862         while (resid && f_offset < filesize && retval == 0) {
4864                  * compute the size of the upl needed to encompass
4865                  * the requested read... limit each call to cluster_io
4866                  * to the maximum UPL size... cluster_io will clip if
4867                  * this exceeds the maximum io_size for the device,
4868                  * make sure to account for
4869                  * a starting offset that's not page aligned
4871                 start_offset = (int)(f_offset & PAGE_MASK_64);
4872                 upl_f_offset = f_offset - (off_t)start_offset;
4873                 max_size     = filesize - f_offset;
4875                 if (resid < max_size)
4880                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4881                 if ((uint32_t)upl_size > max_io_size)
4882                         upl_size = max_io_size;
4886                  * return the number of contiguously present pages in the cache
4887                  * starting at upl_f_offset within the file
4889                 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
4893                          * skip over pages already present in the cache
4895                         io_size  = skip_range - start_offset;
4897                         f_offset += io_size;
4900                         if (skip_range == upl_size)
4903                          * have to issue some real I/O
4904                          * at this point, we know it's starting on a page boundary
4905                          * because we've skipped over at least the first page in the request
4908                         upl_f_offset += skip_range;
4909                         upl_size     -= skip_range;
4911                 pages_in_upl = upl_size / PAGE_SIZE;
4913                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
4914                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
4916                 kret = ubc_create_upl(vp,
4921                                       UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
4922                 if (kret != KERN_SUCCESS)
4927                  * before we start marching forward, we must make sure we end on
4928                  * a present page, otherwise we will be working with a freed
4931                 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4932                         if (upl_page_present(pl, last_pg))
4935                 pages_in_upl = last_pg + 1;
4938                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
4939                              upl, (int)upl_f_offset, upl_size, start_offset, 0);
4942                 for (last_pg = 0; last_pg < pages_in_upl; ) {
4944                          * scan from the beginning of the upl looking for the first
4945                          * page that is present.... this will become the first page in
4946                          * the request we're going to make to 'cluster_io'... if all
4947                          * of the pages are absent, we won't call through to 'cluster_io'
4949                         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4950                                 if (upl_page_present(pl, start_pg))
4955                          * scan from the starting present page looking for an absent
4956                          * page before the end of the upl is reached, if we
4957                          * find one, then it will terminate the range of pages being
4958                          * presented to 'cluster_io'
4960                         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4961                                 if (!upl_page_present(pl, last_pg))
4965                         if (last_pg > start_pg) {
4967                                  * we found a range of pages that must be filled
4968                                  * if the last page in this range is the last page of the file
4969                                  * we may have to clip the size of it to keep from reading past
4970                                  * the end of the last physical block associated with the file
4972                                 upl_offset = start_pg * PAGE_SIZE;
4973                                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
4975                                 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
4976                                         io_size = filesize - (upl_f_offset + upl_offset);
4979                                  * issue an asynchronous read to cluster_io
4981                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4982                                                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4988                         ubc_upl_abort(upl, 0);
4990                 io_size = upl_size - start_offset;
4992                 if (io_size > resid)
4994                 f_offset += io_size;
4998         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
4999                      (int)f_offset, resid, retval, 0, 0);
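/*
 * Note (editorial, not in the original source): advisory_read_ext only touches
 * pages that are not already cached: ubc_range_op(UPL_ROP_PRESENT) skips runs that
 * are resident, the UPL is created with UPL_RET_ONLY_ABSENT, and each run of
 * returned pages is read with CL_ASYNC | CL_READ | CL_COMMIT so the pages land in
 * the cache and are released when the I/O completes.
 */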
5006 cluster_push(vnode_t vp, int flags)
5008         return cluster_push_ext(vp, flags, NULL, NULL);
5013 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5016         int     my_sparse_wait = 0;
5017         struct  cl_writebehind *wbp;
5019         if ( !UBCINFOEXISTS(vp)) {
5020                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0);
5023         /* return if deferred write is set */
5024         if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5027         if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5028                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0);
5031         if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5032                 lck_mtx_unlock(&wbp->cl_lockw);
5034                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0);
5037         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5038                      wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5041          * if we have an fsync in progress, we don't want to allow any additional
5042          * sync/fsync/close(s) to occur until it finishes.
5043          * note that it's possible for writes to continue to occur to this file
5044          * while we're waiting and also once the fsync starts to clean if we're
5045          * in the sparse map case
5047         while (wbp->cl_sparse_wait) {
5048                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0);
5050                 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5052                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0);
5054         if (flags & IO_SYNC) {
5056                 wbp->cl_sparse_wait = 1;
5059                  * this is an fsync (or equivalent)... we must wait for any existing async
5060                  * cleaning operations to complete before we evaluate the current state
5061                  * and finish cleaning... this ensures that all writes issued before this
5062                  * fsync actually get cleaned to the disk before this fsync returns
5064                 while (wbp->cl_sparse_pushes) {
5065                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0);
5067                         msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5069                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0);
5072         if (wbp->cl_scmap) {
5075                 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5077                         scmap = wbp->cl_scmap;
5078                         wbp->cl_scmap = NULL;
5080                         wbp->cl_sparse_pushes++;
5082                         lck_mtx_unlock(&wbp->cl_lockw);
5084                         sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg);
5086                         lck_mtx_lock(&wbp->cl_lockw);
5088                         wbp->cl_sparse_pushes--;
5090                         if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
5091                                 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5093                         sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg);
5097                 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg);
5099         lck_mtx_unlock(&wbp->cl_lockw);
5101         if (flags & IO_SYNC)
5102                 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5104         if (my_sparse_wait) {
5106                  * I'm the owner of the serialization token
5107                  * clear it and wakeup anyone that is waiting
5110                 lck_mtx_lock(&wbp->cl_lockw);
5112                 wbp->cl_sparse_wait = 0;
5113                 wakeup((caddr_t)&wbp->cl_sparse_wait);
5115                 lck_mtx_unlock(&wbp->cl_lockw);
5117         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5118                      wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
5124 __private_extern__ void
5125 cluster_release(struct ubc_info *ubc)
5127         struct cl_writebehind *wbp;
5128         struct cl_readahead   *rap;
5130         if ((wbp = ubc->cl_wbehind)) {
5132                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5135                         vfs_drt_control(&(wbp->cl_scmap), 0);
5137                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5140         rap = ubc->cl_rahead;
5143                 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5144                 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
5146         if ((rap = ubc->cl_rahead)) {
5147                 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5148                 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
5150         ubc->cl_rahead  = NULL;
5151         ubc->cl_wbehind = NULL;
5153         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5158 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
5165         struct cl_wextent l_clusters[MAX_CLUSTERS];
5166         u_int  max_cluster_pgcount;
5169         max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5171          * the write behind context exists and has
5172          * already been locked...
5174         if (wbp->cl_number == 0)
5176                  * no clusters to push
5177                  * return number of empty slots
5179                 return (MAX_CLUSTERS);
5182          * make a local 'sorted' copy of the clusters
5183          * and clear wbp->cl_number so that new clusters can
5186         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5187                 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5188                         if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
5190                         if (min_index == -1)
5191                                 min_index = cl_index1;
5192                         else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
5193                                 min_index = cl_index1;
5195                 if (min_index == -1)
5198                 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5199                 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5200                 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5202                 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5208         if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) {
5212                  * determine if we appear to be writing the file sequentially
5213                  * if not, by returning without having pushed any clusters
5214                  * we will cause this vnode to be pushed into the sparse cluster mechanism
5215                  * used for managing more random I/O patterns
5217                  * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5218                  * that's why we're in try_push with PUSH_DELAY...
5220                  * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5221                  * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5222                  * so we can just make a simple pass through, up to, but not including the last one...
5223                  * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5226                  * we let the last one be partial as long as it was adjacent to the previous one...
5227                  * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5228                  * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5230                 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5231                         if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
5233                         if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr)
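		/*
		 * Note (editorial, not in the original source): under PUSH_DELAY the sorted
		 * clusters are pushed only if they describe a sequential write pattern,
		 * i.e. each of the first MAX_CLUSTERS - 1 clusters is 'full'
		 * (max_cluster_pgcount pages) and its e_addr equals the next cluster's
		 * b_addr; otherwise the routine returns early and the vnode migrates to the
		 * sparse cluster mechanism.
		 */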
5237         for (cl_index = 0; cl_index < cl_len; cl_index++) {
5239                 struct  cl_extent cl;
5241                 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
5244                  * try to push each cluster in turn...
5246                 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
5247                         flags |= IO_NOCACHE;
5249                 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
5250                         flags |= IO_PASSIVE;
5252                 if (push_flag & PUSH_SYNC)
5255                 cl.b_addr = l_clusters[cl_index].b_addr;
5256                 cl.e_addr = l_clusters[cl_index].e_addr;
5258                 cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
5260                 l_clusters[cl_index].b_addr = 0;
5261                 l_clusters[cl_index].e_addr = 0;
5265                 if ( !(push_flag & PUSH_ALL) )
5269         if (cl_len > cl_pushed) {
5271                 * we didn't push all of the clusters, so
5272                 * let's try to merge them back in to the vnode
5274                 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5276                          * we picked up some new clusters while we were trying to
5277                          * push the old ones... this can happen because I've dropped
5278                          * the vnode lock... the sum of the
5279                          * leftovers plus the new cluster count exceeds our ability
5280                          * to represent them, so switch to the sparse cluster mechanism
5282                          * collect the active public clusters...
5284                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5286                         for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5287                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5289                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5290                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5291                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5296                          * update the cluster count
5298                         wbp->cl_number = cl_index1;
5301                          * and collect the original clusters that were moved into the
5302                          * local storage for sorting purposes
5304                         sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5308                          * we've got room to merge the leftovers back in
5309                          * just append them starting at the next 'hole'
5310                          * represented by wbp->cl_number
5312                         for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5313                                 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5316                                 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5317                                 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5318                                 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5323                          * update the cluster count
5325                         wbp->cl_number = cl_index1;
5328         return (MAX_CLUSTERS - wbp->cl_number);
5334 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5336         upl_page_info_t *pl;
5338         vm_offset_t      upl_offset;
5353         if (flags & IO_PASSIVE)
5358         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
5359                      (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
5361         if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
5362                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
5366         upl_size = pages_in_upl * PAGE_SIZE;
5367         upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5369         if (upl_f_offset + upl_size >= EOF) {
5371                 if (upl_f_offset >= EOF) {
5373                          * must have truncated the file and missed
5374                          * clearing a dangling cluster (i.e. it's completely
5375                          * beyond the new EOF
5377                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
5381                 size = EOF - upl_f_offset;
5383                 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5384                 pages_in_upl = upl_size / PAGE_SIZE;
5388         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
5391          * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
5393          * - only pages that are currently dirty are returned... these are the ones we need to clean
5394          * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
5395          * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
5396          * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
5397          *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
5399          * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
5402         if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
5403                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
5405                 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
5407         kret = ubc_create_upl(vp,
5413         if (kret != KERN_SUCCESS)
5414                 panic("cluster_push: failed to get pagelist");
5416         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
5419          * since we only asked for the dirty pages back
5420          * it's possible that we may only get a few or even none, so...
5421          * before we start marching forward, we must make sure we know
5422          * where the last present page is in the UPL, otherwise we could
5423          * end up working with a freed upl due to the FREE_ON_EMPTY semantics
5424          * employed by commit_range and abort_range.
5426         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5427                 if (upl_page_present(pl, last_pg))
5430         pages_in_upl = last_pg + 1;
5432         if (pages_in_upl == 0) {
5433                 ubc_upl_abort(upl, 0);
5435                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
5439         for (last_pg = 0; last_pg < pages_in_upl; ) {
5441                  * find the next dirty page in the UPL
5442                  * this will become the first page in the
5443                  * next I/O to generate
5445                 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5446                         if (upl_dirty_page(pl, start_pg))
5448                         if (upl_page_present(pl, start_pg))
5450                                  * RET_ONLY_DIRTY will return non-dirty 'precious' pages
5451                                  * just release these unchanged since we're not going
5452                                  * to steal them or change their state
5454                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5456                 if (start_pg >= pages_in_upl)
5458                          * done... no more dirty pages to push
5461                 if (start_pg > last_pg)
5463                          * skipped over some non-dirty pages
5465                         size -= ((start_pg - last_pg) * PAGE_SIZE);
5468                  * find a range of dirty pages to write
5470                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5471                         if (!upl_dirty_page(pl, last_pg))
5474                 upl_offset = start_pg * PAGE_SIZE;
5476                 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
5478                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
5480                 if ( !(flags & IO_SYNC))
5481                         io_flags |= CL_ASYNC;
5483                 if (flags & IO_CLOSE)
5484                         io_flags |= CL_CLOSE;
5486                 if (flags & IO_NOCACHE)
5487                         io_flags |= CL_NOCACHE;
5489                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5490                                     io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5492                 if (error == 0 && retval)
5497         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
5504  * sparse_cluster_switch is called with the write behind lock held
5507 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5511         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0);
5513         for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5515                 struct cl_extent cl;
5517                 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
5519                         if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
5520                                 if (flags & UPL_POP_DIRTY) {
5521                                         cl.e_addr = cl.b_addr + 1;
5523                                         sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
5530         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0);
5535  * sparse_cluster_push must be called with the write-behind lock held if the scmap is
5536  * still associated with the write-behind context... however, if the scmap has been disassociated
5537  * from the write-behind context (the cluster_push case), the wb lock is not held
5540 sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
5542         struct cl_extent cl;
5546         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0);
5548         if (push_flag & PUSH_ALL)
5549                 vfs_drt_control(scmap, 1);
5552                 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
5555                 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
5556                 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
5558                 cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE | IO_CLOSE), callback, callback_arg);
5560                 if ( !(push_flag & PUSH_ALL) )
5563         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
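/*
 * Illustrative note (not in the original source): vfs_drt_get_cluster returns a
 * byte range which is converted to page numbers with e_addr exclusive; e.g. with
 * 4KB pages, offset == 0x100000 and length == 0x3000 give b_addr == 256 and
 * e_addr == 259, i.e. pages 256-258 are handed to cluster_push_now.
 */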
5568  * sparse_cluster_add is called with the write behind lock held
5571 sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5577         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
5579         offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5580         length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
5582         while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
5584                  * no room left in the map
5585                  * only a partial update was done
5586                  * push out some pages and try again
5588                 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
5590                 offset += (new_dirty * PAGE_SIZE_64);
5591                 length -= (new_dirty * PAGE_SIZE);
5593         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
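/*
 * Note (editorial, not in the original source): vfs_drt_mark_pages can fail part
 * way through when the sparse map is full; new_dirty reports how many pages were
 * recorded, so the loop pushes some clusters out and retries with offset/length
 * advanced past the portion already marked.
 */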
5598 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5600         upl_page_info_t  *pl;
5610         if (flags & IO_PASSIVE)
5615         if (flags & IO_NOCACHE)
5616                 bflag |= CL_NOCACHE;
5618         upl_flags = UPL_SET_LITE;
5620         if ( !(flags & CL_READ) ) {
5622                  * "write" operation:  let the UPL subsystem know
5623                  * that we intend to modify the buffer cache pages
5626                 upl_flags |= UPL_WILL_MODIFY;
5629                  * indicate that there is no need to pull the
5630                  * mapping for this page... we're only going
5631                  * to read from it, not modify it.
5633                 upl_flags |= UPL_FILE_IO;
5635         kret = ubc_create_upl(vp,
5636                               uio->uio_offset & ~PAGE_MASK_64,
5642         if (kret != KERN_SUCCESS)
5645         if (!upl_valid_page(pl, 0)) {
5647                  * issue a synchronous read to cluster_io
5649                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5650                                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5652                         ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
5658         ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
5661  *      NOTE:  There is no prototype for the following in BSD. It, and the definitions
5662  *      of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
5663  *      osfmk/ppc/mappings.h.  They are not included here because there appears to be no
5664  *      way to do so without exporting them to kexts as well.
5666         if (flags & CL_READ)
5667 //              copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);    /* Copy physical to physical and flush the destination */
5668                 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);    /* Copy physical to physical and flush the destination */
5670 //              copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);    /* Copy physical to physical and flush the source */
5671                 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);    /* Copy physical to physical and flush the source */
5673         if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
5675                  * issue a synchronous write to cluster_io
5677                 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
5678                                    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5681                 uio_update(uio, (user_size_t)xsize);
5684                 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
5686                 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
5688         ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
5696 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
5704         upl_page_info_t *pl;
5708         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5709                      (int)uio->uio_offset, upl_offset, xsize, 0, 0);
5711         segflg = uio->uio_segflg;
5715           case UIO_USERSPACE32:
5716           case UIO_USERISPACE32:
5717                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5721           case UIO_USERISPACE:
5722                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5725           case UIO_USERSPACE64:
5726           case UIO_USERISPACE64:
5727                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5731                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5735         pl = ubc_upl_pageinfo(upl);
5737         pg_index  = upl_offset / PAGE_SIZE;
5738         pg_offset = upl_offset & PAGE_MASK;
5739         csize     = min(PAGE_SIZE - pg_offset, xsize);
5741         while (xsize && retval == 0) {
5744                 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
5746                 retval = uiomove64(paddr, csize, uio);
5751                 csize     = min(PAGE_SIZE, xsize);
5755         uio->uio_segflg = segflg;
5757         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5758                      (int)uio->uio_offset, xsize, retval, segflg, 0);
5765 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
5768         return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
5773 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
5780         memory_object_control_t  control;
5782         io_size = *io_resid;
5784         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
5785                      (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
5787         control = ubc_getobject(vp, UBC_FLAGS_NONE);
5789         if (control == MEMORY_OBJECT_CONTROL_NULL) {
5790                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5791                              (int)uio->uio_offset, io_size, retval, 3, 0);
5795         segflg = uio->uio_segflg;
5799           case UIO_USERSPACE32:
5800           case UIO_USERISPACE32:
5801                 uio->uio_segflg = UIO_PHYS_USERSPACE32;
5804           case UIO_USERSPACE64:
5805           case UIO_USERISPACE64:
5806                 uio->uio_segflg = UIO_PHYS_USERSPACE64;
5810           case UIO_USERISPACE:
5811                 uio->uio_segflg = UIO_PHYS_USERSPACE;
5815                 uio->uio_segflg = UIO_PHYS_SYSSPACE;
5819         if ( (io_size = *io_resid) ) {
5820                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
5821                 xsize = uio_resid(uio);
5823                 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
5824                                                        start_offset, io_size, mark_dirty, take_reference);
5825                 xsize -= uio_resid(uio);
5828         uio->uio_segflg = segflg;
5829         *io_resid       = io_size;
5831         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
5832                      (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
5839 is_file_clean(vnode_t vp, off_t filesize)
5843         int   total_dirty = 0;
5845         for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
5846                 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
5847                         if (flags & UPL_POP_DIRTY) {
5861  * Dirty region tracking/clustering mechanism. 
5863  * This code (vfs_drt_*) provides a mechanism for tracking and clustering 
5864  * dirty regions within a larger space (file).  It is primarily intended to 
5865  * support clustering in large files with many dirty areas. 
5867  * The implementation assumes that the dirty regions are pages. 
5869  * To represent dirty pages within the file, we store bit vectors in a 
5870  * variable-size circular hash. 
5874  * Bitvector size.  This determines the number of pages we group in a 
5875  * single hashtable entry.  Each hashtable entry is aligned to this 
5876  * size within the file. 
5878 #define DRT_BITVECTOR_PAGES             256 
5881  * File offset handling. 
5883  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; 
5884  * the correct formula is  (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) 
5886 #define DRT_ADDRESS_MASK                (~((1 << 20) - 1)) 
5887 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK) 

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)						\
	do {								\
		(scm)->scm_hashtable[(i)].dhe_control = 0;		\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
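
/*
 * Example (illustrative sketch): the count field doubles as the occupancy
 * indicator.  A bucket whose count reads back as DRT_HASH_COUNT_MASK (0x1ff,
 * i.e. 511) is vacant; an occupied bucket carries a count in the range
 * 0-DRT_BITVECTOR_PAGES (0-256), which fits because 511 > 256.
 */
#if 0	/* illustrative example only, not compiled */
static void
drt_vacancy_example(struct vfs_drt_clustermap *scm)
{
	/* a freshly vacated bucket reads back as vacant... */
	DRT_HASH_VACATE(scm, 0);
	assert(DRT_HASH_VACANT(scm, 0));

	/* ...while an occupied bucket with zero dirty pages does not */
	DRT_HASH_SET_COUNT(scm, 0, 0);
	assert(!DRT_HASH_VACANT(scm, 0));
}
#endif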

#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)

/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
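
/*
 * Example (illustrative arithmetic): each hashtable entry is a 64-bit
 * control word plus a 256-bit bitvector, i.e. 8 + (256 / 32) * 4 = 40 bytes
 * with no padding, so 23 * 40 = 920 bytes leaves 104 spare in the 1024-byte
 * allocation and 401 * 40 = 16040 bytes leaves 344 spare in the 16384-byte
 * allocation.
 */
#if 0	/* illustrative example only, not compiled */
static void
drt_sizing_example(void)
{
	size_t entry_size = sizeof(struct vfs_drt_hashentry);	/* 40 bytes */

	assert(DRT_SMALL_ALLOCATION - DRT_HASH_SMALL_MODULUS * entry_size == 104);
	assert(DRT_LARGE_ALLOCATION - DRT_HASH_LARGE_MODULUS * entry_size == 344);
}
#endif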

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
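
/*
 * Example (illustrative sketch): bit 'bit' of an entry lives in 32-bit word
 * (bit / 32) at position (bit % 32), so page 37 of an entry maps to word 1,
 * bit 5.
 */
#if 0	/* illustrative example only, not compiled */
static void
drt_bitvector_example(struct vfs_drt_clustermap *scm)
{
	DRT_HASH_SET_BIT(scm, 0, 37);
	assert(scm->scm_hashtable[0].dhe_bitvector[1] & (1 << 5));
	assert(DRT_HASH_TEST_BIT(scm, 0, 37));

	DRT_HASH_CLEAR_BIT(scm, 0, 37);
	assert(!DRT_HASH_TEST_BIT(scm, 0, 37));
}
#endif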

struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
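
/*
 * Example (illustrative sketch): the table is keyed by the (aligned) file
 * offset and probed linearly around the ring, so a lookup touches at most
 * scm_modulus slots before concluding the entry is absent.
 */
#if 0	/* illustrative example only, not compiled */
static int
drt_probe_example(struct vfs_drt_clustermap *scm, u_int64_t offset)
{
	u_int32_t	i, index;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(scm, offset);
	for (i = 0; i < scm->scm_modulus; i++) {
		if (DRT_HASH_VACANT(scm, index) ||
		    DRT_HASH_GET_ADDRESS(scm, index) == offset)
			return (int)index;		/* hit, or first free slot */
		index = DRT_HASH_NEXT(scm, index);	/* wrap around the ring */
	}
	return -1;					/* ring is full */
}
#endif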

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */

static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void **cmapp, u_int64_t offset, u_int length, u_int *setcountp, int dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap, int code,
	int arg1, int arg2, int arg3, int arg4);


/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	u_int32_t	i;
	int		nsize, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			/* get a new index for the entry's address */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy the entry across */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return(KERN_SUCCESS);
}

/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
		  (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}

/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index;
	u_int32_t	i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	return(KERN_FAILURE);
}

/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	u_int32_t	i;
	int		index;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant, or occupied with no dirty pages? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}

/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
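
/*
 * Example (illustrative usage sketch): a caller typically records freshly
 * dirtied pages with vfs_drt_mark_pages() and later drains whole clusters
 * for pushing.  The map handle starts out NULL and is managed entirely by
 * the vfs_drt_* routines; vfs_drt_get_cluster() (below) frees it once the
 * map is empty.
 */
#if 0	/* illustrative example only, not compiled */
static void
drt_usage_example(off_t offset, u_int length)
{
	void	*drt_map = NULL;	/* opaque map handle, owned by vfs_drt_* */
	u_int	new_dirty;
	off_t	push_offset;
	u_int	push_length;

	/* note the pages just written as dirty */
	vfs_drt_mark_pages(&drt_map, offset, length, &new_dirty);

	/* later: pull back contiguous dirty runs until the map drains */
	while (vfs_drt_get_cluster(&drt_map, &push_offset, &push_length) == KERN_SUCCESS) {
		/* push [push_offset, push_offset + push_length) to disk here */
	}
}
#endif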

/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	u_int32_t	j;
	int		index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/*  didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty;
	 * emit stats into trace buffer and free the map.
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
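
/*
 * Example (illustrative sketch): vfs_drt_get_cluster() returns one maximal
 * run of consecutive dirty pages from a single hashtable entry and marks
 * that run clean before returning, so repeated calls walk the map run by
 * run.
 */
#if 0	/* illustrative example only, not compiled */
static void
drt_get_cluster_example(void **cmapp)
{
	off_t	offset;
	u_int	length;

	/* suppose only pages 3..6 of the entry at file offset 0 are dirty */
	vfs_drt_mark_pages(cmapp, (off_t)(3 * PAGE_SIZE), 4 * PAGE_SIZE, NULL);

	if (vfs_drt_get_cluster(cmapp, &offset, &length) == KERN_SUCCESS) {
		assert(offset == (off_t)(3 * PAGE_SIZE));
		assert(length == 4 * PAGE_SIZE);
	}
}
#endif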

static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}

/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif

/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}