/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>
#define CL_COMMIT     0x04
#define CL_PAGEOUT    0x10
#define CL_NOZERO     0x80
#define CL_PAGEIN     0x100
#define CL_DEV_MEMORY 0x200
#define CL_PRESERVE   0x400
#define CL_THROTTLE   0x800
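
/*
 * Illustrative sketch only, not part of the original file: a caller such as
 * cluster_pageout() below composes these CL_* bits before handing a request
 * to cluster_io().  This assumes the CL_READ/CL_ASYNC bits defined alongside
 * the flags above; the helper name is hypothetical.
 */
static int
example_compose_pageout_flags(int upl_flags)
{
        int local_flags = CL_PAGEOUT | CL_THROTTLE;

        if ((upl_flags & UPL_IOSYNC) == 0)      /* caller did not ask for synchronous I/O */
                local_flags |= CL_ASYNC;
        if ((upl_flags & UPL_NOCOMMIT) == 0)    /* cluster_io should commit/abort the upl */
                local_flags |= CL_COMMIT;
        return (local_flags);
}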
struct clios {
        u_int  io_completed;       /* amount of io that has currently completed */
        u_int  io_issued;          /* amount of io that was successfully issued */
        int    io_error;           /* error code of first error encountered */
        int    io_wanted;          /* someone is sleeping waiting for a change in state */
};
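
/*
 * Illustrative sketch only, not part of the original file: the issue side
 * throttles itself against a clios stream roughly like this, sleeping on
 * &iostate->io_wanted until cluster_iodone() advances io_completed and
 * issues the wakeup (see cluster_nocopy_write() below).  The helper name
 * is hypothetical.
 */
static int
example_clios_wait(struct clios *iostate, u_int max_outstanding)
{
        while ((iostate->io_issued - iostate->io_completed) > max_outstanding) {
                iostate->io_wanted = 1;
                tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "example_clios_wait", 0);
        }
        /* report the first error recorded by the completion side, if any */
        return (iostate->io_error);
}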
static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
                int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
                off_t oldEOF, off_t newEOF, off_t headOff,
                off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_phys_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
                addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int     ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE  18
#define HARD_THROTTLE_MAXCNT 1
#define HARD_THROTTLE_MAXSIZE (64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
cluster_hard_throttle_on(vp)
        static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

        if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
                struct timeval elapsed;

                if (hard_throttle_on_root)

                timevalsub(&elapsed, &priority_IO_timestamp_for_root);

                if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
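
/*
 * Illustrative sketch only, not part of the original file: this is roughly
 * how cluster_io() below applies the limits above when CL_THROTTLE is set.
 * Page-outs are exempt from the hard throttle; the helper name is
 * hypothetical.
 */
static void
example_pick_throttle(struct vnode *vp, int flags, int *max_iosize, int *async_throttle)
{
        if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
                if (*max_iosize > HARD_THROTTLE_MAXSIZE)
                        *max_iosize = HARD_THROTTLE_MAXSIZE;    /* cap each I/O at 64K */
                *async_throttle = HARD_THROTTLE_MAXCNT;         /* allow only 1 async I/O in flight */
        } else
                *async_throttle = ASYNC_THROTTLE;               /* normal limit of 18 per vnode */
}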
        struct buf   *cbp_head;
        struct buf   *cbp_next;
        struct clios *iostate;

        cbp_head = (struct buf *)(bp->b_trans_head);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
                     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
                /*
                 * all I/O requests that are part of this transaction
                 * have to complete before we can process it
                 */
                if ( !(cbp->b_flags & B_DONE)) {

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

        upl_offset  = cbp->b_uploffset;
        upl         = cbp->b_pagelist;
        b_flags     = cbp->b_flags;
        real_bp     = cbp->b_real_bp;
        zero_offset = cbp->b_validend;
        iostate     = (struct clios *)cbp->b_iostate;

                if ((cbp->b_flags & B_ERROR) && error == 0)
                        error = cbp->b_error;

                total_resid += cbp->b_resid;
                total_size  += cbp->b_bcount;

                cbp_next = cbp->b_trans_next;

                cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
                vp->v_flag &= ~VTHROTTLED;
                wakeup((caddr_t)&vp->v_numoutput);
                /*
                 * someone has issued multiple I/Os asynchronously
                 * and is waiting for them to complete (streaming)
                 */
                if (error && iostate->io_error == 0)
                        iostate->io_error = error;

                iostate->io_completed += total_size;

                if (iostate->io_wanted) {
                        /*
                         * someone is waiting for the state of
                         * this io stream to change
                         */
                        iostate->io_wanted = 0;
                        wakeup((caddr_t)&iostate->io_wanted);

        if ((b_flags & B_NEED_IODONE) && real_bp) {
                        real_bp->b_flags |= B_ERROR;
                        real_bp->b_error = error;

                real_bp->b_resid = total_resid;

        if (error == 0 && total_resid)

        if (b_flags & B_COMMIT_UPL) {
                pg_offset   = upl_offset & PAGE_MASK;
                commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

                if (error || (b_flags & B_NOCACHE)) {

                        if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (b_flags & B_PGIN)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

                        ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)upl, upl_offset - pg_offset, commit_size,
                                     0x80000000|upl_abort_code, 0);

                        int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

                        if (b_flags & B_PHYS) {
                                if (b_flags & B_READ)
                                        upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
                        } else if ( !(b_flags & B_PAGEOUT))
                                upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

                                upl_commit_flags |= UPL_COMMIT_INACTIVATE;

                        ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)upl, upl_offset - pg_offset, commit_size,
                                     upl_commit_flags, 0);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                             (int)upl, upl_offset, 0, error, 0);
cluster_zero(upl, upl_offset, size, bp)
        vm_offset_t   upl_offset;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
                     upl_offset, size, (int)bp, 0, 0);

        if (bp == NULL || bp->b_data == NULL) {

                pl = ubc_upl_pageinfo(upl);

                        page_index  = upl_offset / PAGE_SIZE;
                        page_offset = upl_offset & PAGE_MASK;

                        zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
                        zero_cnt  = min(PAGE_SIZE - page_offset, size);

                        bzero_phys(zero_addr, zero_cnt);

                        upl_offset += zero_cnt;

                bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
                     upl_offset, size, 0, 0, 0);
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
        vm_offset_t   upl_offset;
        int           non_rounded_size;
        struct clios *iostate;

        struct buf   *cbp_head = 0;
        struct buf   *cbp_tail = 0;

                size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
                size = non_rounded_size;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
                     (int)f_offset, size, upl_offset, flags, 0);

        if (flags & CL_READ) {
                io_flags = (B_VECTORLIST | B_READ);

                vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);

                io_flags = (B_VECTORLIST | B_WRITEINPROG);

                vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
        /*
         * make sure the maximum iosize is at least the size of a page
         * and that it is a multiple of the page size
         */
        max_iosize  &= ~PAGE_MASK;

        if (flags & CL_THROTTLE) {
                if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
                        if (max_iosize > HARD_THROTTLE_MAXSIZE)
                                max_iosize = HARD_THROTTLE_MAXSIZE;
                        async_throttle = HARD_THROTTLE_MAXCNT;
                        async_throttle = ASYNC_THROTTLE;

                io_flags |= B_NOCACHE;
        if (flags & CL_PAGEIN)
        if (flags & CL_PAGEOUT)
                io_flags |= B_PAGEOUT;
        if (flags & CL_COMMIT)
                io_flags |= B_COMMIT_UPL;
        if (flags & CL_PRESERVE)

        if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
                /*
                 * then we are going to end up
                 * with a page that we can't complete (the file size wasn't a multiple
                 * of PAGE_SIZE and we're trying to read to the end of the file
                 * so we'll go ahead and zero out the portion of the page we can't
                 * read in from the file
                 */
                zero_offset = upl_offset + non_rounded_size;
                if (size > max_iosize)
                        io_size = max_iosize;

                if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
                        if (error == EOPNOTSUPP)
                                panic("VOP_CMAP Unimplemented");

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
                             (int)f_offset, (int)blkno, io_size, zero_offset, 0);

                if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
                        if (flags & CL_PAGEOUT) {

                        /* Try paging out the page individually before
                           giving up entirely and dumping it (it could
                           be mapped in a "hole" and require allocation) */

                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
                         if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {

                        f_offset   += PAGE_SIZE_64;
                        upl_offset += PAGE_SIZE;

                lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
                /*
                 * we have now figured out how much I/O we can do - this is in 'io_size'
                 * pg_offset is the starting point in the first page for the I/O
                 * pg_count is the number of full and partial pages that 'io_size' encompasses
                 */
                pg_offset = upl_offset & PAGE_MASK;

                if (flags & CL_DEV_MEMORY) {
                        /*
                         * currently, can't deal with reading 'holes' in file
                         */
                        if ((long)blkno == -1) {

                        /*
                         * treat physical requests as one 'giant' page
                         */
                        pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

                if ((flags & CL_READ) && (long)blkno == -1) {
                        /*
                         * if we're reading and blkno == -1, then we've got a
                         * 'hole' in the file that we need to deal with by zeroing
                         * out the affected area in the upl
                         */
                        if (zero_offset && io_size == size) {
                                /*
                                 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
                                 * then 'zero_offset' will be non-zero
                                 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
                                 * (indicated by the io_size finishing off the I/O request for this UPL)
                                 * then we're not going to issue an I/O for the
                                 * last page in this upl... we need to zero both the hole and the tail
                                 * of the page beyond the EOF, since the delayed zero-fill won't kick in
                                 */
                                bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

                                bytes_to_zero = io_size;

                        cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
                                /*
                                 * if there is a current I/O chain pending
                                 * then the first page of the group we just zero'd
                                 * will be handled by the I/O completion if the zero
                                 * fill started in the middle of the page
                                 */
                                pg_count = (io_size - pg_offset) / PAGE_SIZE;
                                /*
                                 * no pending I/O to pick up that first page
                                 * so, we have to make sure it gets committed
                                 * set the pg_offset to 0 so that the upl_commit_range
                                 * starts with this page
                                 */
                                pg_count = (io_size + pg_offset) / PAGE_SIZE;

                        if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
                                /*
                                 * if we're done with the request for this UPL
                                 * then we have to make sure to commit the last page
                                 * even if we only partially zero-filled it
                                 */
                                        pg_resid = PAGE_SIZE - pg_offset;

                                if (flags & CL_COMMIT)
                                        ubc_upl_commit_range(upl,
                                                        (upl_offset + pg_resid) & ~PAGE_MASK,
                                                        pg_count * PAGE_SIZE,
                                                        UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);

                        upl_offset += io_size;

                        if (cbp_head && pg_count)
                } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
                        real_bp->b_blkno = blkno;

                if (pg_count > max_vectors) {
                        io_size -= (pg_count - max_vectors) * PAGE_SIZE;

                                io_size = PAGE_SIZE - pg_offset;

                                pg_count = max_vectors;

                if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
                        /*
                         * if we're not targeting a virtual device i.e. a disk image
                         * it's safe to dip into the reserve pool since real devices
                         * can complete this I/O request without requiring additional
                         * bufs from the alloc_io_buf pool
                         */
                else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
                        /*
                         * Throttle the speculative IO
                         */

                cbp = alloc_io_buf(vp, priv);

                if (flags & CL_PAGEOUT) {
                        for (i = 0; i < pg_count; i++) {

                                if (bp = incore(vp, lblkno + i)) {
                                        if (!ISSET(bp->b_flags, B_BUSY)) {
                                                SET(bp->b_flags, (B_BUSY | B_INVAL));

                                                panic("BUSY bp found in cluster_io");

                if (flags & CL_ASYNC) {
                        cbp->b_flags |= (B_CALL | B_ASYNC);
                        cbp->b_iodone = (void *)cluster_iodone;

                cbp->b_flags |= io_flags;

                cbp->b_lblkno = lblkno;
                cbp->b_blkno  = blkno;
                cbp->b_bcount = io_size;
                cbp->b_pagelist  = upl;
                cbp->b_uploffset = upl_offset;
                cbp->b_trans_next = (struct buf *)0;

                if (cbp->b_iostate = (void *)iostate)
                        /*
                         * caller wants to track the state of this
                         * io... bump the amount issued against this stream
                         */
                        iostate->io_issued += io_size;

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

                        cbp_tail->b_trans_next = cbp;

                (struct buf *)(cbp->b_trans_head) = cbp_head;

                upl_offset += io_size;

                if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
                        /*
                         * if we have no more I/O to issue or
                         * the current I/O we've prepared fully
                         * completes the last page in this request
                         * and it's either an ASYNC request or
                         * we've already accumulated more than 8 I/O's into
                         * this transaction and it's not an I/O directed to
                         * special DEVICE memory
                         * then go ahead and issue the I/O
                         */
                                cbp_head->b_flags |= B_NEED_IODONE;
                                cbp_head->b_real_bp = real_bp;

                                cbp_head->b_real_bp = (struct buf *)NULL;
                                /*
                                 * we're about to issue the last I/O for this upl
                                 * if this was a read to the eof and the eof doesn't
                                 * finish on a page boundary, then we need to zero-fill
                                 * the rest of the page....
                                 */
                                cbp_head->b_validend = zero_offset;

                                cbp_head->b_validend = 0;

                        if (flags & CL_THROTTLE) {
                                while (vp->v_numoutput >= async_throttle) {
                                        vp->v_flag |= VTHROTTLED;
                                        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);

                        for (cbp = cbp_head; cbp;) {
                                struct buf * cbp_next;

                                if (io_flags & B_WRITEINPROG)
                                        cbp->b_vp->v_numoutput++;

                                cbp_next = cbp->b_trans_next;

                                (void) VOP_STRATEGY(cbp);
                        if ( !(flags & CL_ASYNC)) {
                                for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)

                                if (error = cluster_iodone(cbp_head)) {
                                        if ((flags & CL_PAGEOUT) && (error == ENXIO))
                                                retval = 0;     /* drop the error */

                        cbp_head = (struct buf *)0;
                        cbp_tail = (struct buf *)0;

                for (cbp = cbp_head; cbp;) {
                        struct buf * cbp_next;

                        upl_offset -= cbp->b_bcount;
                        size       += cbp->b_bcount;
                        io_size    += cbp->b_bcount;

                        cbp_next = cbp->b_trans_next;
                        /*
                         * update the error condition for this stream
                         * since we never really issued the io
                         * just go ahead and adjust it back
                         */
                        if (iostate->io_error == 0)
                                iostate->io_error = error;
                        iostate->io_issued -= io_size;

                        if (iostate->io_wanted) {
                                /*
                                 * someone is waiting for the state of
                                 * this io stream to change
                                 */
                                iostate->io_wanted = 0;
                                wakeup((caddr_t)&iostate->io_wanted);

                pg_offset  = upl_offset & PAGE_MASK;
                abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

                if (flags & CL_COMMIT) {

                        if (flags & CL_PRESERVE) {
                                ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
                                                     UPL_COMMIT_FREE_ON_EMPTY);

                                if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
                                        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                                else if (flags & CL_PAGEIN)
                                        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                                        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

                                ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
                                     (int)upl, upl_offset - pg_offset, abort_size, error, 0);

                        real_bp->b_flags |= B_ERROR;
                        real_bp->b_error  = error;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
                     (int)f_offset, size, upl_offset, retval, 0);
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
        int           pages_in_prefetch;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
                     (int)f_offset, size, (int)filesize, 0, 0);

        if (f_offset >= filesize) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                             (int)f_offset, 0, 0, 0, 0);

        if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                size = (MAX_UPL_TRANSFER * PAGE_SIZE);
                size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if ((off_t)size > (filesize - f_offset))
                size = filesize - f_offset;

        pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

        advisory_read(vp, filesize, f_offset, size, devblocksize);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

        return (pages_in_prefetch);

cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
        int           size_of_prefetch;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
                     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

        if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                             vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);

        if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
                                 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                             vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

        if (e_lblkno < vp->v_maxra) {
                if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                                     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);

        r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
        f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

        size_of_prefetch = 0;

        ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

        if (size_of_prefetch) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                             vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);

        if (f_offset < filesize) {
                vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

                if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
                        vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

                size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

                if (size_of_prefetch)
                        vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
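
/*
 * Illustrative note, not part of the original file: with the policy above a
 * strictly sequential reader grows the read-ahead window as 1, 2, 4, ...
 * pages per call (vp->v_ralen doubling), capped at MAX_UPL_TRANSFER pages,
 * so each prefetch request issued through cluster_rd_prefetch() is at most
 * MAX_UPL_TRANSFER * PAGE_SIZE bytes.
 */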
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
        vm_offset_t   upl_offset;

        if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
                /*
                 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
                 * then we don't want to enforce this throttle... if we do, we can
                 * potentially deadlock since we're stalling the pageout thread at a time
                 * when the disk image might need additional memory (which won't be available
                 * if the pageout thread can't run)... instead we'll just depend on the throttle
                 * that the pageout thread now has in place to deal with external files
                 */
                local_flags = CL_PAGEOUT;
                local_flags = CL_PAGEOUT | CL_THROTTLE;

        if ((flags & UPL_IOSYNC) == 0)
                local_flags |= CL_ASYNC;
        if ((flags & UPL_NOCOMMIT) == 0)
                local_flags |= CL_COMMIT;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
                     (int)f_offset, size, (int)filesize, local_flags, 0);

        /*
         * If they didn't specify any I/O, then we are done...
         * we can't issue an abort because we don't know how
         * big the upl really is
         */
        if (vp->v_mount->mnt_flag & MNT_RDONLY) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

        /*
         * can't page-out to a negative offset
         * or if we're starting beyond the EOF
         * or if the file offset isn't page aligned
         * or the size requested isn't a multiple of PAGE_SIZE
         */
        if (f_offset < 0 || f_offset >= filesize ||
           (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

        max_size = filesize - f_offset;

        rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (size > rounded_size) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
                                        UPL_ABORT_FREE_ON_EMPTY);

        vp->v_flag |= VHASBEENPAGED;

        return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
                           local_flags, (struct buf *)0, (struct clios *)0));
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
        vm_offset_t   upl_offset;
        int           local_flags = 0;

        if (upl == NULL || size < 0)
                panic("cluster_pagein: NULL upl passed in");

        if ((flags & UPL_IOSYNC) == 0)
                local_flags |= CL_ASYNC;
        if ((flags & UPL_NOCOMMIT) == 0)
                local_flags |= CL_COMMIT;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
                     (int)f_offset, size, (int)filesize, local_flags, 0);

        /*
         * can't page-in from a negative offset
         * or if we're starting beyond the EOF
         * or if the file offset isn't page aligned
         * or the size requested isn't a multiple of PAGE_SIZE
         */
        if (f_offset < 0 || f_offset >= filesize ||
           (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

        max_size = filesize - f_offset;

        if (size < max_size)

        rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (size > rounded_size && (local_flags & CL_COMMIT))
                ubc_upl_abort_range(upl, upl_offset + rounded_size,
                                    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

        retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
                           local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

                b_lblkno = (int)(f_offset / PAGE_SIZE_64);
                e_lblkno = (int)
                        ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

                if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
                        /*
                         * we haven't read the last page of the file yet
                         * so let's try to read ahead if we're in
                         * a sequential access pattern
                         */
                        cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);

                vp->v_lastr = e_lblkno;
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
                     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

        if (bp->b_pagelist == (upl_t) 0)
                panic("cluster_bp: can't handle NULL upl yet\n");

        if (bp->b_flags & B_READ)
                flags = CL_ASYNC | CL_READ;

        f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
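
/*
 * Illustrative note, not part of the original file: cluster_write() below
 * repeatedly "clips" the uio so that a callee only sees part of the request,
 * then folds the callee's progress back into the caller's resid:
 *
 *      prev_resid = uio->uio_resid;
 *      uio->uio_resid = clip_size;
 *      retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
 *      uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
 */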
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)

        if (vp->v_flag & VHASBEENPAGED)
            /*
             * this vnode had pages cleaned to it by
             * the pager which indicates that either
             * it's not very 'hot', or the system is
             * being overwhelmed by a lot of dirty
             * data being delayed in the VM cache...
             * in either event, we'll push our remaining
             * delayed data at this point...  this will
             * be more efficient than paging out 1 page at
             * a time, and will also act as a throttle
             * by delaying this client from writing any
             * more data until all his delayed data has
             * at least been queued to the underlying driver.
             */
            vp->v_flag &= ~VHASBEENPAGED;

        if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
            /*
             * go do a write through the cache if one of the following is true....
             *   NOCACHE is not true
             *   there is no uio structure or it doesn't target USERSPACE
             */
            return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));

        while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
            /*
             * we know we have a resid, so this is safe
             * skip over any empty vectors
             */
            while (iov->iov_len == 0) {

            upl_size  = PAGE_SIZE;
            upl_flags = UPL_QUERY_OBJECT_TYPE;

            if ((vm_map_get_upl(current_map(),
                               (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                               &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
                /*
                 * the user app must have passed in an invalid address
                 */

            /*
             * We check every vector target but if it is physically
             * contiguous space, we skip the sanity checks.
             */
            if (upl_flags & UPL_PHYS_CONTIG)

                if (flags & IO_HEADZEROFILL)

                    flags &= ~IO_HEADZEROFILL;

                    if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))

                retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

                if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))

                    return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));

            else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
                /*
                 * we're here because we don't have a physically contiguous target buffer
                 * go do a write through the cache if one of the following is true....
                 *   the total xfer size is less than a page...
                 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
                 */
                return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));

            else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))

                if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
                    /*
                     * Bring the file offset up to a pagesize boundary
                     * this will also bring the base address to a page boundary
                     * since they both are currently on the same offset within a page
                     * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
                     * so the computed clip_size must always be less than the current uio_resid
                     */
                    clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

                    /*
                     * Fake the resid going into the cluster_write_x call
                     * and restore it on the way out.
                     */
                    prev_resid = uio->uio_resid;
                    uio->uio_resid = clip_size;
                    retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                    /*
                     * can't get both the file offset and the buffer offset aligned to a page boundary
                     * so fire an I/O through the cache for this entire vector
                     */
                    clip_size = iov->iov_len;
                    prev_resid = uio->uio_resid;
                    uio->uio_resid = clip_size;
                    retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                /*
                 * If we come in here, we know the offset into
                 * the file is on a pagesize boundary and the
                 * target buffer address is also on a page boundary
                 */
                max_io_size = newEOF - uio->uio_offset;
                clip_size = uio->uio_resid;
                if (iov->iov_len < clip_size)
                  clip_size = iov->iov_len;
                if (max_io_size < clip_size)
                  clip_size = max_io_size;

                if (clip_size < PAGE_SIZE)
                    /*
                     * Take care of tail end of write in this vector
                     */
                    prev_resid = uio->uio_resid;
                    uio->uio_resid = clip_size;
                    retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                    /* round clip_size down to a multiple of pagesize */
                    clip_size = clip_size & ~(PAGE_MASK);
                    prev_resid = uio->uio_resid;
                    uio->uio_resid = clip_size;
                    retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
                    if ((retval == 0) && uio->uio_resid)
                      retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)

        upl_page_info_t  *pl;
        vm_offset_t      upl_offset;
        int              upl_needed_size;
        int              force_data_sync;
        struct clios     iostate;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
                     (int)uio->uio_offset, (int)uio->uio_resid,
                     (int)newEOF, devblocksize, 0);

        /*
         * When we enter this routine, we know
         *  -- the offset into the file is on a pagesize boundary
         *  -- the resid is a page multiple
         *  -- the resid will not exceed iov_len
         */
        cluster_try_push(vp, newEOF, 0, 1);

        iostate.io_completed = 0;
        iostate.io_issued = 0;
        iostate.io_error = 0;
        iostate.io_wanted = 0;

        while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
                io_size = uio->uio_resid;

                if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                        io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

                upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
                upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
                             (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

                for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

                        upl_size = upl_needed_size;
                        upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                                    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

                        kret = vm_map_get_upl(current_map(),
                                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,

                        if (kret != KERN_SUCCESS) {
                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                                /*
                                 * cluster_nocopy_write: failed to get pagelist
                                 *
                                 * we may have already spun some portion of this request
                                 * off as async requests... we need to wait for the I/O
                                 * to complete before returning
                                 */
                                goto wait_for_writes;

                        pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
                        pages_in_pl = upl_size / PAGE_SIZE;

                        for (i = 0; i < pages_in_pl; i++) {
                                if (!upl_valid_page(pl, i))

                        if (i == pages_in_pl)

                        /*
                         * didn't get all the pages back that we
                         * needed... release this upl and try again
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

                if (force_data_sync >= 3) {
                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                                     i, pages_in_pl, upl_size, kret, 0);
                        /*
                         * for some reason, we couldn't acquire a hold on all
                         * the pages needed in the user's address space
                         *
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_writes;

                /*
                 * Consider the possibility that upl_size wasn't satisfied.
                 */
                if (upl_size != upl_needed_size)
                        io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                             (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);
                        /*
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_writes;

                /*
                 * Now look for pages already in the cache
                 * and throw them away.
                 * uio->uio_offset is page aligned within the file
                 * io_size is a multiple of PAGE_SIZE
                 */
                ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

                /*
                 * we want to push out these writes asynchronously so that we can overlap
                 * the preparation of the next I/O
                 * if there are already too many outstanding writes
                 * wait until some complete before issuing the next
                 */
                while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
                        iostate.io_wanted = 1;
                        tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);

                if (iostate.io_error) {
                        /*
                         * one of the earlier writes we issued ran into a hard error
                         * don't issue any more writes, cleanup the UPL
                         * that was just created but not used, then
                         * go wait for all writes that are part of this stream
                         * to complete before returning the error to the caller
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

                        goto wait_for_writes;

                io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
                             (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

                error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                                   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

                iov->iov_len    -= io_size;
                iov->iov_base   += io_size;
                uio->uio_resid  -= io_size;
                uio->uio_offset += io_size;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
                             (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);

        /*
         * make sure all async writes issued as part of this stream
         * have completed before we return
         */
        while (iostate.io_issued != iostate.io_completed) {
                iostate.io_wanted = 1;
                tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);

        if (iostate.io_error)
                error = iostate.io_error;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
                     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)

        upl_page_info_t *pl;
        vm_offset_t      upl_offset;
        int              upl_needed_size;

        /*
         * When we enter this routine, we know
         *  -- the resid will not exceed iov_len
         *  -- the vector target address is physically contiguous
         */
        cluster_try_push(vp, newEOF, 0, 1);

        io_size = iov->iov_len;
        upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
        upl_needed_size = upl_offset + io_size;

        upl_size = upl_needed_size;
        upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

        kret = vm_map_get_upl(current_map(),
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

        if (kret != KERN_SUCCESS) {
                /*
                 * cluster_phys_write: failed to get pagelist
                 * note: return kret here
                 */

        /*
         * Consider the possibility that upl_size wasn't satisfied.
         * This is a failure in the physical memory case.
         */
        if (upl_size < upl_needed_size) {
                kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

        pl = ubc_upl_pageinfo(upl);

        src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

        while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

                head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

                if (head_size > io_size)
                        head_size = io_size;

                error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

                        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                upl_offset += head_size;
                src_paddr  += head_size;
                io_size    -= head_size;

        tail_size = io_size & (devblocksize - 1);
        io_size  -= tail_size;

                /*
                 * issue a synchronous write to cluster_io
                 */
                error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                                   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);

                /*
                 * The cluster_io write completed successfully,
                 * update the uio structure
                 */
                uio->uio_resid  -= io_size;
                iov->iov_len    -= io_size;
                iov->iov_base   += io_size;
                uio->uio_offset += io_size;
                src_paddr       += io_size;

                        error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);

        /*
         * just release our hold on the physically contiguous
         * region without changing any state
         */
        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1604 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1614         upl_page_info_t *pl;
1616         vm_offset_t      upl_offset;
1630         long long        total_size;
1633         long long        zero_cnt1;
1635         daddr_t          start_blkno;
1641                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1642                              (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1644                 uio_resid = uio->uio_resid;
1646                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1647                              0, 0, (int)oldEOF, (int)newEOF, 0);
1654         if (flags & IO_HEADZEROFILL) {
1656                  * some filesystems (HFS is one) don't support unallocated holes within a file...
1657                  * so we zero fill the intervening space between the old EOF and the offset
1658                  * where the next chunk of real data begins.... ftruncate will also use this
1659                  * routine to zero fill to the new EOF when growing a file... in this case, the
1660                  * uio structure will not be provided
1663                         if (headOff < uio->uio_offset) {
1664                                 zero_cnt = uio->uio_offset - headOff;
1667                 } else if (headOff < newEOF) {
1668                         zero_cnt = newEOF - headOff;
1672         if (flags & IO_TAILZEROFILL) {
1674                         zero_off1 = uio->uio_offset + uio->uio_resid;
1676                         if (zero_off1 < tailOff)
1677                                 zero_cnt1 = tailOff - zero_off1;
1680         if (zero_cnt == 0 && uio == (struct uio *) 0) {
1681             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1682                          retval, 0, 0, 0, 0);
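        /*
         * At this point zero_cnt (with its starting offset zero_off, set in
         * lines not shown here) describes any head range to be zero filled for
         * IO_HEADZEROFILL, and zero_cnt1/zero_off1 describe any tail range for
         * IO_TAILZEROFILL; if there is no head fill and no uio was supplied,
         * the routine has nothing to do and returns above.
         */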
1686         while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1688                  * for this iteration of the loop, figure out where our starting point is
1691                         start_offset = (int)(zero_off & PAGE_MASK_64);
1692                         upl_f_offset = zero_off - start_offset;
1693                 } else if (uio_resid) {
1694                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1695                         upl_f_offset = uio->uio_offset - start_offset;
1697                         start_offset = (int)(zero_off1 & PAGE_MASK_64);
1698                         upl_f_offset = zero_off1 - start_offset;
1700                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1701                              (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1703                 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1704                         total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1706                 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1708                 if (uio && !(vp->v_flag & VNOCACHE_DATA) &&
1709                    (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) {
1711                          * assumption... total_size <= uio_resid
1712                          * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1714                         if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1715                                 total_size -= start_offset;
1716                         xfer_resid = total_size;
1718                         retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1723                         uio_resid   -= (total_size - xfer_resid);
1724                         total_size   = xfer_resid;
1725                         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1726                         upl_f_offset = uio->uio_offset - start_offset;
1728                         if (total_size == 0) {
1731                                          * the write did not finish on a page boundary
1732                                          * which will leave upl_f_offset pointing to the
1733                                          * beginning of the last page written instead of
1734                                          * the page beyond it... bump it in this case
1735                                          * so that the cluster code records the last page
1738                                         upl_f_offset += PAGE_SIZE_64;
1746                  * compute the size of the upl needed to encompass
1747                  * the requested write... limit each call to cluster_io
1748                  * to the maximum UPL size... cluster_io will clip if
1749                  * this exceeds the maximum io_size for the device,
1750                  * make sure to account for
1751                  * a starting offset that's not page aligned
1753                 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1755                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1756                         upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1758                 pages_in_upl = upl_size / PAGE_SIZE;
1759                 io_size      = upl_size - start_offset;
1761                 if ((long long)io_size > total_size)
1762                         io_size = total_size;
1764                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
1767                 kret = ubc_create_upl(vp,
1773                 if (kret != KERN_SUCCESS)
1774                         panic("cluster_write: failed to get pagelist");
1776                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
1777                         (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1779                 if (start_offset && !upl_valid_page(pl, 0)) {
1783                          * we're starting in the middle of the first page of the upl
1784                          * and the page isn't currently valid, so we're going to have
1785                          * to read it in first... this is a synchronous operation
1787                         read_size = PAGE_SIZE;
1789                         if ((upl_f_offset + read_size) > newEOF)
1790                                 read_size = newEOF - upl_f_offset;
1792                         retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1793                                             CL_READ, (struct buf *)0, (struct clios *)0);
1796                                  * we had an error during the read which causes us to abort
1797                                  * the current cluster_write request... before we do, we need
1798                                  * to release the rest of the pages in the upl without modifying
1799                                  * their state and mark the failed page in error
1801                                 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1802                                 ubc_upl_abort_range(upl, 0, upl_size,  UPL_ABORT_FREE_ON_EMPTY);
1804                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1805                                              (int)upl, 0, 0, retval, 0);
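                /*
                 * The synchronous pre-read above is what keeps a write that
                 * begins in the middle of a page from clobbering the bytes in
                 * front of it: if that first page is not already valid in the
                 * cache, it must be read in before the caller's data is copied
                 * over part of it.
                 */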
1809                 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1811                          * the last offset we're writing to in this upl does not end on a page
1812                          * boundary... if it's not beyond the old EOF, then we'll also need to
1813                          * pre-read this page in if it isn't already valid
1815                         upl_offset = upl_size - PAGE_SIZE;
1817                         if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1818                             !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1821                                 read_size = PAGE_SIZE;
1823                                 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1824                                         read_size = newEOF - (upl_f_offset + upl_offset);
1826                                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1827                                                     CL_READ, (struct buf *)0, (struct clios *)0);
1830                                          * we had an error during the read which causes us to abort
1831                                          * the current cluster_write request... before we do, we
1832                                          * need to release the rest of the pages in the upl without
1833                                          * modifying their state and mark the failed page in error
1835                                         ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1836                                         ubc_upl_abort_range(upl, 0,          upl_size,  UPL_ABORT_FREE_ON_EMPTY);
1838                                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1839                                                      (int)upl, 0, 0, retval, 0);
1844                 xfer_resid = io_size;
1845                 io_offset = start_offset;
1847                 while (zero_cnt && xfer_resid) {
1849                         if (zero_cnt < (long long)xfer_resid)
1850                                 bytes_to_zero = zero_cnt;
1852                                 bytes_to_zero = xfer_resid;
1854                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1855                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1859                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1860                                 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1862                                 if ( !upl_valid_page(pl, zero_pg_index)) {
1863                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1865                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1866                                            !upl_dirty_page(pl, zero_pg_index)) {
1867                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1870                         xfer_resid -= bytes_to_zero;
1871                         zero_cnt   -= bytes_to_zero;
1872                         zero_off   += bytes_to_zero;
1873                         io_offset  += bytes_to_zero;
1875                 if (xfer_resid && uio_resid) {
1876                         bytes_to_move = min(uio_resid, xfer_resid);
1878                         retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
1882                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1884                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1885                                              (int)upl, 0, 0, retval, 0);
1887                                 uio_resid  -= bytes_to_move;
1888                                 xfer_resid -= bytes_to_move;
1889                                 io_offset  += bytes_to_move;
1892                 while (xfer_resid && zero_cnt1 && retval == 0) {
1894                         if (zero_cnt1 < (long long)xfer_resid)
1895                                 bytes_to_zero = zero_cnt1;
1897                                 bytes_to_zero = xfer_resid;
1899                         if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1900                                 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1904                                 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1905                                 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1907                                 if ( !upl_valid_page(pl, zero_pg_index)) {
1908                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1909                                 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1910                                            !upl_dirty_page(pl, zero_pg_index)) {
1911                                         cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1914                         xfer_resid -= bytes_to_zero;
1915                         zero_cnt1  -= bytes_to_zero;
1916                         zero_off1  += bytes_to_zero;
1917                         io_offset  += bytes_to_zero;
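                /*
                 * Zeroing policy used by the two loops above: with neither
                 * IO_NOZEROVALID nor IO_NOZERODIRTY set, the range is zeroed
                 * unconditionally; otherwise a page is zeroed only if it is not
                 * valid in the upl, or, when IO_NOZERODIRTY alone is set, if it
                 * is valid but not yet dirty.
                 */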
1924                         io_size += start_offset;
1926                         if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1928                                  * if we're extending the file with this write
1929                                  * we'll zero fill the rest of the page so that
1930                                  * if the file gets extended again in such a way as to leave a
1931                                  * hole starting at this EOF, we'll have zeros in the correct spot
1933                                 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1935                         if (flags & IO_SYNC)
1937                                  * if the IO_SYNC flag is set then we need to
1938                                  * bypass any clusters and immediately issue
1944                          * calculate the last logical block number
1945                          * that this delayed I/O encompassed
1947                         last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1949                         if (vp->v_flag & VHASDIRTY) {
1951                                 if ( !(vp->v_flag & VNOCACHE_DATA)) {
1953                                          * we've fallen into the sparse
1954                                          * cluster method of delaying dirty pages
1955                                          * first, we need to release the upl if we hold one
1956                                          * since pages in it may be present in the sparse cluster map
1957                                          * and may span 2 separate buckets there... if they do and
1958                                          * we happen to have to flush a bucket to make room and it intersects
1959                                          * this upl, a deadlock may result on page BUSY
1962                                                 ubc_upl_commit_range(upl, 0, upl_size,
1963                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1965                                         sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
1970                                  * must have done cached writes that fell into
1971                                  * the sparse cluster mechanism... we've switched
1972                                  * to uncached writes on the file, so go ahead
1973                                  * and push whatever's in the sparse map
1974                                  * and switch back to normal clustering
1976                                  * see the comment above concerning a possible deadlock...
1979                                         ubc_upl_commit_range(upl, 0, upl_size,
1980                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1982                                          * setting upl_size to 0 keeps us from committing a
1983                                          * second time in the start_new_cluster path
1987                                 sparse_cluster_push(vp, ubc_getsize(vp), 1);
1990                                  * no clusters of either type present at this point
1991                                  * so just go directly to start_new_cluster since
1992                                  * we know we need to delay this I/O since we've
1993                                  * already released the pages back into the cache
1994                                  * to avoid the deadlock with sparse_cluster_push
1996                                 goto start_new_cluster;
2000                         if (vp->v_clen == 0)
2002                                  * no clusters currently present
2004                                 goto start_new_cluster;
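                        /*
                         * In both sparse-cluster paths above, the upl is
                         * committed (dirty, inactive) before sparse_cluster_add
                         * or sparse_cluster_push is called; as the comments in
                         * the code note, pages belonging to this upl may already
                         * be tracked in the sparse cluster map, and flushing a
                         * bucket that intersects the upl while its pages are
                         * BUSY could deadlock.
                         */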
2006                         for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
2008                                  * check each cluster that we currently hold
2009                                  * try to merge some or all of this write into
2010                                  * one or more of the existing clusters... if
2011                                  * any portion of the write remains, start a
2014                                 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
2016                                          * the current write starts at or after the current cluster
2018                                         if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2020                                                  * we have a write that fits entirely
2021                                                  * within the existing cluster limits
2023                                                 if (last_blkno > vp->v_clusters[cl_index].last_pg)
2025                                                          * update our idea of where the cluster ends
2027                                                         vp->v_clusters[cl_index].last_pg = last_blkno;
2030                                         if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2032                                                  * we have a write that starts in the middle of the current cluster
2033                                                  * but extends beyond the cluster's limit... we know this because
2034                                                  * of the previous checks
2035                                                  * we'll extend the current cluster to the max
2036                                                  * and update the start_blkno for the current write to reflect that
2037                                                  * the head of it was absorbed into this cluster...
2038                                                  * note that we'll always have a leftover tail in this case since
2039                                                  * full absorption would have occurred in the clause above
2041                                                 vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER;
2044                                                         int  start_pg_in_upl;
2046                                                         start_pg_in_upl = upl_f_offset / PAGE_SIZE_64;
2048                                                         if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) {
2049                                                                 intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE;
2051                                                                 ubc_upl_commit_range(upl, upl_offset, intersection,
2052                                                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2053                                                                 upl_f_offset += intersection;
2054                                                                 upl_offset   += intersection;
2055                                                                 upl_size     -= intersection;
2058                                                 start_blkno = vp->v_clusters[cl_index].last_pg;
2061                                          * we come here for the case where the current write starts
2062                                          * beyond the limit of the existing cluster or we have a leftover
2063                                          * tail after a partial absorption
2065                                          * in either case, we'll check the remaining clusters before
2066                                          * starting a new one
2070                                          * the current write starts in front of the cluster we're currently considering
2072                                         if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
2074                                                  * we can just merge the new request into
2075                                                  * this cluster and leave it in the cache
2076                                                  * since the resulting cluster is still
2077                                                  * less than the maximum allowable size
2079                                                 vp->v_clusters[cl_index].start_pg = start_blkno;
2081                                                 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
2083                                                          * the current write completely
2084                                                          * envelops the existing cluster and since
2085                                                          * each write is limited to at most MAX_UPL_TRANSFER bytes
2086                                                          * we can just use the start and last blocknos of the write
2087                                                          * to generate the cluster limits
2089                                                         vp->v_clusters[cl_index].last_pg = last_blkno;
2095                                          * if we were to combine this write with the current cluster
2096                                          * we would exceed the cluster size limit.... so,
2097                                          * let's see if there's any overlap of the new I/O with
2098                                          * the cluster we're currently considering... in fact, we'll
2099                                          * stretch the cluster out to its full limit and see if we
2100                                          * get an intersection with the current write
2103                                         if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
2105                                                  * the current write extends into the proposed cluster
2106                                                  * clip the length of the current write after first combining its
2107                                                  * tail with the newly shaped cluster
2109                                                 vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;
2112                                                         intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;
2114                                                         if (intersection > upl_size)
2116                                                                  * because the current write may consist of a number of pages found in the cache
2117                                                                  * which are not part of the UPL, we may have an intersection that exceeds
2118                                                                  * the size of the UPL that is also part of this write
2120                                                                 intersection = upl_size;
2122                                                         ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2123                                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2124                                                         upl_size -= intersection;
2126                                                 last_blkno = vp->v_clusters[cl_index].start_pg;
2129                                          * if we get here, there was no way to merge
2130                                          * any portion of this write with this cluster
2131                                          * or we could only merge part of it which
2132                                          * will leave a tail...
2133                                          * we'll check the remaining clusters before starting a new one
2137                         if (cl_index < vp->v_clen)
2139                                  * we found an existing cluster(s) that we
2140                                  * could entirely merge this I/O into
2144                         if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2146                                  * we didn't find an existing cluster to
2147                                  * merge into, but there's room to start
2150                                 goto start_new_cluster;
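                        /*
                         * Summary of the merge loop above: the write is folded
                         * into an existing cluster when (1) it fits entirely
                         * within that cluster's MAX_UPL_TRANSFER window, (2) its
                         * head overlaps the window and is absorbed, leaving
                         * start_blkno pointing at the unabsorbed tail, or (3) it
                         * starts in front of the cluster and either merges whole
                         * or donates its tail to a cluster stretched back to its
                         * full limit.  Anything left over falls through to the
                         * checks below.
                         */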
2153                          * no existing cluster to merge with and no
2154                          * room to start a new one... we'll try
2155                          * pushing one of the existing ones... if none of
2156                          * them are able to be pushed, we'll switch
2157                          * to the sparse cluster mechanism
2158                          * cluster_try_push updates v_clen to the
2159                          * number of remaining clusters... and
2160                          * returns the number of currently unused clusters
2162                         if (vp->v_flag & VNOCACHE_DATA)
2167                         if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
2169                                  * no more room in the normal cluster mechanism
2170                                  * so let's switch to the more expansive but expensive
2171                                  * sparse mechanism....
2172                                  * first, we need to release the upl if we hold one
2173                                  * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2174                                  * and may span 2 separate buckets there... if they do and
2175                                  * we happen to have to flush a bucket to make room and it intersects
2176                                  * this upl, a deadlock may result on page BUSY
2179                                         ubc_upl_commit_range(upl, upl_offset, upl_size,
2180                                                              UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2182                                 sparse_cluster_switch(vp, newEOF);
2183                                 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
2188                          * we pushed one cluster successfully, so we must be sequentially writing this file
2189                          * otherwise, we would have failed and fallen into the sparse cluster support
2190                          * so let's take the opportunity to push out additional clusters as long as we
2191                          * remain below the throttle... this will give us better I/O locality if we're
2192                          * in a copy loop (i.e. we won't jump back and forth between the read and write points)
2193                          * however, we don't want to push so much out that the write throttle kicks in and
2194                          * hangs this thread up until some of the I/O completes...
2196                         while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
2197                                 cluster_try_push(vp, newEOF, 0, 0);
2200                         if (vp->v_clen == 0)
2201                                 vp->v_ciosiz = devblocksize;
2203                         vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2204                         vp->v_clusters[vp->v_clen].last_pg  = last_blkno;
2209                                 ubc_upl_commit_range(upl, upl_offset, upl_size,
2210                                                      UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2214                          * in order to maintain some semblance of coherency with mapped writes
2215                          * we need to write the cluster back out as a multiple of the PAGESIZE
2216                          * unless the cluster encompasses the last page of the file... in this
2217                          * case we'll round out to the nearest device block boundary
2221                         if ((upl_f_offset + io_size) > newEOF) {
2222                                 io_size = newEOF - upl_f_offset;
2223                                 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2226                         if (flags & IO_SYNC)
2227                                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
2229                                 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;
2231                         if (vp->v_flag & VNOCACHE_DATA)
2232                                 io_flags |= CL_DUMP;
2234                         retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2235                                             io_flags, (struct buf *)0, (struct clios *)0);
2238         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2239                      retval, 0, uio_resid, 0, 0);
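/*
 * End of the write path above: the start_new_cluster case records the new
 * page range in vp->v_clusters[vp->v_clen] and commits the upl dirty and
 * inactive, while the IO_SYNC / no-delay case pushes the upl straight out
 * through cluster_io with CL_THROTTLE | CL_COMMIT | CL_AGE (plus CL_ASYNC,
 * or CL_DUMP when VNOCACHE_DATA is set).
 */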
2245 cluster_read(vp, uio, filesize, devblocksize, flags)
2262         if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2265              * go do a read through the cache if one of the following is true....
2266              *   NOCACHE is not true
2267              *   the uio request doesn't target USERSPACE
2269             return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
2272         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2275              * we know we have a resid, so this is safe
2276              * skip over any empty vectors
2280             while (iov->iov_len == 0) {
2285             upl_size  = PAGE_SIZE;
2286             upl_flags = UPL_QUERY_OBJECT_TYPE;
2288             if ((vm_map_get_upl(current_map(),
2289                                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2290                                &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
2293                  * the user app must have passed in an invalid address
2299              * We check every vector target but if it is physically
2300              * contiguous space, we skip the sanity checks.
2302             if (upl_flags & UPL_PHYS_CONTIG)
2304                 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2306             else if (uio->uio_resid < PAGE_SIZE)
2309                  * we're here because we don't have a physically contiguous target buffer
2310                  * go do a read through the cache if
2311                  *   the total xfer size is less than a page...
2313                 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
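                /*
                 * The dispatch above sends physically contiguous targets to
                 * cluster_phys_read and anything smaller than a page back
                 * through the cached path; the alignment checks that follow
                 * decide between clipping the request to a page boundary and
                 * sending the whole vector through the cache.
                 */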
2315             else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
2317                 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
2320                      * Bring the file offset read up to a pagesize boundary
2321                      * this will also bring the base address to a page boundary
2322                      * since they both are currently on the same offset within a page
2323                      * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2324                      * so the computed clip_size must always be less than the current uio_resid
2326                     clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2329                      * Fake the resid going into the cluster_read_x call
2330                      * and restore it on the way out.
2332                     prev_resid = uio->uio_resid;
2333                     uio->uio_resid = clip_size;
2334                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2335                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2340                      * can't get both the file offset and the buffer offset aligned to a page boundary
2341                      * so fire an I/O through the cache for this entire vector
2343                     clip_size = iov->iov_len;
2344                     prev_resid = uio->uio_resid;
2345                     uio->uio_resid = clip_size;
2346                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2347                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2353                  * If we come in here, we know the offset into
2354                  * the file is on a pagesize boundary
2357                 max_io_size = filesize - uio->uio_offset;
2358                 clip_size = uio->uio_resid;
2359                 if (iov->iov_len < clip_size)
2360                   clip_size = iov->iov_len;
2361                 if (max_io_size < clip_size)
2362                   clip_size = (int)max_io_size;
2364                 if (clip_size < PAGE_SIZE)
2367                      * Take care of the tail end of the read in this vector.
2369                     prev_resid = uio->uio_resid;
2370                     uio->uio_resid = clip_size;
2371                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2372                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2376                     /* round clip_size down to a multiple of pagesize */
2377                     clip_size = clip_size & ~(PAGE_MASK);
2378                     prev_resid = uio->uio_resid;
2379                     uio->uio_resid = clip_size;
2380                     retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2381                     if ((retval==0) && uio->uio_resid)
2382                       retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2383                     uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
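                    /*
                     * Each branch above uses the same pattern: save uio_resid
                     * in prev_resid, clip it to the piece this path should
                     * handle, call cluster_read_x or cluster_nocopy_read, then
                     * restore the resid minus whatever was actually consumed,
                     * so the outer loop can continue with the remainder of the
                     * request.
                     */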
2392 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2399         upl_page_info_t *pl;
2401         vm_offset_t      upl_offset;
2410         off_t            last_ioread_offset;
2411         off_t            last_request_offset;
2412         u_int            size_of_prefetch;
2419         struct clios     iostate;
2420         u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2421         u_int            rd_ahead_enabled = 1;
2422         u_int            prefetch_enabled = 1;
2425         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2426                      (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2428         if (cluster_hard_throttle_on(vp)) {
2429                 rd_ahead_enabled = 0;
2430                 prefetch_enabled = 0;
2432                 max_rd_size = HARD_THROTTLE_MAXSIZE;
2434         if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
2435                 rd_ahead_enabled = 0;
2437         last_request_offset = uio->uio_offset + uio->uio_resid;
2439         if (last_request_offset > filesize)
2440                 last_request_offset = filesize;
2441         b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
2442         e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);
2444         if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
2446                  * determine if we already have a read-ahead in the pipe courtesy of the
2447                  * last read system call that was issued...
2448                  * if so, pick up its extent to determine where we should start
2449                  * with respect to any read-ahead that might be necessary to
2450                  * garner all the data needed to complete this read system call
2452                 last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2454                 if (last_ioread_offset < uio->uio_offset)
2455                         last_ioread_offset = (off_t)0;
2456                 else if (last_ioread_offset > last_request_offset)
2457                         last_ioread_offset = last_request_offset;
2459                 last_ioread_offset = (off_t)0;
2461         while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2463                  * compute the size of the upl needed to encompass
2464                  * the requested read... limit each call to cluster_io
2465                  * to the maximum UPL size... cluster_io will clip if
2466                  * this exceeds the maximum io_size for the device,
2467                  * make sure to account for
2468                  * a starting offset that's not page aligned
2470                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2471                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2472                 max_size     = filesize - uio->uio_offset;
2474                 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2475                         io_size = uio->uio_resid;
2479                 if (!(vp->v_flag & VNOCACHE_DATA)) {
2486                                  * if we keep finding the pages we need already in the cache, then
2487                                  * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2488                                  * to determine that we have all the pages we need... once we miss in
2489                                  * the cache and have issued an I/O, then we'll assume that we're likely
2490                                  * to continue to miss in the cache and it's to our advantage to try and prefetch
2492                                 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2493                                         if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2495                                                  * we've already issued I/O for this request and
2496                                                  * there's still work to do and
2497                                                  * our prefetch stream is running dry, so issue a
2498                                                  * pre-fetch I/O... the I/O latency will overlap
2499                                                  * with the copying of the data
2501                                                 if (size_of_prefetch > max_rd_size)
2502                                                         size_of_prefetch = max_rd_size;
2504                                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2506                                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2508                                                 if (last_ioread_offset > last_request_offset)
2509                                                         last_ioread_offset = last_request_offset;
2513                                  * limit the size of the copy we're about to do so that
2514                                  * we can notice that our I/O pipe is running dry and
2515                                  * get the next I/O issued before it does go dry
2517                                 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2518                                         io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2522                                 io_requested = io_resid;
2524                                 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2526                                 io_size -= (io_requested - io_resid);
2528                                 if (retval || io_resid)
2530                                          * if we run into a real error or
2531                                          * a page that is not in the cache
2532                                          * we need to leave streaming mode
2536                                 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2538                                          * we've already finished the I/O for this read request
2539                                          * let's see if we should do a read-ahead
2541                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2547                                 if (e_lblkno < vp->v_lastr)
2549                                 vp->v_lastr = e_lblkno;
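                                /*
                                 * The cached path above deliberately copies at
                                 * most (MAX_UPL_TRANSFER * PAGE_SIZE) / 4 per
                                 * pass once a prefetch stream exists, so it can
                                 * notice the pipe running dry and issue the
                                 * next cluster_rd_prefetch before the copy
                                 * catches up with the outstanding I/O.
                                 */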
2553                 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2554                 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2555                 max_size     = filesize - uio->uio_offset;
2557                 if (io_size > max_rd_size)
2558                         io_size = max_rd_size;
2560                 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2562                 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2563                         upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2564                 pages_in_upl = upl_size / PAGE_SIZE;
2566                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2567                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2569                 kret = ubc_create_upl(vp,
2575                 if (kret != KERN_SUCCESS)
2576                         panic("cluster_read: failed to get pagelist");
2578                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2579                              (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2582                  * scan from the beginning of the upl looking for the first
2583                  * non-valid page.... this will become the first page in
2584                  * the request we're going to make to 'cluster_io'... if all
2585                  * of the pages are valid, we won't call through to 'cluster_io'
2587                 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2588                         if (!upl_valid_page(pl, start_pg))
2593                  * scan from the starting invalid page looking for a valid
2594                  * page before the end of the upl is reached, if we
2595                  * find one, then it will be the last page of the request to
2598                 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2599                         if (upl_valid_page(pl, last_pg))
2602                 iostate.io_completed = 0;
2603                 iostate.io_issued = 0;
2604                 iostate.io_error = 0;
2605                 iostate.io_wanted = 0;
2607                 if (start_pg < last_pg) {
2609                          * we found a range of 'invalid' pages that must be filled
2610                          * if the last page in this range is the last page of the file
2611                          * we may have to clip the size of it to keep from reading past
2612                          * the end of the last physical block associated with the file
2614                         upl_offset = start_pg * PAGE_SIZE;
2615                         io_size    = (last_pg - start_pg) * PAGE_SIZE;
2617                         if ((upl_f_offset + upl_offset + io_size) > filesize)
2618                                 io_size = filesize - (upl_f_offset + upl_offset);
2621                          * issue an asynchronous read to cluster_io
2624                         error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2625                                            io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
2629                          * if the read completed successfully, or there was no I/O request
2630                          * issued, then copy the data into user land via 'cluster_upl_copy_data'
2631                          * we'll first add on any 'valid'
2632                          * pages that were present in the upl when we acquired it.
2636                         for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2637                                 if (!upl_valid_page(pl, uio_last))
2641                          * compute size to transfer this round, if uio->uio_resid is
2642                          * still non-zero after this attempt, we'll loop around and
2643                          * set up for another I/O.
2645                         val_size = (uio_last * PAGE_SIZE) - start_offset;
2647                         if (val_size > max_size)
2648                                 val_size = max_size;
2650                         if (val_size > uio->uio_resid)
2651                                 val_size = uio->uio_resid;
2653                         if (last_ioread_offset == 0)
2654                                 last_ioread_offset = uio->uio_offset + val_size;
2656                         if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2658                                  * if there's still I/O left to do for this request, and...
2659                                  * we're not in hard throttle mode, then issue a
2660                                  * pre-fetch I/O... the I/O latency will overlap
2661                                  * with the copying of the data
2663                                 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2665                                 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2667                                 if (last_ioread_offset > last_request_offset)
2668                                         last_ioread_offset = last_request_offset;
2670                         } else if ((uio->uio_offset + val_size) == last_request_offset) {
2672                                  * this transfer will finish this request, so...
2673                                  * let's try to read ahead if we're in
2674                                  * a sequential access pattern and we haven't
2675                                  * explicitly disabled it
2677                                 if (rd_ahead_enabled)
2678                                         cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2680                                 if (e_lblkno < vp->v_lastr)
2682                                 vp->v_lastr = e_lblkno;
2684                         while (iostate.io_issued != iostate.io_completed) {
2685                                 iostate.io_wanted = 1;
2686                                 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
2688                         if (iostate.io_error)
2689                                 error = iostate.io_error;
2691                                 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
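                /*
                 * The clios 'iostate' above is the rendezvous with the
                 * asynchronous cluster_io request: the loop sleeps on io_wanted
                 * until io_issued matches io_completed, picks up any io_error,
                 * and only then copies the data out to the caller with
                 * cluster_copy_upl_data.
                 */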
2693                 if (start_pg < last_pg) {
2695                          * compute the range of pages that we actually issued an I/O for
2696                          * and either commit them as valid if the I/O succeeded
2697                          * or abort them if the I/O failed
2699                         io_size = (last_pg - start_pg) * PAGE_SIZE;
2701                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2702                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2704                         if (error || (vp->v_flag & VNOCACHE_DATA))
2705                                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2706                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2708                                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2709                                                      UPL_COMMIT_CLEAR_DIRTY |
2710                                                      UPL_COMMIT_FREE_ON_EMPTY |
2711                                                      UPL_COMMIT_INACTIVATE);
2713                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2714                                      (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2716                 if ((last_pg - start_pg) < pages_in_upl) {
2721                          * the set of pages that we issued an I/O for did not encompass
2722                          * the entire upl... so just release these without modifying
2726                                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2728                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2729                                              (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2733                                          * we found some already valid pages at the beginning of
2734                                          * the upl commit these back to the inactive list with
2737                                         for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2738                                                 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2739                                                                    | UPL_COMMIT_INACTIVATE;
2741                                                 if (upl_dirty_page(pl, cur_pg))
2742                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2744                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2745                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2746                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2748                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2749                                                                 PAGE_SIZE, commit_flags);
2752                                 if (last_pg < uio_last) {
2754                                          * we found some already valid pages immediately after the
2755                                          * pages we issued I/O for, commit these back to the
2756                                          * inactive list with reference cleared
2758                                         for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2759                                                 commit_flags =  UPL_COMMIT_FREE_ON_EMPTY
2760                                                                                 | UPL_COMMIT_INACTIVATE;
2762                                                 if (upl_dirty_page(pl, cur_pg))
2763                                                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2765                                                 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2766                                                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2767                                                                 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2769                                                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2770                                                                 PAGE_SIZE, commit_flags);
2773                                 if (uio_last < pages_in_upl) {
2775                                          * there were some invalid pages beyond the valid pages
2776                                          * that we didn't issue an I/O for, just release them
2779                                         ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2780                                                             (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2783                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2784                                         (int)upl, -1, -1, 0, 0);
2790         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2791                      (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
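        /*
         * Commit/abort policy used above: pages we actually read are dumped on
         * error or when VNOCACHE_DATA is set and otherwise committed clean and
         * inactive; already-valid pages on either side of the I/O range are
         * committed back (dirty ones keep their dirty bit) or dumped in the
         * no-cache case, and untouched invalid pages past uio_last are simply
         * released.
         */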
static int
cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t  *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	int              force_data_sync;
	struct iovec     *iov;
	struct clios     iostate;
	u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
	u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	iov = uio->uio_iov;

	if (cluster_hard_throttle_on(vp)) {
		max_rd_size  = HARD_THROTTLE_MAXSIZE;
		max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
	}
	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {

		max_io_size = filesize - uio->uio_offset;

		if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
			io_size = max_io_size;
		else
			io_size = uio->uio_resid;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.
		 */
		retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

		if (retval) {
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;
		}
		/*
		 * If we are already finished with this read, then return
		 */
		if (io_size == 0) {
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;
		}
		max_io_size = io_size;

		if (max_io_size > max_rd_size)
			max_io_size = max_rd_size;

		io_size = 0;

		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

		if (io_size == 0) {
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;
		}
		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * cluster_nocopy_read: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_reads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_reads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			goto wait_for_reads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_reads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				    io_size, devblocksize,
				    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
				    (struct buf *)0, &iostate);

		/*
		 * update the uio structure
		 */
		iov->iov_base   += io_size;
		iov->iov_len    -= io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
	}

wait_for_reads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
	}
	if (iostate.io_error)
		retval = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);

	return (retval);
}
static int
cluster_phys_read(vp, uio, filesize, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	struct iovec     *iov;
	struct clios     iostate;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	iov = uio->uio_iov;

	max_size = filesize - uio->uio_offset;

	if (max_size > (off_t)((unsigned int)iov->iov_len))
		io_size = iov->iov_len;
	else
		io_size = max_size;

	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_read: failed to get pagelist
		 */
		return(EINVAL);
	}
	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

		return(EINVAL);
	}
	pl = ubc_upl_pageinfo(upl);

	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		int   head_size;

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);

		if (error) {
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

			return(EINVAL);
		}
		upl_offset += head_size;
		dst_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	while (io_size && error == 0) {
		int  xsize;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
		else
			xsize = io_size;
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
		}
		error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
				   CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
				   (struct buf *)0, &iostate);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio->uio_resid  -= xsize;
			iov->iov_len    -= xsize;
			iov->iov_base   += xsize;
			uio->uio_offset += xsize;
			dst_paddr       += xsize;
			upl_offset      += xsize;
			io_size         -= xsize;
		}
	}
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
	}
	if (iostate.io_error) {
		error = iostate.io_error;
	}
	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);

	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
int
advisory_read(vp, filesize, f_offset, resid, devblocksize)
	struct vnode *vp;
	off_t         filesize;
	off_t         f_offset;
	int           resid;
	int           devblocksize;
{
	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	if (!UBCINFOEXISTS(vp))
		return(EINVAL);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
		     (int)f_offset, resid, (int)filesize, devblocksize, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size     = filesize - f_offset;

		if (resid < max_size)
			io_size = resid;
		else
			io_size = max_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

		if (skip_range) {
			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;
			resid    -= io_size;

			if (skip_range == upl_size)
				continue;
			/*
			 * have to issue some real I/O
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			start_offset = 0;
			upl_f_offset += skip_range;
			upl_size     -= skip_range;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
		if (kret != KERN_SUCCESS)
			return(retval);
		issued_io = 0;

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed
		 * upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg))
				break;
		}
		pages_in_upl = last_pg + 1;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		for (last_pg = 0; last_pg < pages_in_upl; ) {
			/*
			 * scan from the beginning of the upl looking for the first
			 * page that is present.... this will become the first page in
			 * the request we're going to make to 'cluster_io'... if all
			 * of the pages are absent, we won't call through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg))
					break;
			}
			/*
			 * scan from the starting present page looking for an absent
			 * page before the end of the upl is reached, if we
			 * find one, then it will terminate the range of pages being
			 * presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg))
					break;
			}
			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size    = (last_pg - start_pg) * PAGE_SIZE;

				if ((upl_f_offset + upl_offset + io_size) > filesize)
					io_size = filesize - (upl_f_offset + upl_offset);

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
						    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);

				issued_io = 1;
			}
		}
		if (issued_io == 0)
			ubc_upl_abort(upl, 0);

		io_size = upl_size - start_offset;

		if (io_size > resid)
			io_size = resid;
		f_offset += io_size;
		resid    -= io_size;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
		     (int)f_offset, resid, retval, 0, 0);

	return(retval);
}
int
cluster_push(vp)
	struct vnode *vp;
{
	int  retval;

	if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))
		return(0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		sparse_cluster_push(vp, ubc_getsize(vp), 1);

		retval = 1;
	} else
		retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);

	return (retval);
}
int
cluster_release(vp)
	struct vnode *vp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		vfs_drt_control(&(vp->v_scmap), 0);

		vp->v_flag &= ~VHASDIRTY;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
}
static int
cluster_try_push(vp, EOF, can_delay, push_all)
	struct vnode *vp;
	off_t  EOF;
	int    can_delay;
	int    push_all;
{
	int cl_index;
	int cl_index1;
	int min_index;
	int cl_len;
	int cl_pushed;
	int i;
	struct v_cluster l_clusters[MAX_CLUSTERS];

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear vp->v_clen so that new clusters can
	 * be developed
	 */
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
			if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
				continue;
			if (min_index == -1)
				min_index = cl_index1;
			else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
				min_index = cl_index1;
		}
		if (min_index == -1)
			break;
		l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
		l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;

		vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
	}
	cl_len = cl_index;
	vp->v_clen = 0;

	cl_pushed = 0;

	if (can_delay && cl_len == MAX_CLUSTERS) {
		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with can_delay true...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through up, to but not including the last one...
		 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
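		/*
		 * Sketch of the check below (illustrative, not part of the
		 * original comments): with the clusters sorted by start_pg,
		 * a purely sequential writer satisfies, for every i except
		 * possibly the last cluster,
		 *
		 *	l_clusters[i].last_pg - l_clusters[i].start_pg == MAX_UPL_TRANSFER
		 *	l_clusters[i].last_pg == l_clusters[i + 1].start_pg
		 *
		 * Any short cluster or hole in the middle fails one of these
		 * tests and we skip the push, which steers the vnode toward
		 * the sparse-cluster mechanism instead.
		 */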
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
				goto dont_try;
			if (l_clusters[i].last_pg != l_clusters[i + 1].start_pg)
				goto dont_try;
		}
	}
	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		/*
		 * try to push each cluster in turn...  cluster_push_x may not
		 * push the cluster if can_delay is TRUE and the cluster doesn't
		 * meet the critera for an immediate push
		 */
		if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
			l_clusters[cl_index].start_pg = 0;
			l_clusters[cl_index].last_pg  = 0;

			cl_pushed++;

			if (push_all == 0)
				break;
		}
	}
dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones (I don't think this can happen because
			 * I'm holding the lock, but just in case)... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * first collect the new clusters sitting in the vp
			 */
			sparse_cluster_switch(vp, EOF);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;
				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(vp, EOF);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by vp->v_clen
			 */
			for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;

				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;
		}
	}
	return(MAX_CLUSTERS - vp->v_clen);
}
static int
cluster_push_x(vp, EOF, first, last, can_delay)
	struct vnode *vp;
	off_t   EOF;
	daddr_t first;
	daddr_t last;
	int     can_delay;
{
	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
		     vp->v_clen, first, last, EOF, 0);

	if ((pages_in_upl = last - first) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return (1);
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = ((off_t)first) * PAGE_SIZE_64;

	if (upl_f_offset + upl_size >= EOF) {

		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return(1);
		}
		size = EOF - upl_f_offset;

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else
		size = upl_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	if (vp->v_flag & VNOCACHE_DATA)
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	else
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

	kret = ubc_create_upl(vp,
			      upl_f_offset,
			      upl_size,
			      &upl,
			      &pl,
			      upl_flags);
	if (kret != KERN_SUCCESS)
		panic("cluster_push: failed to get pagelist");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg))
			break;
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return(1);
	}
	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))
				break;
			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
		}
		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */
			break;
		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))
				break;
		}
		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		if (vp->v_flag & VNOCACHE_DATA)
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
		else
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;

		cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);

	return(1);
}
sparse_cluster_switch(struct vnode *vp, off_t EOF)
{
	int  cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if ( !(vp->v_flag & VHASDIRTY)) {
		vp->v_flag |= VHASDIRTY;
	}
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
		int  flags;
		int  start_pg;

		for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {

			if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY)
					sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
			}
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
}
sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
{
	daddr_t first;
	daddr_t last;
	off_t   offset;
	u_int   length;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);

	if (push_all)
		vfs_drt_control(&(vp->v_scmap), 1);

	for (;;) {
		if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
			vp->v_flag &= ~VHASDIRTY;
			break;
		}
		first = (daddr_t)(offset / PAGE_SIZE_64);
		last  = (daddr_t)((offset + length) / PAGE_SIZE_64);

		cluster_push_x(vp, EOF, first, last, 0);

		vp->v_scdirty -= (last - first);

		if (push_all == 0)
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
}
sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
{
	u_int  new_dirty;
	u_int  length;
	off_t  offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);

	offset = (off_t)first * PAGE_SIZE_64;
	length = (last - first) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		vp->v_scdirty += new_dirty;

		sparse_cluster_push(vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	vp->v_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
}
static int
cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
{
	struct iovec     *iov;
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t         ubc_paddr;
	kern_return_t    kret;
	int              error = 0;

	iov = uio->uio_iov;

	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,
			      PAGE_SIZE,
			      &upl,
			      &pl,
			      UPL_SET_LITE);

	if (kret != KERN_SUCCESS)
		return(EINVAL);

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   CL_READ, (struct buf *)0, (struct clios *)0);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return(error);
		}
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

/*
 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
 *	way to do so without exporting them to kexts as well.
 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);	/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   0, (struct buf *)0, (struct clios *)0);
	}
	if (error == 0) {
		uio->uio_offset += xsize;
		iov->iov_base   += xsize;
		iov->iov_len    -= xsize;
		uio->uio_resid  -= xsize;
	}
	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
{
	upl_page_info_t *pl;
	int       segflg;
	int       pg_index;
	int       pg_offset;
	int       csize;
	int       retval = 0;
	boolean_t funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);

	if (xsize >= (16 * 1024))
		funnel_state = thread_funnel_set(kernel_flock, FALSE);

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	uio->uio_segflg = segflg;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);

	return (retval);
}
int
cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	int       segflg;
	int       io_size;
	int       xsize;
	int       start_offset;
	off_t     f_offset;
	int       retval = 0;
	memory_object_control_t  control;
	int       op_flags = UPL_POP_SET | UPL_POP_BUSY;
	boolean_t funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);

		return(0);
	}
	if (mark_dirty)
		op_flags |= UPL_POP_DIRTY;

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	io_size      = *io_resid;
	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	f_offset     = uio->uio_offset - start_offset;
	xsize        = min(PAGE_SIZE - start_offset, io_size);

	while (io_size && retval == 0) {
		ppnum_t pgframe;

		if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
			break;

		if (funnel_state == FALSE && io_size >= (16 * 1024))
			funnel_state = thread_funnel_set(kernel_flock, FALSE);

		retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);

		ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

		io_size     -= xsize;
		start_offset = 0;
		f_offset     = uio->uio_offset;
		xsize        = min(PAGE_SIZE, io_size);
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);

	return(retval);
}
int
is_file_clean(struct vnode *vp, off_t filesize)
{
	off_t f_offset;
	int   flags;
	int   total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty)
		return(EINVAL);

	return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */
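
/*
 * Illustrative call flow (a sketch, not part of the original interface
 * documentation): the sparse-cluster code above is the only client.
 *
 *	vfs_drt_mark_pages(&vp->v_scmap, offset, length, &new_dirty)
 *		records freshly dirtied pages (see sparse_cluster_add)
 *	vfs_drt_get_cluster(&vp->v_scmap, &offset, &length)
 *		hands back one contiguous dirty run to push (see sparse_cluster_push)
 *	vfs_drt_control(&vp->v_scmap, op)
 *		frees the map or resets its cleaning scan state, depending on op
 */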
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256
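
/*
 * Worked numbers (illustrative, assuming the usual 4KB PAGE_SIZE): one
 * hashtable entry covers 256 * 4096 bytes = 1MB of the file, and its
 * bitvector needs 256 / 32 = 8 u_int32_t words, i.e. 32 bytes.
 */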
/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
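
/*
 * Example (illustrative): with DRT_BITVECTOR_PAGES * PAGE_SIZE == 1 << 20,
 * the mask clears the low 20 bits, so DRT_ALIGN_ADDRESS(0x123456) yields
 * 0x100000, the 1MB-aligned start of the region that offset falls in.
 */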
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)					\
	do {							\
		(scm)->scm_hashtable[(i)].dhe_control = 0;	\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
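
/*
 * Packing sketch (illustrative, not from the original comments):
 * dhe_control carries the DRT_ADDRESS_MASK-aligned file offset of the
 * region in its high bits and the dirty-page count in the low nine bits
 * (DRT_HASH_COUNT_MASK == 0x1ff).  Counts 0..256 are real occupancy
 * values; the sentinel 0x1ff marks a vacant bucket, which is why
 * DRT_HASH_VACATE simply stores the mask itself as the count.
 */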
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
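
/*
 * Checking the arithmetic (illustrative): each entry is an 8-byte
 * dhe_control plus a 32-byte bitvector, i.e. 40 bytes.  23 * 40 = 920,
 * leaving 1024 - 920 = 104 bytes spare; 401 * 40 = 16040, leaving
 * 16384 - 16040 = 344 bytes spare.  Both 23 and 401 are prime, as the
 * comment above requires.
 */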
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
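
/*
 * Example (illustrative): page 37 within an entry's 256-page window lives
 * in word 37 / 32 == 1 at bit position 37 % 32 == 5, so
 * DRT_HASH_SET_BIT(scm, i, 37) ORs (1 << 5) into dhe_bitvector[1].
 */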
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
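
/*
 * Lookup sketch (illustrative): an aligned offset hashes to
 * DRT_HASH(cmap, offset); on a collision the search and insert paths
 * below walk the ring linearly with DRT_HASH_NEXT() until they reach a
 * bucket holding the matching address or a vacant bucket.
 */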
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */
static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	int		nsize, i, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/* if the ring is nearly full */
			if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}
	/*
	 * Allocate and initialise the new map.
	 */
	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}
	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}
	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return(KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
		  (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index, i, tries;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	int		index, i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}
		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
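#if 0
/*
 * Illustrative sketch, compiled out and not part of the original source:
 * one way a caller might drive the public interface above.  The helper name
 * vfs_drt_example_usage and the write-back placeholder are hypothetical; the
 * map itself is created lazily by vfs_drt_mark_pages() and released by the
 * DRT code once vfs_drt_get_cluster() finds no dirty pages left.
 */
static void
vfs_drt_example_usage(void)
{
	void	*drt_map = NULL;	/* opaque storage managed by the DRT code */
	off_t	offset;
	u_int	length;
	int	newly_dirty;

	/* note that the first four pages of the file are now dirty */
	vfs_drt_mark_pages(&drt_map, (off_t)0, 4 * PAGE_SIZE, &newly_dirty);

	/* drain every run of dirty pages */
	while (vfs_drt_get_cluster(&drt_map, &offset, &length) == KERN_SUCCESS) {
		/* write back [offset, offset + length) here */
	}
}
#endif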
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	int		index, i, j, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;
		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/*  didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... the hashtable is empty;
	 * emit stats into the trace buffer and then free the map.
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
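/*
 * Worked example of the bitvector scan above (illustrative, not part of the
 * original source): if an entry's bitvector has only pages 2-5 set, the first
 * loop stops with fs == 2, the second loop counts the run so ls == 4, and the
 * returned cluster is
 *	offset = DRT_HASH_GET_ADDRESS(cmap, index) + 2 * PAGE_SIZE
 *	length = 4 * PAGE_SIZE
 * after which those four bits are cleared via vfs_drt_do_mark_pages().
 */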
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		/* reset the last-cleaned index */
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int	index, i;
	int	bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}