/*
 * Copyright (c) 2003-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 *	1) ramesh is looking into how to replace taking a reference on
 *	   the user's map (vm_map_reference()) since it is believed that
 *	   would not hold the process for us.
 *	2) david is looking into a way for us to set the priority of the
 *	   worker threads to match that of the user's thread when the
 *	   async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <sys/kdebug.h>
#define AIO_work_queued					1
#define AIO_worker_wake					2
#define AIO_completion_sig				3
#define AIO_completion_cleanup_wait		4
#define AIO_completion_cleanup_wake		5
#define AIO_completion_suspend_wake		6
#define AIO_fsync_delay					7

#define AIO_cancel_async_workq			11
#define AIO_cancel_sync_workq			12
#define AIO_cancel_activeq				13
#define AIO_cancel_doneq				14

#define AIO_error_val					61
#define AIO_error_activeq				62
#define AIO_error_workq					63

#define AIO_return_val					71
#define AIO_return_activeq				72
#define AIO_return_workq				73

#define AIO_exit_sleep					91

#define AIO_close_sleep					101

#define AIO_suspend						110
#define AIO_suspend_sleep				111

#define AIO_worker_thread				120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that is our
 * trigger to release aio resources.
 */
struct aio_anchor_cb
{
    int                             aio_async_workq_count;  /* entries on aio_async_workq */
    int                             lio_sync_workq_count;   /* entries on lio_sync_workq */
    int                             aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
    int                             aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
    TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
};
typedef struct aio_anchor_cb aio_anchor_cb;
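
/*
 * Illustrative sketch (not part of this file): the TAILQ pattern used to move
 * an aio_workq_entry between the anchor's work queue and a per process queue,
 * mirroring the workq -> activeq transition described above.  The names
 * "workq", "activeq" and "my_entryp" are assumptions made for the example;
 * the real code always does this while holding AIO_LOCK.
 *
 *	TAILQ_HEAD( , aio_workq_entry )	workq;		// queued, not yet started
 *	TAILQ_HEAD( , aio_workq_entry )	activeq;	// picked up by a worker thread
 *	aio_workq_entry					*my_entryp;
 *
 *	TAILQ_INIT( &workq );
 *	TAILQ_INIT( &activeq );
 *
 *	// a worker thread claims the oldest queued request
 *	my_entryp = TAILQ_FIRST( &workq );
 *	if ( my_entryp != NULL ) {
 *		TAILQ_REMOVE( &workq, my_entryp, aio_workq_link );
 *		TAILQ_INSERT_TAIL( &activeq, my_entryp, aio_workq_link );
 *	}
 */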
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
#define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu


/*
 * async IO locking macros used to protect critical sections.
 */
#define AIO_LOCK    lck_mtx_lock(aio_lock)
#define AIO_UNLOCK  lck_mtx_unlock(aio_lock)
static int          aio_active_requests_for_process( struct proc *procp );
static boolean_t    aio_delay_fsync_request( aio_workq_entry *entryp );
static int          aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
static int          aio_get_all_queues_count( void );
static int          aio_get_process_count( struct proc *procp );
static aio_workq_entry *  aio_get_some_work( void );
static boolean_t    aio_last_group_io( aio_workq_entry *entryp );
static void         aio_mark_requests( aio_workq_entry *entryp );
static int          aio_queue_async_request( struct proc *procp,
                                             user_addr_t aiocbp, int kindOfIO );
static int          aio_validate( aio_workq_entry *entryp );
static void         aio_work_thread( void );
static int          do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
                                   boolean_t wait_for_completion,
                                   boolean_t disable_notification );
static void         do_aio_completion( aio_workq_entry *entryp );
static int          do_aio_fsync( aio_workq_entry *entryp );
static int          do_aio_read( aio_workq_entry *entryp );
static int          do_aio_write( aio_workq_entry *entryp );
static void         do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
static boolean_t    is_already_queued( struct proc *procp,
                                       user_addr_t aiocbp );
static int          lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
                                            user_addr_t sigp, long group_tag,
                                            aio_workq_entry **entrypp );
static int          lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
                                           long group_tag,
                                           aio_workq_entry **entrypp );
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int          dofileread( struct proc *p, struct fileproc *fp, int fd,
                                user_addr_t bufp, user_size_t nbyte,
                                off_t offset, int flags, user_ssize_t *retval );
extern int          dofilewrite( struct proc *p, struct fileproc *fp, int fd,
                                 user_addr_t bufp, user_size_t nbyte, off_t offset,
                                 int flags, user_ssize_t *retval );
/*
 * aio external global variables.
 */
extern int aio_max_requests;                /* AIO_MAX - configurable */
extern int aio_max_requests_per_process;    /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;              /* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb        aio_anchor;
static lck_mtx_t *          aio_lock;
static lck_grp_t *          aio_lock_grp;
static lck_attr_t *         aio_lock_attr;
static lck_grp_attr_t *     aio_lock_grp_attr;
static struct zone          *aio_workq_zonep;
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
int
aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
{
    struct user_aiocb       my_aiocb;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    result = aio_get_all_queues_count( );

    if ( uap->aiocbp != USER_ADDR_NULL ) {
        if ( !IS_64BIT_PROCESS(p) ) {
            struct aiocb        aiocb32;

            result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
                do_munge_aiocb( &aiocb32, &my_aiocb );

            result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation.  */
        if ( uap->fd != my_aiocb.aio_fildes ) {

    result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );

    if ( result != -1 ) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );
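
/*
 * Illustrative user land sketch (not part of the kernel source): cancelling
 * one outstanding request with aio_cancel(2) and checking the result.  The
 * descriptor "fd" and the aiocb "cb" are assumed to have been set up by an
 * earlier aio_read()/aio_write() call.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	switch ( aio_cancel( fd, &cb ) ) {
 *	case AIO_CANCELED:
 *		// request was still queued and has been cancelled
 *		break;
 *	case AIO_NOTCANCELED:
 *		// request is already in progress; poll aio_error() until done
 *		break;
 *	case AIO_ALLDONE:
 *		// request had already completed; reap it with aio_return()
 *		break;
 *	default:
 *		// -1 with errno set (e.g. EBADF when fd does not match aio_fildes)
 *		break;
 *	}
 */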
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 */
__private_extern__ void
_aio_close( struct proc *p, int fd )
{
    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel( p, fd, 0, TRUE, FALSE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
{
    aio_workq_entry        *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = entryp->errorval;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
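
/*
 * Illustrative user land sketch (not part of the kernel source): polling a
 * request with aio_error(3) until it leaves the EINPROGRESS state.  "cb" is
 * assumed to be an aiocb previously submitted with aio_read() or aio_write().
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	int     err;
 *	ssize_t result;
 *
 *	while ( (err = aio_error( &cb )) == EINPROGRESS )
 *		;	// still on the work or active queue; do other work here
 *	if ( err == 0 )
 *		result = aio_return( &cb );	// reap it so the kernel can free the entry
 *	else
 *		errno = err;	// the errno the synchronous call would have set
 */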
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call yet.
 */
int
aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    /* 0 := O_SYNC for binary backward compatibility with Panther */
    if (uap->op == O_SYNC || uap->op == 0)
        fsync_kind = AIO_FSYNC;
#if 0 // we don't support fdatasync() call yet
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
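
/*
 * Illustrative user land sketch (not part of the kernel source): forcing all
 * previously queued writes on "fd" to stable storage with aio_fsync(3).  Only
 * O_SYNC is honored here; O_DSYNC is not supported (see above).  "fd" and
 * "cb" are assumptions for the example.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	struct aiocb cb;
 *
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	if ( aio_fsync( O_SYNC, &cb ) == 0 ) {
 *		// completes only after the async writes queued before this call
 *		while ( aio_error( &cb ) == EINPROGRESS )
 *			;
 *	}
 */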
/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
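
/*
 * Illustrative user land sketch (not part of the kernel source): submitting
 * an asynchronous read.  The kernel copies the aiocb in (see
 * aio_queue_async_request below), but the buffer must stay valid until the
 * request is reaped with aio_return().  "fd" is an assumption for the example.
 *
 *	#include <aio.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static struct aiocb cb;
 *	static char         buf[4096];
 *
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if ( aio_read( &cb ) != 0 )
 *		perror( "aio_read" );	// e.g. EAGAIN when the aio limits are hit
 */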
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return( struct proc *p, struct aio_return_args *uap, user_ssize_t *retval )
{
    aio_workq_entry        *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count--;

            *retval = entryp->returnval;

            /* we cannot free requests that are still completing */
            if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;

                aio_free_request( entryp, my_map );

                /* tell completion code to free this request */
                entryp->flags |= AIO_DO_FREE;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
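
/*
 * Illustrative user land sketch (not part of the kernel source): reaping a
 * completed request.  aio_return() should be called once per request and is
 * what lets the kernel release the aio_workq_entry, so every submitted aiocb
 * should eventually be passed to it.  "cb" is assumed to be a previously
 * submitted aiocb.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	ssize_t nbytes;
 *	int     err;
 *
 *	err = aio_error( &cb );
 *	if ( err != EINPROGRESS )
 *		nbytes = aio_return( &cb );	// bytes transferred, or -1 when err != 0
 */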
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec( struct proc *p )
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 */
__private_extern__ void
_aio_exit( struct proc *p )
{
    aio_workq_entry        *entryp;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel( p, 0, 0, TRUE, TRUE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->aio_doneq );
    while ( entryp != NULL ) {
        aio_workq_entry        *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
        aio_anchor.aio_done_count--;

        /* we cannot free requests that are still completing */
        if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;

            aio_free_request( entryp, my_map );

            /* need to start over since aio_doneq may have been */
            /* changed while we were away.  */
            entryp = TAILQ_FIRST( &p->aio_doneq );

            /* tell completion code to free this request */
            entryp->flags |= AIO_DO_FREE;
        entryp = next_entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
/*
 * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel()).
 */
static int
do_aio_cancel( struct proc *p, int fd, user_addr_t aiocbp,
               boolean_t wait_for_completion, boolean_t disable_notification )
{
    aio_workq_entry        *entryp;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
    while ( entryp != NULL ) {
        aio_workq_entry        *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
                 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
                aio_anchor.aio_async_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                entryp->flags |= AIO_COMPLETION;

                /* do completion processing for this request */
                do_aio_completion( entryp );

                entryp->flags &= ~AIO_COMPLETION;
                if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                    my_map = entryp->aio_map;
                    entryp->aio_map = VM_MAP_NULL;

                    aio_free_request( entryp, my_map );

                if ( aiocbp != USER_ADDR_NULL ) {

                /* need to start over since aio_async_workq may have been */
                /* changed while we were away doing completion processing.  */
                entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );

        entryp = next_entryp;

    /*
     * look for a match on our queue of synchronous todo work.  This will
     * be a rare occurrence but could happen if a process is terminated while
     * processing a lio_listio call.
     */
    entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
    while ( entryp != NULL ) {
        aio_workq_entry        *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
                 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                if ( aiocbp != USER_ADDR_NULL ) {

        entryp = next_entryp;

    /*
     * look for a match on our queue of active async IO requests and
     * return AIO_NOTCANCELED result.
     */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
             (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
             (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            if ( wait_for_completion )
                entryp->flags |= AIO_WAITING; /* flag for special completion processing */
            if ( disable_notification )
                entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
            if ( aiocbp != USER_ADDR_NULL ) {

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
                 (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
                result = AIO_ALLDONE;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != USER_ADDR_NULL ) {

} /* do_aio_cancel */
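
/*
 * Illustrative user land sketch (not part of the kernel source): draining a
 * descriptor before close() by cancelling everything outstanding on it.
 * Passing a NULL aiocb pointer selects the "all requests for this fd" mode
 * described above.  "fd" is an assumption for the example.
 *
 *	#include <aio.h>
 *	#include <unistd.h>
 *
 *	if ( aio_cancel( fd, NULL ) == AIO_NOTCANCELED ) {
 *		// some requests were already active; they cannot be cancelled,
 *		// so wait for them before reusing their buffers
 *	}
 *	close( fd );	// _aio_close() will wait for any active requests
 */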
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * passed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt occurs.
 */
int
aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
{
    struct user_timespec    ts;
    aio_workq_entry         *entryp;
    user_addr_t             *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
        goto ExitThisRoutine;

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
        goto ExitThisRoutine;

    if ( uap->timeoutp != USER_ADDR_NULL ) {
        if ( proc_is64bit(p) ) {
            error = copyin( uap->timeoutp, &ts, sizeof(ts) );
            struct timespec temp;
            error = copyin( uap->timeoutp, &temp, sizeof(temp) );
                ts.tv_sec = temp.tv_sec;
                ts.tv_nsec = temp.tv_nsec;
            goto ExitThisRoutine;

        if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            goto ExitThisRoutine;

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL ) {
        goto ExitThisRoutine;

    /* copyin our aiocb pointers from list */
    error = copyin( uap->aiocblist, aiocbpp,
                    proc_is64bit(p) ? (uap->nent * sizeof(user_addr_t))
                                    : (uap->nent * sizeof(uintptr_t)) );
        goto ExitThisRoutine;

    /* we depend on a list of user_addr_t's so we need to munge and expand */
    /* when these pointers came from a 32-bit process */
    if ( !proc_is64bit(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
        /* position to the last entry and work back from there */
        uintptr_t   *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
        user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
        for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
            *my_addrp = (user_addr_t) (*my_ptrp);

    /* check list of aio requests to see if any have completed */
    for ( i = 0; i < uap->nent; i++ ) {

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == USER_ADDR_NULL )

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( entryp->uaiocbp == aiocbp ) {
                goto ExitThisRoutine;
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().
     */
    assert_wait_deadline( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE, abstime );

    error = thread_block( THREAD_CONTINUE_NULL );

    if ( error == THREAD_AWAKENED ) {
        /* got our wakeup call from aio_work_thread() */
    else if ( error == THREAD_TIMED_OUT ) {
        /* our timeout expired */
        /* we were interrupted */

    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );
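
/*
 * Illustrative user land sketch (not part of the kernel source): blocking
 * for up to one second on two outstanding requests with aio_suspend(3).
 * "cb1" and "cb2" are assumed to be previously submitted aiocbs.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	const struct aiocb  *list[2] = { &cb1, &cb2 };
 *	struct timespec     timeout = { 1, 0 };		// tv_nsec must be < 1000000000
 *
 *	if ( aio_suspend( list, 2, &timeout ) == 0 ) {
 *		// at least one request is on the done queue; find it with aio_error()
 *	} else if ( errno == EAGAIN ) {
 *		// timeout expired before any request completed
 *	} else if ( errno == EINTR ) {
 *		// a signal interrupted the wait
 *	}
 */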
/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
{
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
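
/*
 * Illustrative user land sketch (not part of the kernel source): submitting
 * an asynchronous write at an explicit offset.  As with aio_read(), the aiocb
 * and buffer must remain valid until the request is reaped.  "fd" is an
 * assumption for the example.
 *
 *	#include <aio.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static struct aiocb cb;
 *	static char         msg[] = "hello, aio\n";
 *
 *	memset( &cb, 0, sizeof(cb) );
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = msg;
 *	cb.aio_nbytes = sizeof(msg) - 1;
 *	cb.aio_offset = 0;
 *	if ( aio_write( &cb ) != 0 )
 *		perror( "aio_write" );
 */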
/*
 * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 * The caller gets error and return status for each aiocb in the list via aio_error
 * and aio_return.  We must keep completed requests until released by the
 * aio_return call.
 */
int
lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
{
    aio_workq_entry *           *entryp_listp;
    user_addr_t                 *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;

    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;

    /*
     * we use group_tag to mark IO requests for delayed completion processing
     * which means we wait until all IO requests in the group have completed
     * before we either return to the caller when mode is LIO_WAIT or signal
     * the user when mode is LIO_NOWAIT.
     */
    group_tag = random();

    /*
     * allocate a list of aio_workq_entry pointers that we will use to queue
     * up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;

    /* we reserve enough space for largest possible pointer size */
    MALLOC( aiocbpp, user_addr_t *, (uap->nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL ) {
        call_result = EAGAIN;

    /* copyin our aiocb pointers from list */
    result = copyin( uap->aiocblist, aiocbpp,
                     IS_64BIT_PROCESS(p) ? (uap->nent * sizeof(user_addr_t))
                                         : (uap->nent * sizeof(uintptr_t)) );
    if ( result != 0 ) {
        call_result = EAGAIN;

    /* we depend on a list of user_addr_t's so we need to munge and expand */
    /* when these pointers came from a 32-bit process */
    if ( !IS_64BIT_PROCESS(p) && sizeof(uintptr_t) < sizeof(user_addr_t) ) {
        /* position to the last entry and work back from there */
        uintptr_t   *my_ptrp = ((uintptr_t *)aiocbpp) + (uap->nent - 1);
        user_addr_t *my_addrp = aiocbpp + (uap->nent - 1);
        for (i = 0; i < uap->nent; i++, my_ptrp--, my_addrp--) {
            *my_addrp = (user_addr_t) (*my_ptrp);

    /* process list of aio requests */
    for ( i = 0; i < uap->nent; i++ ) {
        user_addr_t my_aiocbp;

        *(entryp_listp + i) = NULL;
        my_aiocbp = *(aiocbpp + i);

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == USER_ADDR_NULL )

        if ( uap->mode == LIO_NOWAIT )
            result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
                                             group_tag, (entryp_listp + i) );
            result = lio_create_sync_entry( p, my_aiocbp, group_tag,
                                            (entryp_listp + i) );

        if ( result != 0 && call_result == -1 )
            call_result = result;

    /*
     * we need to protect this section since we do not want any of these grouped
     * IO requests to begin until we have them all on the queue.
     */
    for ( i = 0; i < uap->nent; i++ ) {
        aio_workq_entry        *entryp;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL )

        /* check our aio limits to throttle bad or rude user land behavior */
        if ( aio_get_all_queues_count( ) >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            if ( call_result == -1 )
                call_result = EAGAIN;
            aio_free_request( entryp, my_map );

        /* place the request on the appropriate queue */
        if ( uap->mode == LIO_NOWAIT ) {
            TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
            aio_anchor.aio_async_workq_count++;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                          (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
            TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
            aio_anchor.lio_sync_workq_count++;

    if ( uap->mode == LIO_NOWAIT ) {
        /* caller does not want to wait so we'll fire off a worker thread and return */
        wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
        aio_workq_entry        *entryp;

        /*
         * mode is LIO_WAIT - handle the IO requests now.
         */
        entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
        while ( entryp != NULL ) {
            if ( p == entryp->procp && group_tag == entryp->group_tag ) {

                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;

                if ( (entryp->flags & AIO_READ) != 0 ) {
                    error = do_aio_read( entryp );
                else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                    error = do_aio_write( entryp );
                else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                    error = do_aio_fsync( entryp );
                    printf( "%s - unknown aio request - flags 0x%02X \n",
                            __FUNCTION__, entryp->flags );

                entryp->errorval = error;
                if ( error != 0 && call_result == -1 )

                /* we're done with the IO request so move it on the done queue */
                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;
                p->aio_done_count++;

                /* need to start over since lio_sync_workq may have been changed while we */
                /* were away doing the IO.  */
                entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
            } /* p == entryp->procp */

            entryp = TAILQ_NEXT( entryp, aio_workq_link );
        } /* while ( entryp != NULL ) */
    } /* uap->mode == LIO_WAIT */

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {

    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );
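
/*
 * Illustrative user land sketch (not part of the kernel source): issuing a
 * read and a write as one batch with lio_listio(3) in LIO_WAIT mode, which
 * corresponds to the lio_sync_workq path above.  "fd_in", "fd_out", "inbuf"
 * and "outbuf" are assumptions for the example.
 *
 *	#include <aio.h>
 *	#include <stdio.h>
 *
 *	struct aiocb    rd = { 0 }, wr = { 0 };
 *	struct aiocb    *list[2] = { &rd, &wr };
 *
 *	rd.aio_fildes = fd_in;
 *	rd.aio_buf = inbuf;
 *	rd.aio_nbytes = sizeof(inbuf);
 *	rd.aio_lio_opcode = LIO_READ;
 *
 *	wr.aio_fildes = fd_out;
 *	wr.aio_buf = outbuf;
 *	wr.aio_nbytes = sizeof(outbuf);
 *	wr.aio_lio_opcode = LIO_WRITE;
 *
 *	// returns when both requests have been handled; individual status is
 *	// still read with aio_error()/aio_return() on each aiocb
 *	if ( lio_listio( LIO_WAIT, list, 2, NULL ) != 0 )
 *		perror( "lio_listio" );
 */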
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
    aio_workq_entry        *entryp;

        entryp = aio_get_some_work();
    if ( entryp == NULL ) {
        /*
         * aio worker threads wait for some work to get queued up
         * by aio_queue_async_request.  Once some work gets queued
         * it will wake up one of these worker threads just before
         * returning to our caller in user land.
         */
            assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );

            thread_block( (thread_continue_t)aio_work_thread );
            vm_map_t        currentmap;
            vm_map_t        oldmap = VM_MAP_NULL;
            task_t          oldaiotask = TASK_NULL;
            struct uthread  *uthreadp = NULL;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

            /*
             * Assume the target's address space identity for the duration
             * of the IO.
             */
            currentmap = get_task_map( (current_proc())->task );
            if ( currentmap != entryp->aio_map ) {
                uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
                oldaiotask = uthreadp->uu_aio_task;
                uthreadp->uu_aio_task = entryp->procp->task;
                oldmap = vm_map_switch( entryp->aio_map );

            if ( (entryp->flags & AIO_READ) != 0 ) {
                error = do_aio_read( entryp );
            else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                error = do_aio_write( entryp );
            else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                error = do_aio_fsync( entryp );
                printf( "%s - unknown aio request - flags 0x%02X \n",
                        __FUNCTION__, entryp->flags );

            entryp->errorval = error;
            if ( currentmap != entryp->aio_map ) {
                (void) vm_map_switch( oldmap );
                uthreadp->uu_aio_task = oldaiotask;

            /* we're done with the IO request so pop it off the active queue and */
            /* push it on the done queue */
            TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
            aio_anchor.aio_active_count--;
            entryp->procp->aio_active_count--;
            TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count++;
            entryp->procp->aio_done_count++;
            entryp->flags |= AIO_COMPLETION;

            /* remove our reference to the user land map. */
            if ( VM_MAP_NULL != entryp->aio_map ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
                vm_map_deallocate( my_map );

            do_aio_completion( entryp );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                          entryp->returnval, 0 );

            entryp->flags &= ~AIO_COMPLETION;
            if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;

                aio_free_request( entryp, my_map );

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry        *entryp;

    /* pop some work off the work queue and add to our active queue */
    for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
          entryp != NULL;
          entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {

        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /* leave aio_fsync calls on the work queue if there are IO */
            /* requests on the active queue for the same file descriptor. */
            if ( aio_delay_fsync_request( entryp ) ) {

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    if ( entryp != NULL ) {
        TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
        aio_anchor.aio_async_workq_count--;
        TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
        aio_anchor.aio_active_count++;
        entryp->procp->aio_active_count++;

} /* aio_get_some_work */
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
 * this time.  Delay will happen when there are any active IOs for the same file
 * descriptor that were queued at the time the aio_fsync call was queued.
 * NOTE - AIO_LOCK must be held by caller
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
    aio_workq_entry        *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->fsyncp != USER_ADDR_NULL &&
             entryp->uaiocbp == my_entryp->fsyncp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {

} /* aio_delay_fsync_request */

/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request( struct proc *procp, user_addr_t aiocbp, int kindOfIO )
{
    aio_workq_entry         *entryp;
    int                     result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->aio_map = VM_MAP_NULL;

    if ( !IS_64BIT_PROCESS(procp) ) {
        struct aiocb aiocb32;

        result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
        if ( result == 0 )
            do_munge_aiocb( &aiocb32, &entryp->aiocb );
    }
    else
        result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );

    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
    if ( result != 0 )
        goto error_exit;

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    AIO_LOCK;

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
        AIO_UNLOCK;
        result = EAGAIN;
        goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if ( aio_get_all_queues_count( ) >= aio_max_requests ||
         aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
        AIO_UNLOCK;
        result = EAGAIN;
        goto error_exit;
    }

    /*
     * aio_fsync calls sync up all async IO requests queued at the time
     * the aio_fsync call was made.  So we mark each currently queued async
     * IO with a matching file descriptor as one that must complete before we
     * do the fsync.  We set the fsyncp field of each matching async IO
     * request with the aiocb pointer passed in on the aio_fsync call to
     * know which IOs must complete before we process the aio_fsync call.
     */
    if ( (kindOfIO & AIO_FSYNC) != 0 )
        aio_mark_requests( entryp );

    /* queue up on our aio asynchronous work queue */
    TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
    aio_anchor.aio_async_workq_count++;

    wakeup_one( (caddr_t) &aio_anchor.aio_async_workq );
    AIO_UNLOCK;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    return( 0 );

error_exit:
    if ( entryp != NULL ) {
        /* this entry has not been queued up so no worries about unlocked */
        /* state and aio_map */
        aio_free_request( entryp, entryp->aio_map );
    }

    return( result );

} /* aio_queue_async_request */
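
/*
 * Illustrative sketch only (not compiled as part of this file): a minimal
 * user land caller of the POSIX AIO facility that this queueing path serves.
 * The file name "/tmp/example.dat" and the buffer size are assumptions made
 * for the example; error handling is abbreviated.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
example_aio_read( void )
{
    static char     buf[ 4096 ];
    struct aiocb    cb;
    int             fd;
    int             nread;

    fd = open( "/tmp/example.dat", O_RDONLY );
    if ( fd < 0 )
        return( -1 );

    memset( &cb, 0, sizeof(cb) );
    cb.aio_fildes = fd;
    cb.aio_buf = buf;
    cb.aio_nbytes = sizeof(buf);
    cb.aio_offset = 0;

    /* aio_read(2) is the path that ends up in aio_queue_async_request() */
    if ( aio_read( &cb ) != 0 ) {
        close( fd );
        return( -1 );
    }

    /* poll for completion; a real caller would likely use aio_suspend() */
    while ( aio_error( &cb ) == EINPROGRESS )
        usleep( 1000 );

    nread = (int) aio_return( &cb );
    close( fd );
    return( nread );
}
#endif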

/*
 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.  We get a reference to our caller's user land map in order to keep
 * it around while we are processing the request.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
static int
lio_create_async_entry( struct proc *procp, user_addr_t aiocbp,
                        user_addr_t sigp, long group_tag,
                        aio_workq_entry **entrypp )
{
    aio_workq_entry         *entryp;
    int                     result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;

    if ( !IS_64BIT_PROCESS(procp) ) {
        struct aiocb aiocb32;

        result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
        if ( result == 0 )
            do_munge_aiocb( &aiocb32, &entryp->aiocb );
    }
    else
        result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );

    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry.  */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    /* use sigevent passed in to lio_listio for each of our calls, but only */
    /* do completion notification after the last request completes. */
    if ( sigp != USER_ADDR_NULL ) {
        if ( !IS_64BIT_PROCESS(procp) ) {
            struct sigevent sigevent32;

            result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
            if ( result == 0 ) {
                /* also need to munge aio_sigevent since it contains pointers */
                /* special case here.  since we do not know if sigev_value is an */
                /* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
                /* means if we send this info back to user space we need to remember */
                /* sigev_value was not expanded for the 32-bit case.  */
                /* NOTE - this does NOT affect us since we don't support sigev_value */
                /* yet in the aio context.  */

                entryp->aiocb.aio_sigevent.sigev_notify = sigevent32.sigev_notify;
                entryp->aiocb.aio_sigevent.sigev_signo = sigevent32.sigev_signo;
                entryp->aiocb.aio_sigevent.sigev_value.size_equivalent.sival_int =
                    sigevent32.sigev_value.sival_int;
                entryp->aiocb.aio_sigevent.sigev_notify_function =
                    CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
                entryp->aiocb.aio_sigevent.sigev_notify_attributes =
                    CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
            }
        }
        else
            result = copyin( sigp, &entryp->aiocb.aio_sigevent,
                             sizeof(entryp->aiocb.aio_sigevent) );

        if ( result != 0 ) {
            result = EAGAIN;
            goto error_exit;
        }
    }

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
    if ( result != 0 )
        goto error_exit;

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL )
        zfree( aio_workq_zonep, entryp );

    return( result );

} /* lio_create_async_entry */
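
/*
 * Illustrative sketch only (not compiled as part of this file): the user land
 * shape of a LIO_NOWAIT lio_listio() call, which is the case this routine
 * builds entries for - one sigevent for the whole group, delivered only after
 * the last request in the list completes.  SIGUSR1 and the two read requests
 * are assumptions made for the example.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>
#include <sys/types.h>

int
example_lio_nowait( int fd, char *buf_a, char *buf_b, size_t len )
{
    struct aiocb        cb_a, cb_b;
    struct aiocb        *list[2] = { &cb_a, &cb_b };
    struct sigevent     sev;

    memset( &cb_a, 0, sizeof(cb_a) );
    cb_a.aio_fildes = fd;
    cb_a.aio_buf = buf_a;
    cb_a.aio_nbytes = len;
    cb_a.aio_offset = 0;
    cb_a.aio_lio_opcode = LIO_READ;

    cb_b = cb_a;
    cb_b.aio_buf = buf_b;
    cb_b.aio_offset = (off_t) len;
    cb_b.aio_lio_opcode = LIO_READ;

    /* one signal for the whole group, sent only when both reads are done */
    memset( &sev, 0, sizeof(sev) );
    sev.sigev_notify = SIGEV_SIGNAL;
    sev.sigev_signo = SIGUSR1;

    return( lio_listio( LIO_NOWAIT, list, 2, &sev ) );
}
#endif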

/*
 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
 * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
 * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
 * NOTE - AIO_LOCK must be held by caller
 */
static void
aio_mark_requests( aio_workq_entry *entryp )
{
    aio_workq_entry         *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;
        }
    }

    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( entryp->procp == my_entryp->procp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;
        }
    }

} /* aio_mark_requests */
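
/*
 * Illustrative sketch only (not compiled as part of this file): how a user land
 * caller relies on the ordering that aio_mark_requests() implements - an
 * aio_fsync() only covers writes that were already queued for the same
 * descriptor when the fsync was submitted.  The descriptor "fd" and the data
 * buffer are assumptions made for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
example_write_then_fsync( int fd, char *data, size_t len )
{
    struct aiocb    write_cb;
    struct aiocb    fsync_cb;

    memset( &write_cb, 0, sizeof(write_cb) );
    write_cb.aio_fildes = fd;
    write_cb.aio_buf = data;
    write_cb.aio_nbytes = len;
    write_cb.aio_offset = 0;
    if ( aio_write( &write_cb ) != 0 )
        return( -1 );

    /* this fsync gets marked against the write above (same fd, already queued) */
    memset( &fsync_cb, 0, sizeof(fsync_cb) );
    fsync_cb.aio_fildes = fd;
    if ( aio_fsync( O_SYNC, &fsync_cb ) != 0 )
        return( -1 );

    /* wait for the fsync; it is delayed until the earlier write completes */
    while ( aio_error( &fsync_cb ) == EINPROGRESS )
        usleep( 1000 );

    return( (int) aio_return( &fsync_cb ) );
}
#endif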

/*
 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
static int
lio_create_sync_entry( struct proc *procp, user_addr_t aiocbp,
                       long group_tag, aio_workq_entry **entrypp )
{
    aio_workq_entry         *entryp;
    int                     result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;

    if ( !IS_64BIT_PROCESS(procp) ) {
        struct aiocb aiocb32;

        result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
        if ( result == 0 )
            do_munge_aiocb( &aiocb32, &entryp->aiocb );
    }
    else
        result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );

    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry.  */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    result = aio_validate( entryp );
    if ( result != 0 ) {
        goto error_exit;
    }

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL )
        zfree( aio_workq_zonep, entryp );

    return( result );

} /* lio_create_sync_entry */
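
/*
 * Illustrative sketch only (not compiled as part of this file): the LIO_WAIT
 * flavor of lio_listio() that lio_create_sync_entry() serves - no sigevent is
 * needed because the call itself blocks until every request in the list is
 * done.  The descriptor and buffer are assumptions made for the example.
 */
#if 0
#include <aio.h>
#include <string.h>

int
example_lio_wait( int fd, char *buf, size_t len )
{
    struct aiocb    cb;
    struct aiocb    *list[1] = { &cb };

    memset( &cb, 0, sizeof(cb) );
    cb.aio_fildes = fd;
    cb.aio_buf = buf;
    cb.aio_nbytes = len;
    cb.aio_offset = 0;
    cb.aio_lio_opcode = LIO_READ;

    /* returns only after the read has completed (or failed) */
    return( lio_listio( LIO_WAIT, list, 1, NULL ) );
}
#endif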

/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.
 * We are not holding the lock here; thus aio_map is passed in, having been
 * zeroed while we did have the lock.
 */
static int
aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
{
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != the_map ) {
        vm_map_deallocate( the_map );
    }

    zfree( aio_workq_zonep, entryp );

    return( 0 );

} /* aio_free_request */

/* aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
    struct fileproc         *fp;
    int                     flag;
    int                     result;
    int                     signum;

    result = 0;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
            return( 0 );
        else
            return( EINVAL );
    }

    flag = FREAD;
    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
        flag = FWRITE;
    }

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        // LP64todo - does max value for aio_nbytes need to grow?
        if ( entryp->aiocb.aio_nbytes > INT_MAX        ||
             entryp->aiocb.aio_buf == USER_ADDR_NULL ||
             entryp->aiocb.aio_offset < 0 )
            return( EINVAL );
    }

    /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
     * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
     * sigev_notify_function, and sigev_notify_attributes are ignored.
     */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
            return( EINVAL );
    }
    else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
        return( EINVAL );

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.
     */
    proc_fdlock(entryp->procp);

    result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
    if ( result == 0 ) {
        if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
            /* we don't have read or write access */
            result = EBADF;
        }
        else if ( fp->f_fglob->fg_type != DTYPE_VNODE ) {
            /* this is not a file */
            result = ESPIPE;
        }
        else
            fp->f_flags |= FP_AIOISSUED;

        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
    }
    else
        result = EBADF;

    proc_fdunlock(entryp->procp);

    return( result );

} /* aio_validate */
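
/*
 * Illustrative sketch only (not compiled as part of this file): the user land
 * constraints this validation enforces - sigev_notify limited to SIGEV_SIGNAL
 * or SIGEV_NONE, a non-NULL buffer, a byte count no larger than INT_MAX, a
 * non-negative offset, and a descriptor opened with matching access.  The
 * descriptor parameter is an assumption made for the example.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

int
example_valid_aiocb( int fd_opened_for_read, char *buf, size_t len )
{
    struct aiocb    cb;

    memset( &cb, 0, sizeof(cb) );
    cb.aio_fildes = fd_opened_for_read;     /* must allow read access for a read */
    cb.aio_buf = buf;                       /* NULL buffers are rejected */
    cb.aio_nbytes = len;                    /* must not exceed INT_MAX */
    cb.aio_offset = 0;                      /* negative offsets are rejected */

    /* SIGEV_SIGNAL and SIGEV_NONE are accepted; other notify types would be
     * rejected with EINVAL by this validation */
    cb.aio_sigevent.sigev_notify = SIGEV_NONE;

    return( aio_read( &cb ) );              /* 0 on successful queueing */
}
#endif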

/*
 * aio_get_process_count - runs through our queues that hold outstanding
 * async IO requests and totals up the number of requests for the given
 * process.
 * NOTE - caller must hold aio lock!
 */
static int
aio_get_process_count( struct proc *procp )
{
    aio_workq_entry         *entryp;
    int                     count;

    /* begin with count of completed async IO requests for this process */
    count = procp->aio_done_count;

    /* add in count of active async IO requests for this process */
    count += procp->aio_active_count;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {
            count++;
        }
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {
            count++;
        }
    }

    return( count );

} /* aio_get_process_count */

/*
 * aio_get_all_queues_count - get total number of entries on all aio work queues.
 * NOTE - caller must hold aio lock!
 */
static int
aio_get_all_queues_count( void )
{
    int                     count;

    count = aio_anchor.aio_async_workq_count;
    count += aio_anchor.lio_sync_workq_count;
    count += aio_anchor.aio_active_count;
    count += aio_anchor.aio_done_count;

    return( count );

} /* aio_get_all_queues_count */

/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
    /* signal user land process if appropriate */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
         (entryp->flags & AIO_DISABLE) == 0 ) {

        /*
         * if group_tag is non zero then make sure this is the last IO request
         * in the group before we signal.
         */
        if ( entryp->group_tag == 0 ||
             (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp,
                          entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

            psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
            return;
        }
    }

    /*
     * need to handle case where a process is trying to exit, exec, or close
     * and is currently waiting for active aio requests to complete.  If
     * AIO_WAITING is set then we need to look to see if there are any
     * other requests in the active queue for this process.  If there are
     * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
     * there are some still active then do nothing - we only want to wakeup
     * when all active aio requests for the process are complete.
     */
    if ( (entryp->flags & AIO_WAITING) != 0 ) {
        int             active_requests;

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
                      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

        AIO_LOCK;
        active_requests = aio_active_requests_for_process( entryp->procp );
        if ( active_requests < 1 ) {
            /* no active aio requests for this process, continue exiting */
            wakeup_one( (caddr_t) &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
        }
        AIO_UNLOCK;
        return;
    }

    /*
     * aio_suspend case when a signal was not requested.  In that scenario we
     * are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
     * NOTE - the assumption here is that this wakeup call is inexpensive.
     * we really only need to do this when an aio_suspend call is pending.
     * If we find the wakeup call should be avoided we could mark the
     * async IO requests given in the list provided by aio_suspend and only
     * call wakeup for them.  If we do mark them we should unmark them after
     * the aio_suspend wakes up.
     */
    wakeup_one( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
                  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    return;

} /* do_aio_completion */
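
/*
 * Illustrative sketch only (not compiled as part of this file): a user land
 * caller arranging for the SIGEV_SIGNAL notification that this completion
 * path delivers via psignal().  SIGUSR1 and the handler name are assumptions
 * made for the example; a real handler would typically just record the event.
 */
#if 0
#include <aio.h>
#include <signal.h>
#include <string.h>

static volatile sig_atomic_t aio_done = 0;

static void
example_aio_handler( int signo )
{
    (void) signo;
    aio_done = 1;           /* async-signal-safe: just record the completion */
}

int
example_read_with_signal( int fd, char *buf, size_t len )
{
    struct sigaction    sa;
    struct aiocb        cb;

    memset( &sa, 0, sizeof(sa) );
    sa.sa_handler = example_aio_handler;
    sigaction( SIGUSR1, &sa, NULL );

    memset( &cb, 0, sizeof(cb) );
    cb.aio_fildes = fd;
    cb.aio_buf = buf;
    cb.aio_nbytes = len;
    cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
    cb.aio_sigevent.sigev_signo = SIGUSR1;

    return( aio_read( &cb ) );  /* SIGUSR1 arrives when the read completes */
}
#endif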

/*
 * aio_last_group_io - checks to see if this is the last unfinished IO request
 * for the given group_tag.  Returns TRUE if there are no other active IO
 * requests for this group or FALSE if there are active IO requests.
 * NOTE - AIO_LOCK must be held by caller
 */
static boolean_t
aio_last_group_io( aio_workq_entry *entryp )
{
    aio_workq_entry         *my_entryp;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    return( TRUE );

} /* aio_last_group_io */

static int
do_aio_read( aio_workq_entry *entryp )
{
    struct fileproc         *fp;
    int                     error;

    if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
        return( error );
    if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
        return( EBADF );
    }
    if ( fp != NULL ) {
        error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
                            entryp->aiocb.aio_buf,
                            entryp->aiocb.aio_nbytes,
                            entryp->aiocb.aio_offset, FOF_OFFSET,
                            &entryp->returnval );
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
    }
    else {
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
        error = EBADF;
    }

    return( error );

} /* do_aio_read */

static int
do_aio_write( aio_workq_entry *entryp )
{
    struct fileproc         *fp;
    int                     error;

    if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
        return( error );
    if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
        return( EBADF );
    }
    if ( fp != NULL ) {
        /* NB: tell dofilewrite the offset, and to use the proc cred */
        error = dofilewrite( entryp->procp,
                             fp,
                             entryp->aiocb.aio_fildes,
                             entryp->aiocb.aio_buf,
                             entryp->aiocb.aio_nbytes,
                             entryp->aiocb.aio_offset,
                             FOF_OFFSET | FOF_PCRED,
                             &entryp->returnval);
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
    }
    else {
        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
        error = EBADF;
    }

    return( error );

} /* do_aio_write */

/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 * NOTE - caller must hold aio lock!
 */
static int
aio_active_requests_for_process( struct proc *procp )
{
    return( procp->aio_active_count );

} /* aio_active_requests_for_process */

static int
do_aio_fsync( aio_workq_entry *entryp )
{
    struct vfs_context      context;
    struct vnode            *vp;
    struct fileproc         *fp;
    int                     error;

    /*
     * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
     * AIO_DSYNC is caught before we queue up a request and flagged as an error.
     * The following was shamelessly extracted from the fsync() implementation.
     */
    error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
    if ( error == 0 ) {
        if ( (error = vnode_getwithref(vp)) ) {
            fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
            entryp->returnval = -1;
            return( error );
        }
        context.vc_proc = entryp->procp;
        context.vc_ucred = fp->f_fglob->fg_cred;

        error = VNOP_FSYNC( vp, MNT_WAIT, &context);

        (void)vnode_put(vp);

        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
    }
    if ( error != 0 )
        entryp->returnval = -1;

    return( error );

} /* do_aio_fsync */

/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 * NOTE - callers must hold aio lock!
 */
static boolean_t
is_already_queued(  struct proc *procp,
                    user_addr_t aiocbp )
{
    aio_workq_entry         *entryp;
    boolean_t               result;

    result = FALSE;

    /* look for matches on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

ExitThisRoutine:
    return( result );

} /* is_already_queued */
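
/*
 * Illustrative sketch only (not compiled as part of this file): the user land
 * behavior this duplicate check produces - re-submitting an aiocb that is
 * still outstanding is rejected (with EAGAIN in this implementation) until the
 * earlier request has been reaped with aio_return().  The descriptor and
 * buffer are assumptions made for the example.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

void
example_duplicate_submit( int fd, char *buf, size_t len )
{
    struct aiocb    cb;

    memset( &cb, 0, sizeof(cb) );
    cb.aio_fildes = fd;
    cb.aio_buf = buf;
    cb.aio_nbytes = len;

    if ( aio_read( &cb ) != 0 )
        return;

    /* same aiocb, still outstanding: expected to fail rather than queue twice */
    if ( aio_read( &cb ) != 0 )
        printf( "second submit rejected: %s\n", strerror( errno ) );
}
#endif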

/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
    int         i;

    aio_lock_grp_attr = lck_grp_attr_alloc_init();
    aio_lock_grp = lck_grp_alloc_init("aio", aio_lock_grp_attr);
    aio_lock_attr = lck_attr_alloc_init();

    aio_lock = lck_mtx_alloc_init(aio_lock_grp, aio_lock_attr);

    AIO_LOCK;
    TAILQ_INIT( &aio_anchor.aio_async_workq );
    TAILQ_INIT( &aio_anchor.lio_sync_workq );
    aio_anchor.aio_async_workq_count = 0;
    aio_anchor.lio_sync_workq_count = 0;
    aio_anchor.aio_active_count = 0;
    aio_anchor.aio_done_count = 0;
    AIO_UNLOCK;

    i = sizeof( aio_workq_entry );
    aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

    _aio_create_worker_threads( aio_worker_threads );

    return;

} /* aio_init */

/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
    int         i;

    /* create some worker threads to handle the async IO requests */
    for ( i = 0; i < num; i++ ) {
        thread_t        myThread;

        myThread = kernel_thread( kernel_task, aio_work_thread );
        if ( THREAD_NULL == myThread ) {
            printf( "%s - failed to create a work thread \n", __FUNCTION__ );
        }
    }

    return;

} /* _aio_create_worker_threads */

/*
 * Return the current activation utask
 */
task_t
get_aio_task( void )
{
    return  ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}

/*
 * do_munge_aiocb - in the case of an aiocb from a
 * 32-bit process we need to expand some longs and pointers to the correct
 * sizes in order to let downstream code always work on the same type of
 * aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb( struct aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
    the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
    the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
    the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
    the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
    the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
    the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

    /* special case here.  since we do not know if sigev_value is an */
    /* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
    /* means if we send this info back to user space we need to remember */
    /* sigev_value was not expanded for the 32-bit case.  */
    /* NOTE - this does NOT affect us since we don't support sigev_value */
    /* yet in the aio context.  */

    the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
    the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
    the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
        my_aiocbp->aio_sigevent.sigev_value.sival_int;
    the_user_aiocbp->aio_sigevent.sigev_notify_function =
        CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
    the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
        CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);

} /* do_munge_aiocb */
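
/*
 * Illustrative sketch only (not compiled as part of this file): the widening
 * pattern this routine applies, shown on a hypothetical two-field control
 * block.  "example32" / "example_user" and their field names are invented for
 * the illustration; the point is that 32-bit user pointers are expanded to the
 * kernel's 64-bit-clean user_addr_t layout, as CAST_USER_ADDR_T does above.
 */
#if 0
struct example32 {
    int         fd;
    uint32_t    buf;        /* 32-bit user pointer, stored as an integer */
};

struct example_user {
    int         fd;
    user_addr_t buf;        /* always 64 bits wide in the kernel's view */
};

static void
example_munge( struct example32 *in, struct example_user *out )
{
    out->fd = in->fd;
    out->buf = (user_addr_t) in->buf;   /* zero-extend the 32-bit pointer */
}
#endif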