/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
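/*
 * For orientation only: a minimal user land sketch of how the POSIX AIO
 * interfaces implemented below are typically driven.  The control block and
 * call sequence are standard POSIX 1003.1B; the descriptor and buffer names
 * are illustrative placeholders, not something defined in this file.
 *
 *     struct aiocb cb = { 0 };
 *     cb.aio_fildes = fd;                  // descriptor opened by the caller
 *     cb.aio_buf    = buffer;
 *     cb.aio_nbytes = sizeof(buffer);
 *     cb.aio_offset = 0;
 *     aio_read( &cb );                     // queued by aio_read() below
 *     while ( aio_error( &cb ) == EINPROGRESS )
 *         ;                                // or block in aio_suspend()
 *     ssize_t nread = aio_return( &cb );   // releases the kernel resources
 */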
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>

#include <sys/aio_kern.h>

#include <machine/limits.h>
#include <kern/zalloc.h>
#include <kern/task.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7
#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14
#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63
#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73
#define AIO_exit_sleep                  91
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
 * (proc.aio_activeq) when one of our worker threads start the IO.
 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * user process calls aio_return or the process exits, either way that is our
 * trigger to release aio resources.
 */
struct aio_anchor_cb
{
    int         aio_async_workq_count;  /* entries on aio_async_workq */
    int         lio_sync_workq_count;   /* entries on lio_sync_workq */
    int         aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
    int         aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    TAILQ_HEAD( , aio_workq_entry ) aio_async_workq;
    TAILQ_HEAD( , aio_workq_entry ) lio_sync_workq;
};
typedef struct aio_anchor_cb aio_anchor_cb;
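/*
 * Request lifecycle, summarizing the comment above:
 *
 *   aio_async_workq / lio_sync_workq  -- worker thread picks up request -->
 *   proc.aio_activeq                  -- IO completes -->
 *   proc.aio_doneq                    -- aio_return() or process exit -->  freed
 *
 * The counters in aio_anchor_cb track the lengths of these queues; see
 * aio_get_all_queues_count() below for how they are totaled.
 */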
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
#define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu


/*
 * async IO locking macros used to protect critical sections.
 */
#define AIO_LOCK    usimple_lock( &aio_lock )
#define AIO_UNLOCK  usimple_unlock( &aio_lock )
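/*
 * Typical usage pattern throughout this file (descriptive sketch only):
 * queue and counter manipulation is bracketed by these macros, and AIO_UNLOCK
 * must be issued before calling anything that may block, e.g.
 *
 *     AIO_LOCK;
 *     ... inspect / modify aio_anchor and the per-process queues ...
 *     AIO_UNLOCK;    // must unlock before calling vm_map_deallocate(), etc.
 */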
static int      aio_active_requests_for_process( struct proc *procp );
static boolean_t    aio_delay_fsync_request( aio_workq_entry *entryp );
static int      aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
static int      aio_get_all_queues_count( void );
static int      aio_get_process_count( struct proc *procp );
static aio_workq_entry *  aio_get_some_work( void );
static boolean_t    aio_last_group_io( aio_workq_entry *entryp );
static void     aio_mark_requests( aio_workq_entry *entryp );
static int      aio_queue_async_request( struct proc *procp,
                                          struct aiocb *aiocbp,
                                          int kindOfIO );
static int      aio_validate( aio_workq_entry *entryp );
static void     aio_work_thread( void );
static int      do_aio_cancel( struct proc *p,
                               int fd,
                               struct aiocb *aiocbp,
                               boolean_t wait_for_completion,
                               boolean_t disable_notification );
static void     do_aio_completion( aio_workq_entry *entryp );
static int      do_aio_fsync( aio_workq_entry *entryp );
static int      do_aio_read( aio_workq_entry *entryp );
static int      do_aio_write( aio_workq_entry *entryp );
static boolean_t    is_already_queued( struct proc *procp,
                                       struct aiocb *aiocbp );
static int      lio_create_async_entry( struct proc *procp,
                                        struct aiocb *aiocbp,
                                        struct sigevent *sigp,
                                        long group_tag,
                                        aio_workq_entry **entrypp );
static int      lio_create_sync_entry( struct proc *procp,
                                       struct aiocb *aiocbp,
                                       long group_tag,
                                       aio_workq_entry **entrypp );
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern struct file *    holdfp( struct filedesc *fdp, int fd, int flag );
extern int      dofileread( struct proc *p, struct file *fp, int fd,
                            void *buf, size_t nbyte, off_t offset,
                            int flags, int *retval );
extern int      dofilewrite( struct proc *p, struct file *fp, int fd,
                             const void *buf, size_t nbyte, off_t offset,
                             int flags, int *retval );
extern vm_map_t     vm_map_switch( vm_map_t map );
/*
 * aio external global variables.
 */
extern int  aio_max_requests;               /* AIO_MAX - configurable */
extern int  aio_max_requests_per_process;   /* AIO_PROCESS_MAX - configurable */
extern int  aio_worker_threads;             /* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb        aio_anchor;
static simple_lock_data_t   aio_lock;
static struct zone          *aio_workq_zonep;
/*
 * syscall input parameters
 */
#ifndef _SYS_SYSPROTO_H_

struct aio_cancel_args {
    int             fd;
    struct aiocb    *aiocbp;
};

struct aio_error_args {
    struct aiocb    *aiocbp;
};

struct aio_fsync_args {
    int             op;
    struct aiocb    *aiocbp;
};

struct aio_read_args {
    struct aiocb    *aiocbp;
};

struct aio_return_args {
    struct aiocb    *aiocbp;
};

struct aio_suspend_args {
    struct aiocb    *const *aiocblist;
    int             nent;
    const struct timespec   *timeoutp;
};

struct aio_write_args {
    struct aiocb    *aiocbp;
};

struct lio_listio_args {
    int             mode;
    struct aiocb    *const *aiocblist;
    int             nent;
    struct sigevent *sigp;
};

#endif /* _SYS_SYSPROTO_H_ */
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
int
aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
{
    struct aiocb        my_aiocb;
    int                 result;
    boolean_t           funnel_state;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    result = aio_get_all_queues_count( );
    if ( result < 1 ) {
        result = EBADF;
        goto ExitRoutine;
    }

    if ( uap->aiocbp != NULL ) {
        result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );
        if ( result != 0 ) {
            result = EAGAIN;
            goto ExitRoutine;
        }

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation. */
        if ( uap->fd != my_aiocb.aio_fildes ) {
            result = EBADF;
            goto ExitRoutine;
        }
    }

    /* current BSD code assumes funnel lock is held */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );
    result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
    (void) thread_funnel_set( kernel_flock, funnel_state );

    if ( result != -1 ) {
        *retval = result;
        result = 0;
        goto ExitRoutine;
    }

    result = EBADF;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );

    return( result );

} /* aio_cancel */
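/*
 * Illustrative user land view of the call above (a sketch, not a definitive
 * usage): cancelling everything outstanding on a descriptor and interpreting
 * the three possible results.
 *
 *     int r = aio_cancel( fd, NULL );       // NULL aiocbp == cancel all on fd
 *     if ( r == AIO_CANCELED )      ...     // every matching request was cancelled
 *     else if ( r == AIO_NOTCANCELED ) ...  // at least one request is already active
 *     else if ( r == AIO_ALLDONE )  ...     // everything had already completed
 */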
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_close( struct proc *p, int fd )
{
    int         error, count;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
    if ( count < 1 )
        return;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );

    return;

} /* _aio_close */
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
{
    aio_workq_entry     *entryp;
    int                 error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {
        error = EINVAL;
        goto ExitRoutine;
    }

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = entryp->errorval;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }
    error = EINVAL;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_error */
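/*
 * User land sketch (illustrative only): aio_error() is the polling side of the
 * interface and pairs with aio_return().
 *
 *     while ( aio_error( &cb ) == EINPROGRESS )
 *         ;                               // or sleep in aio_suspend()
 *     if ( aio_error( &cb ) == 0 )
 *         nbytes = aio_return( &cb );     // call exactly once to release resources
 */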
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call yet.
 */
int
aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
{
    int         error;
    int         fsync_kind;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    if ( uap->op == O_SYNC )
        fsync_kind = AIO_FSYNC;
#if 0 // we don't support fdatasync() call yet
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;
#endif
    else {
        error = EINVAL;
        goto ExitRoutine;
    }

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
    if ( error != 0 )
        *retval = -1;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_fsync */
/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
{
    int         error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_read */
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
{
    aio_workq_entry     *entryp;
    int                 error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {
        error = EINVAL;
        goto ExitRoutine;
    }

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count--;

            *retval = entryp->returnval;

            /* we cannot free requests that are still completing */
            if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
                vm_map_t        my_map;

                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );
            }
            else
                /* tell completion code to free this request */
                entryp->flags |= AIO_DO_FREE;
            error = 0;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            error = EINPROGRESS;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            error = EINPROGRESS;
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );
            goto ExitRoutine;
        }
    }
    error = EINVAL;

ExitRoutine:
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_return */
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * NOTE - kernel funnel lock is held when we get called.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec( struct proc *p )
{

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    _aio_exit( p );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

    return;

} /* _aio_exec */
/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_exit( struct proc *p )
{
    int                 error, count;
    aio_workq_entry     *entryp;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
    if ( count < 1 )
        return;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );
    }

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->aio_doneq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
        aio_anchor.aio_done_count--;

        /* we cannot free requests that are still completing */
        if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
            vm_map_t        my_map;

            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );

            /* need to start over since aio_doneq may have been */
            /* changed while we were away.  */
            entryp = TAILQ_FIRST( &p->aio_doneq );
            continue;
        }
        else
            /* tell completion code to free this request */
            entryp->flags |= AIO_DO_FREE;
        entryp = next_entryp;
    }

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

    return;

} /* _aio_exit */
/*
 * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel() )
 * NOTE - kernel funnel lock is held when we get called.
 */
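/*
 * Mode selection, restating the rules above in table form:
 *
 *     fd      aiocbp      meaning
 *     0       NULL        cancel every async IO for the process
 *     > 0     NULL        cancel every async IO for that file descriptor
 *     any     non-NULL    cancel the single request matching aiocbp
 */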
static int
do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
               boolean_t wait_for_completion, boolean_t disable_notification )
{
    aio_workq_entry     *entryp;
    int                 result;

    result = -1;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
                aio_anchor.aio_async_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;
                entryp->flags |= AIO_COMPLETION;

                /* do completion processing for this request */
                do_aio_completion( entryp );

                entryp->flags &= ~AIO_COMPLETION;
                if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                    vm_map_t        my_map;

                    my_map = entryp->aio_map;
                    entryp->aio_map = VM_MAP_NULL;
                    aio_free_request( entryp, my_map );
                }

                if ( aiocbp != NULL ) {
                    return( result );
                }

                /* need to start over since aio_async_workq may have been */
                /* changed while we were away doing completion processing. */
                entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
                continue;
            }
        }
        entryp = next_entryp;
    } /* while ( entryp != NULL ) */

    /*
     * look for a match on our queue of synchronous todo work.  This will
     * be a rare occurrence but could happen if a process is terminated while
     * processing a lio_listio call.
     */
    entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                if ( aiocbp != NULL ) {
                    return( result );
                }
            }
        }
        entryp = next_entryp;
    } /* while ( entryp != NULL ) */

    /*
     * look for a match on our queue of active async IO requests and
     * return AIO_NOTCANCELED result.
     */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( (aiocbp == NULL && fd == 0) ||
             (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
             (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            if ( wait_for_completion )
                entryp->flags |= AIO_WAITING; /* flag for special completion processing */
            if ( disable_notification )
                entryp->flags |= AIO_DISABLE; /* flag for special completion processing */
            if ( aiocbp != NULL ) {
                return( result );
            }
        }
    }

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                result = AIO_ALLDONE;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != NULL ) {
                    return( result );
                }
            }
        }
    }

    return( result );

} /* do_aio_cancel */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
int
aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
{
    int                 error;
    int                 i, count;
    uint64_t            abstime;
    struct timespec     ts;
    aio_workq_entry     *entryp;
    struct aiocb *      *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    *retval = -1;
    abstime = 0;
    aiocbpp = NULL;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
    if ( count < 1 ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
        error = EINVAL;
        goto ExitThisRoutine;
    }

    if ( uap->timeoutp != NULL ) {
        error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
        if ( error != 0 ) {
            error = EAGAIN;
            goto ExitThisRoutine;
        }

        if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            error = EINVAL;
            goto ExitThisRoutine;
        }

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );
    }

    MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL ) {
        error = EAGAIN;
        goto ExitThisRoutine;
    }

    /* copyin our aiocb pointers from list */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *aiocbp;

        /* copyin in aiocb pointer from list */
        error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(*aiocbpp) );
        if ( error != 0 ) {
            error = EAGAIN;
            goto ExitThisRoutine;
        }
    } /* for ( ; i < uap->nent; ) */

    /* check list of aio requests to see if any have completed */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *aiocbp;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == NULL )
            continue;

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( entryp->uaiocbp == aiocbp ) {
                *retval = 0;
                error = 0;
                goto ExitThisRoutine;
            }
        }
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().  We do not
     * use tsleep() here in order to avoid getting kernel funnel lock.
     */
    assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
    if ( abstime > 0 ) {
        thread_set_timer_deadline( abstime );
    }
    error = thread_block( THREAD_CONTINUE_NULL );

    if ( error == THREAD_AWAKENED ) {
        /* got our wakeup call from aio_work_thread() */
        if ( abstime > 0 ) {
            thread_cancel_timer();
        }
        *retval = 0;
        error = 0;
    }
    else if ( error == THREAD_TIMED_OUT ) {
        /* our timeout expired */
        error = EAGAIN;
    }
    else {
        /* we were interrupted */
        if ( abstime > 0 ) {
            thread_cancel_timer();
        }
        error = EINTR;
    }

ExitThisRoutine:
    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );

    return( error );

} /* aio_suspend */
/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
{
    int         error;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
    if ( error != 0 )
        *retval = -1;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );

    return( error );

} /* aio_write */
/*
 * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 * The caller gets error and return status for each aiocb in the list via aio_error
 * and aio_return.  We must keep completed requests until released by the
 * aio_return call.
 */
int
lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
{
    int                 i;
    int                 call_result;
    int                 result;
    long                group_tag;
    aio_workq_entry *   *entryp_listp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;
    call_result = -1;
    *retval = -1;
    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;
        goto ExitRoutine;
    }

    /*
     * we use group_tag to mark IO requests for delayed completion processing
     * which means we wait until all IO requests in the group have completed
     * before we either return to the caller when mode is LIO_WAIT or signal
     * user when mode is LIO_NOWAIT.
     */
    group_tag = random();

    /*
     * allocate a list of aio_workq_entry pointers that we will use to queue
     * up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;
        goto ExitRoutine;
    }

    /* process list of aio requests */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *my_aiocbp;

        *(entryp_listp + i) = NULL;

        /* copyin in aiocb pointer from list */
        result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
        if ( result != 0 ) {
            call_result = EAGAIN;
            continue;
        }

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == NULL )
            continue;

        if ( uap->mode == LIO_NOWAIT )
            result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
                                             group_tag, (entryp_listp + i) );
        else
            result = lio_create_sync_entry( p, my_aiocbp, group_tag,
                                            (entryp_listp + i) );

        if ( result != 0 && call_result == -1 )
            call_result = result;
    }

    /*
     * we need to protect this section since we do not want any of these grouped
     * IO requests to begin until we have them all on the queue.
     */
    for ( i = 0; i < uap->nent; i++ ) {
        aio_workq_entry     *entryp;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL )
            continue;

        /* check our aio limits to throttle bad or rude user land behavior */
        if ( aio_get_all_queues_count( ) >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
            vm_map_t        my_map;

            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );
            continue;
        }

        /* place the request on the appropriate queue */
        if ( uap->mode == LIO_NOWAIT ) {
            TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
            aio_anchor.aio_async_workq_count++;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                          (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
        }
        else {
            TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
            aio_anchor.lio_sync_workq_count++;
        }
    }

    if ( uap->mode == LIO_NOWAIT ) {
        /* caller does not want to wait so we'll fire off a worker thread and return */
        wakeup_one( &aio_anchor.aio_async_workq );
    }
    else {
        aio_workq_entry     *entryp;
        int                 error;

        /*
         * mode is LIO_WAIT - handle the IO requests now.
         */
        entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
        while ( entryp != NULL ) {
            if ( p == entryp->procp && group_tag == entryp->group_tag ) {
                boolean_t   funnel_state;

                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;

                // file system IO code path requires kernel funnel lock
                funnel_state = thread_funnel_set( kernel_flock, TRUE );
                if ( (entryp->flags & AIO_READ) != 0 ) {
                    error = do_aio_read( entryp );
                }
                else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                    error = do_aio_write( entryp );
                }
                else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                    error = do_aio_fsync( entryp );
                }
                else {
                    printf( "%s - unknown aio request - flags 0x%02X \n",
                            __FUNCTION__, entryp->flags );
                }
                entryp->errorval = error;
                if ( error != 0 && call_result == -1 )
                    call_result = EIO;
                (void) thread_funnel_set( kernel_flock, funnel_state );

                /* we're done with the IO request so move it on the done queue */
                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;
                p->aio_done_count++;

                /* need to start over since lio_sync_workq may have been changed while we */
                /* were away doing the IO.  */
                entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
                continue;
            } /* p == entryp->procp */

            entryp = TAILQ_NEXT( entryp, aio_workq_link );
        } /* while ( entryp != NULL ) */
    } /* uap->mode == LIO_WAIT */

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {
        call_result = 0;
        *retval = 0;
    }

ExitRoutine:
    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );

} /* lio_listio */
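/*
 * User land sketch (illustrative only): submitting a small batch and blocking
 * until the whole group is done, which exercises the LIO_WAIT path above.
 *
 *     struct aiocb *batch[2] = { &cb_a, &cb_b };
 *     cb_a.aio_lio_opcode = LIO_READ;
 *     cb_b.aio_lio_opcode = LIO_WRITE;
 *     if ( lio_listio( LIO_WAIT, batch, 2, NULL ) != 0 )
 *         ...   // per-request status still comes from aio_error()/aio_return()
 */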
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
static void
aio_work_thread( void )
{
    aio_workq_entry     *entryp;
    struct uthread      *uthread = (struct uthread *)get_bsdthread_info(current_act());

    for ( ;; ) {
        entryp = aio_get_some_work();
        if ( entryp == NULL ) {
            /*
             * aio worker threads wait for some work to get queued up
             * by aio_queue_async_request.  Once some work gets queued
             * it will wake up one of these worker threads just before
             * returning to our caller in user land.  We do not use
             * tsleep() here in order to avoid getting kernel funnel lock.
             */
            assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
            thread_block( THREAD_CONTINUE_NULL );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
                          0, 0, 0, 0, 0 );
        }
        else {
            int             error;
            boolean_t       funnel_state;
            vm_map_t        currentmap;
            vm_map_t        oldmap = VM_MAP_NULL;
            task_t          oldaiotask = TASK_NULL;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

            /*
             * Assume the target's address space identity for the duration
             * of the IO.
             */
            funnel_state = thread_funnel_set( kernel_flock, TRUE );

            currentmap = get_task_map( (current_proc())->task );
            if ( currentmap != entryp->aio_map ) {
                oldaiotask = uthread->uu_aio_task;
                uthread->uu_aio_task = entryp->procp->task;
                oldmap = vm_map_switch( entryp->aio_map );
            }

            if ( (entryp->flags & AIO_READ) != 0 ) {
                error = do_aio_read( entryp );
            }
            else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                error = do_aio_write( entryp );
            }
            else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                error = do_aio_fsync( entryp );
            }
            else {
                printf( "%s - unknown aio request - flags 0x%02X \n",
                        __FUNCTION__, entryp->flags );
                error = EINVAL;
            }
            entryp->errorval = error;
            if ( currentmap != entryp->aio_map ) {
                (void) vm_map_switch( oldmap );
                uthread->uu_aio_task = oldaiotask;
            }

            /* we're done with the IO request so pop it off the active queue and */
            /* push it on the done queue */
            TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
            aio_anchor.aio_active_count--;
            entryp->procp->aio_active_count--;
            TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count++;
            entryp->procp->aio_done_count++;
            entryp->flags |= AIO_COMPLETION;

            /* remove our reference to the user land map. */
            if ( VM_MAP_NULL != entryp->aio_map ) {
                vm_map_t        my_map;

                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                AIO_UNLOCK;  /* must unlock before calling vm_map_deallocate() */
                vm_map_deallocate( my_map );
            }

            do_aio_completion( entryp );
            (void) thread_funnel_set( kernel_flock, funnel_state );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                          entryp->returnval, 0 );

            entryp->flags &= ~AIO_COMPLETION;
            if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                vm_map_t        my_map;

                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );
            }
        }
    } /* for ( ;; ) */

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - AIO_LOCK must be held by caller
 */
static aio_workq_entry *
aio_get_some_work( void )
{
    aio_workq_entry     *entryp;

    /* pop some work off the work queue and add to our active queue */
    for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
          entryp != NULL;
          entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {

        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /* leave aio_fsync calls on the work queue if there are IO */
            /* requests on the active queue for the same file descriptor. */
            if ( aio_delay_fsync_request( entryp ) ) {

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
                continue;
            }
        }
        break;
    }

    if ( entryp != NULL ) {
        TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
        aio_anchor.aio_async_workq_count--;
        TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
        aio_anchor.aio_active_count++;
        entryp->procp->aio_active_count++;
    }

    return( entryp );

} /* aio_get_some_work */
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
 * this time.  Delay will happen when there are any active IOs for the same file
 * descriptor that were queued at time the aio_sync call was queued.
 * NOTE - AIO_LOCK must be held by caller
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->fsyncp != NULL &&
             entryp->uaiocbp == my_entryp->fsyncp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            return( TRUE );
        }
    }

    return( FALSE );

} /* aio_delay_fsync_request */
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
{
    aio_workq_entry     *entryp;
    int                 result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
    if ( result != 0 )
        goto error_exit;

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* check our aio limits to throttle bad or rude user land behavior */
    if ( aio_get_all_queues_count( ) >= aio_max_requests ||
         aio_get_process_count( procp ) >= aio_max_requests_per_process ) {
        result = EAGAIN;
        goto error_exit;
    }

    /*
     * aio_fsync calls sync up all async IO requests queued at the time
     * the aio_fsync call was made.  So we mark each currently queued async
     * IO with a matching file descriptor as must complete before we do the
     * fsync.  We set the fsyncp field of each matching async IO
     * request with the aiocb pointer passed in on the aio_fsync call to
     * know which IOs must complete before we process the aio_fsync call.
     */
    if ( (kindOfIO & AIO_FSYNC) != 0 )
        aio_mark_requests( entryp );

    /* queue up on our aio asynchronous work queue */
    TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
    aio_anchor.aio_async_workq_count++;

    wakeup_one( &aio_anchor.aio_async_workq );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    return( 0 );

error_exit:
    if ( entryp != NULL ) {
        /* this entry has not been queued up so no worries about unlocked */
        /* state and aio_map */
        aio_free_request( entryp, entryp->aio_map );
    }

    return( result );

} /* aio_queue_async_request */
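/*
 * Descriptive summary of the enqueue path above: copyin and validate the
 * user's aiocb, take a reference on the caller's vm_map so the buffer stays
 * resolvable, reject duplicates and over-limit processes, mark outstanding
 * IOs when this is an aio_fsync, then append to aio_async_workq and
 * wakeup_one() a worker thread.
 */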
/*
 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.  We get a reference to our caller's user land map in order to keep
 * it around while we are processing the request.
 * lio_listio calls behave differently at completion they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
static int
lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
                        struct sigevent *sigp, long group_tag,
                        aio_workq_entry **entrypp )
{
    aio_workq_entry     *entryp;
    int                 result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry.  */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    /* use sigevent passed in to lio_listio for each of our calls, but only */
    /* do completion notification after the last request completes. */
    if ( sigp != NULL ) {
        result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
        if ( result != 0 ) {
            result = EAGAIN;
            goto error_exit;
        }
    }

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );
    if ( result != 0 )
        goto error_exit;

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

    return( result );

} /* lio_create_async_entry */
/*
 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
 * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
 * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
 * NOTE - AIO_LOCK must be held by caller
 */
static void
aio_mark_requests( aio_workq_entry *entryp )
{
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;
        }
    }

    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( entryp->procp == my_entryp->procp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;
        }
    }

} /* aio_mark_requests */
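/*
 * Illustrative ordering example (sketch only): if a process queues
 *
 *     aio_write( &w1 );           // fd 5
 *     aio_write( &w2 );           // fd 5
 *     aio_fsync( O_SYNC, &f );    // fd 5
 *
 * then w1 and w2 get their fsyncp set to f's aiocb pointer here, and
 * aio_delay_fsync_request() keeps f on the work queue until both of them
 * have left the active queue.
 */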
/*
 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.
 * lio_listio calls behave differently at completion they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
static int
lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
                       long group_tag, aio_workq_entry **entrypp )
{
    aio_workq_entry     *entryp;
    int                 result;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {
        result = EAGAIN;
        goto error_exit;
    }
    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {
        result = EAGAIN;
        goto error_exit;
    }

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry.  */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
        result = 0;
        goto error_exit;
    }

    result = aio_validate( entryp );
    if ( result != 0 ) {
        goto error_exit;
    }

    *entrypp = entryp;
    return( 0 );

error_exit:
    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

    return( result );

} /* lio_create_sync_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.
 * We are not holding the lock here thus aio_map is passed in and
 * zeroed while we did have the lock.
 */
static int
aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
{
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != the_map ) {
        vm_map_deallocate( the_map );
    }

    zfree( aio_workq_zonep, (vm_offset_t) entryp );

    return( 0 );

} /* aio_free_request */
/* aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
    boolean_t           funnel_state;
    struct file         *fp;
    int                 flag;
    int                 result;

    result = 0;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
            return( 0 );
        else
            return( EINVAL );
    }

    flag = FREAD;
    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {
        flag = FWRITE;
    }

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        if ( entryp->aiocb.aio_offset < 0        ||
             entryp->aiocb.aio_nbytes < 0        ||
             entryp->aiocb.aio_nbytes > INT_MAX  ||
             entryp->aiocb.aio_buf == NULL )
            return( EINVAL );
    }

    /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
     * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
     * sigev_notify_function, and sigev_notify_attributes are ignored.
     */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
        int     signum;

        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
            return( EINVAL );
    }
    else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )
        return( EINVAL );

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.  This section requires
     * kernel funnel lock.
     */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );

    result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
    if ( result == 0 ) {
        if ( (fp->f_flag & flag) == 0 ) {
            /* we don't have read or write access */
            result = EBADF;
        }
        else if ( fp->f_type != DTYPE_VNODE ) {
            /* this is not a file */
            result = ESPIPE;
        }
    }
    else {
        result = EBADF;
    }

    (void) thread_funnel_set( kernel_flock, funnel_state );

    return( result );

} /* aio_validate */
/*
 * aio_get_process_count - runs through our queues that hold outstanding
 * async IO requests and totals up number of requests for the given
 * process.
 * NOTE - caller must hold aio lock!
 */
static int
aio_get_process_count( struct proc *procp )
{
    aio_workq_entry     *entryp;
    int                 count;

    /* begin with count of completed async IO requests for this process */
    count = procp->aio_done_count;

    /* add in count of active async IO requests for this process */
    count += procp->aio_active_count;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {
            count++;
        }
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {
            count++;
        }
    }

    return( count );

} /* aio_get_process_count */
/*
 * aio_get_all_queues_count - get total number of entries on all aio work queues.
 * NOTE - caller must hold aio lock!
 */
static int
aio_get_all_queues_count( void )
{
    int         count;

    count = aio_anchor.aio_async_workq_count;
    count += aio_anchor.lio_sync_workq_count;
    count += aio_anchor.aio_active_count;
    count += aio_anchor.aio_done_count;

    return( count );

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
    /* signal user land process if appropriate */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
         (entryp->flags & AIO_DISABLE) == 0 ) {

        /*
         * if group_tag is non zero then make sure this is the last IO request
         * in the group before we signal.
         */
        if ( entryp->group_tag == 0 ||
             (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp,
                          entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

            psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
            return;
        }
    }

    /*
     * need to handle case where a process is trying to exit, exec, or close
     * and is currently waiting for active aio requests to complete.  If
     * AIO_WAITING is set then we need to look to see if there are any
     * other requests in the active queue for this process.  If there are
     * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
     * there are some still active then do nothing - we only want to wakeup
     * when all active aio requests for the process are complete.
     */
    if ( (entryp->flags & AIO_WAITING) != 0 ) {
        int         active_requests;

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
                      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

        active_requests = aio_active_requests_for_process( entryp->procp );
        if ( active_requests < 1 ) {
            /* no active aio requests for this process, continue exiting */
            wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
        }
        return;
    }

    /*
     * A wakeup is also done here to handle the aio_suspend case when a signal
     * was not requested.  In that scenario we are sleeping on the
     * AIO_SUSPEND_SLEEP_CHAN channel.
     * NOTE - the assumption here is that this wakeup call is inexpensive.
     * we really only need to do this when an aio_suspend call is pending.
     * If we find the wakeup call should be avoided we could mark the
     * async IO requests given in the list provided by aio_suspend and only
     * call wakeup for them.  If we do mark them we should unmark them after
     * the aio_suspend wakes up.
     */
    wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
                  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    return;

} /* do_aio_completion */
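/*
 * Completion handling above takes one of three paths (descriptive summary):
 *   1) SIGEV_SIGNAL requested and not disabled - psignal() the process once
 *      the last request in a lio_listio group finishes.
 *   2) AIO_WAITING set (exit/exec/close cleanup) - wake the thread sleeping
 *      on AIO_CLEANUP_SLEEP_CHAN once no active requests remain.
 *   3) otherwise - wake any aio_suspend() sleeper on AIO_SUSPEND_SLEEP_CHAN.
 */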
/*
 * aio_last_group_io - checks to see if this is the last unfinished IO request
 * for the given group_tag.  Returns TRUE if there are no other active IO
 * requests for this group or FALSE if there are active IO requests.
 * NOTE - AIO_LOCK must be held by caller
 */
static boolean_t
aio_last_group_io( aio_workq_entry *entryp )
{
    aio_workq_entry     *my_entryp;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )
            return( FALSE );
    }

    return( TRUE );

} /* aio_last_group_io */
/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
    struct file         *fp;
    int                 error;

    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
    if ( fp == NULL )
        return( EBADF );

    error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
                        (void *)entryp->aiocb.aio_buf,
                        entryp->aiocb.aio_nbytes,
                        entryp->aiocb.aio_offset, FOF_OFFSET,
                        &entryp->returnval );

    return( error );

} /* do_aio_read */
/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
    struct file         *fp;
    int                 error;

    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
    if ( fp == NULL )
        return( EBADF );

    error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
                         (const void *)entryp->aiocb.aio_buf,
                         entryp->aiocb.aio_nbytes,
                         entryp->aiocb.aio_offset, FOF_OFFSET,
                         &entryp->returnval );

    return( error );

} /* do_aio_write */
/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 * NOTE - caller must hold aio lock!
 */
static int
aio_active_requests_for_process( struct proc *procp )
{
    return( procp->aio_active_count );

} /* aio_active_requests_for_process */
/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
    register struct vnode   *vp;
    struct file             *fp;
    int                     error;

    /*
     * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
     * AIO_DSYNC is caught before we queue up a request and flagged as an error.
     * The following was shamelessly extracted from fsync() implementation.
     */
    error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
    if ( error == 0 ) {
        vp = (struct vnode *)fp->f_data;
        vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
        error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
        VOP_UNLOCK( vp, 0, entryp->procp );
    }
    if ( error != 0 )
        entryp->returnval = -1;

    return( error );

} /* do_aio_fsync */
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 * NOTE - callers must hold aio lock!
 */
static boolean_t
is_already_queued( struct proc *procp,
                   struct aiocb *aiocbp )
{
    aio_workq_entry     *entryp;
    boolean_t           result;

    result = FALSE;

    /* look for matches on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            result = TRUE;
            goto ExitThisRoutine;
        }
    }

ExitThisRoutine:
    return( result );

} /* is_already_queued */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
    int         i;

    simple_lock_init( &aio_lock );

    TAILQ_INIT( &aio_anchor.aio_async_workq );
    TAILQ_INIT( &aio_anchor.lio_sync_workq );
    aio_anchor.aio_async_workq_count = 0;
    aio_anchor.lio_sync_workq_count = 0;
    aio_anchor.aio_active_count = 0;
    aio_anchor.aio_done_count = 0;

    i = sizeof( aio_workq_entry );
    aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

    _aio_create_worker_threads( aio_worker_threads );

    return;

} /* aio_init */
/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
    int         i;

    /* create some worker threads to handle the async IO requests */
    for ( i = 0; i < num; i++ ) {
        thread_t        myThread;

        myThread = kernel_thread( kernel_task, aio_work_thread );
        if ( THREAD_NULL == myThread ) {
            printf( "%s - failed to create a work thread \n", __FUNCTION__ );
        }
    }

    return;

} /* _aio_create_worker_threads */
/*
 * Return the current activation utask
 */
task_t
get_aiotask( void )
{
    return  ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;
}