/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/aio_kern.h>

#include <machine/limits.h>
#include <kern/zalloc.h>
#include <kern/task.h>

#include <sys/kdebug.h>
#define AIO_work_queued                 1
#define AIO_worker_wake                 2
#define AIO_completion_sig              3
#define AIO_completion_cleanup_wait     4
#define AIO_completion_cleanup_wake     5
#define AIO_completion_suspend_wake     6
#define AIO_fsync_delay                 7
#define AIO_cancel_async_workq          11
#define AIO_cancel_sync_workq           12
#define AIO_cancel_activeq              13
#define AIO_cancel_doneq                14
#define AIO_error_val                   61
#define AIO_error_activeq               62
#define AIO_error_workq                 63
#define AIO_return_val                  71
#define AIO_return_activeq              72
#define AIO_return_workq                73
#define AIO_exit_sleep                  91
#define AIO_close_sleep                 101
#define AIO_suspend                     110
#define AIO_suspend_sleep               111
#define AIO_worker_thread               120

#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT).  Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO.
 * Finally, requests move to the per-process aio_doneq (proc.aio_doneq)
 * when the IO request completes.  The request remains on aio_doneq until
 * the user process calls aio_return or the process exits; either way, that is
 * our trigger to release aio resources.
 */

struct aio_anchor_cb
{
    int                                 aio_async_workq_count;  /* entries on aio_async_workq */
    int                                 lio_sync_workq_count;   /* entries on lio_sync_workq */
    int                                 aio_active_count;       /* entries on all active queues (proc.aio_activeq) */
    int                                 aio_done_count;         /* entries on all done queues (proc.aio_doneq) */
    TAILQ_HEAD( , aio_workq_entry )     aio_async_workq;
    TAILQ_HEAD( , aio_workq_entry )     lio_sync_workq;
};
typedef struct aio_anchor_cb aio_anchor_cb;
/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure to use as
 * sleep channels that do not collide with any other kernel routines.
 * At this time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN  p_estcpu
#define AIO_CLEANUP_SLEEP_CHAN  p_pctcpu

/*
 * async IO locking macros used to protect critical sections.
 */
#define AIO_LOCK    usimple_lock( &aio_lock )
#define AIO_UNLOCK  usimple_unlock( &aio_lock )
/*
 * LOCAL PROTOTYPES
 */
static int          aio_active_requests_for_process( struct proc *procp );
static boolean_t    aio_delay_fsync_request( aio_workq_entry *entryp );
static int          aio_free_request( aio_workq_entry *entryp, vm_map_t the_map );
static int          aio_get_all_queues_count( void );
static int          aio_get_process_count( struct proc *procp );
static aio_workq_entry *  aio_get_some_work( void );
static boolean_t    aio_last_group_io( aio_workq_entry *entryp );
static void         aio_mark_requests( aio_workq_entry *entryp );
static int          aio_queue_async_request( struct proc *procp,
                                             struct aiocb *aiocbp,
                                             int kindOfIO );
static int          aio_validate( aio_workq_entry *entryp );
static void         aio_work_thread( void );
static int          do_aio_cancel( struct proc *p,
                                   int fd,
                                   struct aiocb *aiocbp,
                                   boolean_t wait_for_completion,
                                   boolean_t disable_notification );
static void         do_aio_completion( aio_workq_entry *entryp );
static int          do_aio_fsync( aio_workq_entry *entryp );
static int          do_aio_read( aio_workq_entry *entryp );
static int          do_aio_write( aio_workq_entry *entryp );
static boolean_t    is_already_queued( struct proc *procp,
                                       struct aiocb *aiocbp );
static int          lio_create_async_entry( struct proc *procp,
                                            struct aiocb *aiocbp,
                                            struct sigevent *sigp,
                                            long group_tag,
                                            aio_workq_entry **entrypp );
static int          lio_create_sync_entry( struct proc *procp,
                                           struct aiocb *aiocbp,
                                           long group_tag,
                                           aio_workq_entry **entrypp );
/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern struct file *  holdfp( struct filedesc *fdp, int fd, int flag );
extern int   dofileread( struct proc *p, struct file *fp, int fd,
                         void *buf, size_t nbyte, off_t offset,
                         int flags, int *retval );
extern int   dofilewrite( struct proc *p, struct file *fp, int fd,
                          const void *buf, size_t nbyte, off_t offset,
                          int flags, int *retval );
extern vm_map_t  vm_map_switch( vm_map_t map );

/*
 * aio external global variables.
 */
extern int aio_max_requests;              /* AIO_MAX - configurable */
extern int aio_max_requests_per_process;  /* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;            /* AIO_THREAD_COUNT - configurable */

/*
 * aio static variables.
 */
static aio_anchor_cb        aio_anchor;
static simple_lock_data_t   aio_lock;
static struct zone          *aio_workq_zonep;
/*
 * syscall input parameters
 */
#ifndef _SYS_SYSPROTO_H_

struct aio_cancel_args {
    int              fd;
    struct aiocb     *aiocbp;
};

struct aio_error_args {
    struct aiocb     *aiocbp;
};

struct aio_fsync_args {
    int              op;
    struct aiocb     *aiocbp;
};

struct aio_read_args {
    struct aiocb     *aiocbp;
};

struct aio_return_args {
    struct aiocb     *aiocbp;
};

struct aio_suspend_args {
    struct aiocb *const      *aiocblist;
    int                      nent;
    const struct timespec    *timeoutp;
};

struct aio_write_args {
    struct aiocb     *aiocbp;
};

struct lio_listio_args {
    int                      mode;
    struct aiocb *const      *aiocblist;
    int                      nent;
    struct sigevent          *sigp;
};

#endif /* _SYS_SYSPROTO_H_ */
/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
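/*
 * Illustrative user-land sketch (not part of this file): how aio_cancel(2)
 * is typically called and how its return values map onto the results this
 * routine produces.  The file descriptor and messages below are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <stdio.h>

static void
cancel_all_io_on_fd( int fd )
{
    /* a NULL aiocbp means: try to cancel every outstanding request on fd */
    switch ( aio_cancel( fd, NULL ) ) {
    case AIO_CANCELED:
        printf( "all requests cancelled\n" );
        break;
    case AIO_NOTCANCELED:
        printf( "some requests are already in progress\n" );
        break;
    case AIO_ALLDONE:
        printf( "all requests had already completed\n" );
        break;
    default:
        perror( "aio_cancel" );
    }
}
#endif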
aio_cancel( struct proc *p, struct aio_cancel_args *uap, int *retval )
    struct aiocb        my_aiocb;
    boolean_t           funnel_state;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    result = aio_get_all_queues_count( );

    if ( uap->aiocbp != NULL ) {
        result = copyin( uap->aiocbp, &my_aiocb, sizeof(my_aiocb) );

        /* NOTE - POSIX standard says a mismatch between the file */
        /* descriptor passed in and the file descriptor embedded in */
        /* the aiocb causes unspecified results.  We return EBADF in */
        /* that situation. */
        if ( uap->fd != my_aiocb.aio_fildes ) {

    /* current BSD code assumes funnel lock is held */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );
    result = do_aio_cancel( p, uap->fd, uap->aiocbp, FALSE, FALSE );
    (void) thread_funnel_set( kernel_flock, funnel_state );

    if ( result != -1 ) {

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, result, 0, 0 );
/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_close( struct proc *p, int fd )
    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
                  (int)p, fd, 0, 0, 0 );

    /* cancel all async IO requests on our todo queues for this file descriptor */
    error = do_aio_cancel( p, fd, NULL, TRUE, FALSE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * and file descriptor on the active async IO queue.  Active requests cannot
         * be cancelled so we must wait for them to complete.  We will get a special
         * wake up call on our channel used to sleep for ALL active requests to
         * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
         * when we must wait for all active aio requests.
         */
        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
                      (int)p, fd, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_close", 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
                  (int)p, fd, 0, 0, 0 );
/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp.  The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
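/*
 * Illustrative user-land sketch (not part of this file): polling a previously
 * submitted request with aio_error(2).  EINPROGRESS means the request is still
 * on the work or active queue; any other value is the errno the IO would have
 * set.  The aiocb is assumed to have been submitted with aio_read/aio_write.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static int
wait_for_aio( struct aiocb *cb )
{
    int err;

    while ( (err = aio_error( cb )) == EINPROGRESS )
        usleep( 1000 );     /* simple poll; aio_suspend() avoids the spin */
    return err;             /* 0 on success, else the errno value */
}
#endif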
aio_error( struct proc *p, struct aio_error_args *uap, int *retval )
    aio_workq_entry     *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = entryp->errorval;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            *retval = EINPROGRESS;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
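/*
 * Illustrative user-land sketch (not part of this file): queueing an
 * asynchronous fsync with aio_fsync(2).  Only O_SYNC is shown since
 * O_DSYNC/fdatasync() is not supported here; the aiocb only needs
 * aio_fildes (and optionally aio_sigevent) filled in.  Names are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <fcntl.h>
#include <string.h>

static int
queue_fsync( int fd, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    return aio_fsync( O_SYNC, cb );   /* 0 if queued, -1 + errno otherwise */
}
#endif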
aio_fsync( struct proc *p, struct aio_fsync_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

    if ( uap->op == O_SYNC )
        fsync_kind = AIO_FSYNC;
#if 0 // we don't support fdatasync() call yet
    else if ( uap->op == O_DSYNC )
        fsync_kind = AIO_DSYNC;

    error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
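/*
 * Illustrative user-land sketch (not part of this file): submitting an
 * asynchronous read with aio_read(2).  aio_fildes, aio_buf, aio_nbytes and
 * aio_offset are the fields checked by aio_validate() below; the helper name
 * and buffer are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
submit_read( int fd, void *buf, size_t len, off_t offset, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    cb->aio_buf    = buf;
    cb->aio_nbytes = len;
    cb->aio_offset = offset;
    return aio_read( cb );      /* 0 if queued, -1 + errno otherwise */
}
#endif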
aio_read( struct proc *p, struct aio_read_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp.  The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync).  This is where we release kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
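/*
 * Illustrative user-land sketch (not part of this file): reaping a completed
 * request.  aio_return(2) should be called exactly once per completed aiocb -
 * that call is what lets the kernel release the aio_workq_entry kept on the
 * done queue.  The aiocb is assumed to have been submitted earlier.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>

static ssize_t
reap_aio( struct aiocb *cb )
{
    if ( aio_error( cb ) == EINPROGRESS )
        return -1;              /* not done yet; calling aio_return now is invalid */
    return aio_return( cb );    /* bytes transferred, or -1 with errno set */
}
#endif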
aio_return( struct proc *p, struct aio_return_args *uap, register_t *retval )
    aio_workq_entry     *entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    if ( aio_get_all_queues_count( ) < 1 ) {

    /* look for a match on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count--;

            *retval = entryp->returnval;

            /* we cannot free requests that are still completing */
            if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );

            /* tell completion code to free this request */
            entryp->flags |= AIO_DO_FREE;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( entryp->uaiocbp == uap->aiocbp ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    /* look for a match on our queue of todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( p == entryp->procp && entryp->uaiocbp == uap->aiocbp ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_workq)) | DBG_FUNC_NONE,
                          (int)p, (int)uap->aiocbp, *retval, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec().  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.
 * NOTE - kernel funnel lock is held when we get called.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec( struct proc *p )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );

/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()).  We cancel any async IOs
 * we can and wait for those already active.  We also disable signaling
 * for cancelled or active aio requests that complete.  This routine MAY block!
 * NOTE - kernel funnel lock is held when we get called.
 */
__private_extern__ void
_aio_exit( struct proc *p )
    aio_workq_entry     *entryp;

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
                  (int)p, 0, 0, 0, 0 );

    /*
     * cancel async IO requests on the todo work queue and wait for those
     * already active to complete.
     */
    error = do_aio_cancel( p, 0, NULL, TRUE, TRUE );
    if ( error == AIO_NOTCANCELED ) {
        /*
         * AIO_NOTCANCELED is returned when we find an aio request for this process
         * on the active async IO queue.  Active requests cannot be cancelled so we
         * must wait for them to complete.  We will get a special wake up call on
         * our channel used to sleep for ALL active requests to complete.  This sleep
         * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
         * active aio requests.
         */
        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
                      (int)p, 0, 0, 0, 0 );

        tsleep( &p->AIO_CLEANUP_SLEEP_CHAN, PRIBIO, "aio_exit", 0 );

    /* release all aio resources used by this process */
    entryp = TAILQ_FIRST( &p->aio_doneq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        TAILQ_REMOVE( &p->aio_doneq, entryp, aio_workq_link );
        aio_anchor.aio_done_count--;

        /* we cannot free requests that are still completing */
        if ( (entryp->flags & AIO_COMPLETION) == 0 ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );

            /* need to start over since aio_doneq may have been */
            /* changed while we were away. */
            entryp = TAILQ_FIRST( &p->aio_doneq );

        /* tell completion code to free this request */
        entryp->flags |= AIO_DO_FREE;
        entryp = next_entryp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
                  (int)p, 0, 0, 0, 0 );
/*
 * do_aio_cancel - cancel async IO requests (if possible).  We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
 * aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 * NOTE - kernel funnel lock is held when we get called.
 */
do_aio_cancel( struct proc *p, int fd, struct aiocb *aiocbp,
               boolean_t wait_for_completion, boolean_t disable_notification )
    aio_workq_entry     *entryp;

    /* look for a match on our queue of async todo work. */
    entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
                aio_anchor.aio_async_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                entryp->flags |= AIO_COMPLETION;

                /* do completion processing for this request */
                do_aio_completion( entryp );

                entryp->flags &= ~AIO_COMPLETION;
                if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                    my_map = entryp->aio_map;
                    entryp->aio_map = VM_MAP_NULL;
                    aio_free_request( entryp, my_map );

                if ( aiocbp != NULL ) {

                /* need to start over since aio_async_workq may have been */
                /* changed while we were away doing completion processing. */
                entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );

        entryp = next_entryp;

    /*
     * look for a match on our queue of synchronous todo work.  This will
     * be a rare occurrence but could happen if a process is terminated while
     * processing a lio_listio call.
     */
    entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
    while ( entryp != NULL ) {
        aio_workq_entry     *next_entryp;

        next_entryp = TAILQ_NEXT( entryp, aio_workq_link );
        if ( p == entryp->procp ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                /* we found a match so we remove the entry from the */
                /* todo work queue and place it on the done queue */
                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;
                entryp->errorval = ECANCELED;
                entryp->returnval = -1;
                if ( disable_notification )
                    entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
                result = AIO_CANCELED;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_sync_workq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;

                if ( aiocbp != NULL ) {

        entryp = next_entryp;

    /*
     * look for a match on our queue of active async IO requests and
     * return AIO_NOTCANCELED result.
     */
    TAILQ_FOREACH( entryp, &p->aio_activeq, aio_workq_link ) {
        if ( (aiocbp == NULL && fd == 0) ||
             (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
             (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
            result = AIO_NOTCANCELED;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

            if ( wait_for_completion )
                entryp->flags |= AIO_WAITING;   /* flag for special completion processing */
            if ( disable_notification )
                entryp->flags |= AIO_DISABLE;   /* flag for special completion processing */
            if ( aiocbp != NULL ) {

    /*
     * if we didn't find any matches on the todo or active queues then look for a
     * match on our queue of async IO requests that have completed and if found
     * return AIO_ALLDONE result.
     */
    if ( result == -1 ) {
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( (aiocbp == NULL && fd == 0) ||
                 (aiocbp != NULL && entryp->uaiocbp == aiocbp) ||
                 (aiocbp == NULL && fd == entryp->aiocb.aio_fildes) ) {
                result = AIO_ALLDONE;

                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

                if ( aiocbp != NULL ) {

} /* do_aio_cancel */
/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * elapsed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * occurs.
 */
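/*
 * Illustrative user-land sketch (not part of this file): waiting for one of
 * several outstanding requests with aio_suspend(2), using the optional
 * timeout described above.  The two-element list and 5 second timeout are
 * hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <errno.h>
#include <time.h>

static int
wait_for_either( struct aiocb *a, struct aiocb *b )
{
    const struct aiocb  *list[2] = { a, b };
    struct timespec     ts = { 5, 0 };      /* give up after 5 seconds */

    if ( aio_suspend( list, 2, &ts ) == -1 )
        return errno;       /* EAGAIN on timeout, EINTR on signal */
    return 0;               /* at least one request has completed */
}
#endif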
aio_suspend( struct proc *p, struct aio_suspend_args *uap, int *retval )
    aio_workq_entry     *entryp;
    struct aiocb *      *aiocbpp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
                  (int)p, uap->nent, 0, 0, 0 );

    /* quick check to see if there are any async IO requests queued up */
    count = aio_get_all_queues_count( );
        goto ExitThisRoutine;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        goto ExitThisRoutine;

    if ( uap->timeoutp != NULL ) {
        error = copyin( (void *)uap->timeoutp, &ts, sizeof(ts) );
            goto ExitThisRoutine;

        if ( ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
            goto ExitThisRoutine;

        nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
                                     &abstime );
        clock_absolutetime_interval_to_deadline( abstime, &abstime );

    MALLOC( aiocbpp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( aiocbpp == NULL ) {
        goto ExitThisRoutine;

    /* check list of aio requests to see if any have completed */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *aiocbp;

        /* copyin in aiocb pointer from list */
        error = copyin( (void *)(uap->aiocblist + i), (aiocbpp + i), sizeof(aiocbp) );
            goto ExitThisRoutine;

        /* NULL elements are legal so check for 'em */
        aiocbp = *(aiocbpp + i);
        if ( aiocbp == NULL )

        /* return immediately if any aio request in the list is done */
        TAILQ_FOREACH( entryp, &p->aio_doneq, aio_workq_link ) {
            if ( entryp->uaiocbp == aiocbp ) {
                goto ExitThisRoutine;
    } /* for ( ; i < uap->nent; ) */

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
                  (int)p, uap->nent, 0, 0, 0 );

    /*
     * wait for an async IO to complete or a signal fires or timeout expires.
     * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
     * interrupts us.  If an async IO completes before a signal fires or our
     * timeout expires, we get a wakeup call from aio_work_thread().  We do not
     * use tsleep() here in order to avoid getting kernel funnel lock.
     */
    assert_wait( (event_t) &p->AIO_SUSPEND_SLEEP_CHAN, THREAD_ABORTSAFE );
        thread_set_timer_deadline( abstime );
    error = thread_block( THREAD_CONTINUE_NULL );
    if ( error == THREAD_AWAKENED ) {
        /* got our wakeup call from aio_work_thread() */
            thread_cancel_timer();
    else if ( error == THREAD_TIMED_OUT ) {
        /* our timeout expired */
        /* we were interrupted */
        if ( abstime > 0 ) {
            thread_cancel_timer();

    if ( aiocbpp != NULL )
        FREE( aiocbpp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
                  (int)p, uap->nent, error, 0, 0 );
/*
 * aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */
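/*
 * Illustrative user-land sketch (not part of this file): submitting an
 * asynchronous write with aio_write(2), mirroring the aio_read example
 * earlier.  The helper name, buffer and offset are hypothetical.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>
#include <sys/types.h>

static int
submit_write( int fd, void *buf, size_t len, off_t offset, struct aiocb *cb )
{
    memset( cb, 0, sizeof(*cb) );
    cb->aio_fildes = fd;
    cb->aio_buf    = buf;
    cb->aio_nbytes = len;
    cb->aio_offset = offset;
    return aio_write( cb );     /* 0 if queued, -1 + errno otherwise */
}
#endif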
aio_write( struct proc *p, struct aio_write_args *uap, int *retval )
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
                  (int)p, (int)uap->aiocbp, 0, 0, 0 );

    error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
                  (int)p, (int)uap->aiocbp, error, 0, 0 );
/*
 * lio_listio - initiate a list of IO requests.  We process the list of aiocbs
 * either synchronously (mode == LIO_WAIT) or asynchronously (mode == LIO_NOWAIT).
 * The caller gets error and return status for each aiocb in the list via aio_error
 * and aio_return.  We must keep completed requests until released by the
 * aio_return call.
 */
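/*
 * Illustrative user-land sketch (not part of this file): batching two reads
 * with lio_listio(2) in LIO_WAIT mode, which corresponds to the lio_sync_workq
 * path below.  Buffers and sizes are hypothetical; per-request status is still
 * collected through aio_error/aio_return afterwards.
 */
#if 0   /* user-land example, never compiled into the kernel */
#include <aio.h>
#include <string.h>

static int
read_two_blocks( int fd, char *buf0, char *buf1, size_t len )
{
    struct aiocb    cb[2];
    struct aiocb    *list[2] = { &cb[0], &cb[1] };
    int             i;

    memset( cb, 0, sizeof(cb) );
    for ( i = 0; i < 2; i++ ) {
        cb[i].aio_fildes     = fd;
        cb[i].aio_nbytes     = len;
        cb[i].aio_lio_opcode = LIO_READ;
    }
    cb[0].aio_buf = buf0;   cb[0].aio_offset = 0;
    cb[1].aio_buf = buf1;   cb[1].aio_offset = len;

    /* blocks until both requests are done (or fails with -1 + errno) */
    return lio_listio( LIO_WAIT, list, 2, NULL );
}
#endif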
lio_listio( struct proc *p, struct lio_listio_args *uap, int *retval )
    aio_workq_entry *   *entryp_listp;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
                  (int)p, uap->nent, uap->mode, 0, 0 );

    entryp_listp = NULL;

    if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
        call_result = EINVAL;

    if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
        call_result = EINVAL;

    /*
     * we use group_tag to mark IO requests for delayed completion processing
     * which means we wait until all IO requests in the group have completed
     * before we either return to the caller when mode is LIO_WAIT or signal
     * user when mode is LIO_NOWAIT.
     */
    group_tag = random();

    /*
     * allocate a list of aio_workq_entry pointers that we will use to queue
     * up all our requests at once while holding our lock.
     */
    MALLOC( entryp_listp, void *, (uap->nent * sizeof(struct aiocb *)), M_TEMP, M_WAITOK );
    if ( entryp_listp == NULL ) {
        call_result = EAGAIN;

    /* process list of aio requests */
    for ( i = 0; i < uap->nent; i++ ) {
        struct aiocb    *my_aiocbp;

        *(entryp_listp + i) = NULL;

        /* copyin in aiocb pointer from list */
        result = copyin( (void *)(uap->aiocblist + i), &my_aiocbp, sizeof(my_aiocbp) );
        if ( result != 0 ) {
            call_result = EAGAIN;

        /* NULL elements are legal so check for 'em */
        if ( my_aiocbp == NULL )

        if ( uap->mode == LIO_NOWAIT )
            result = lio_create_async_entry( p, my_aiocbp, uap->sigp,
                                             group_tag, (entryp_listp + i) );
            result = lio_create_sync_entry( p, my_aiocbp, group_tag,
                                            (entryp_listp + i) );

        if ( result != 0 && call_result == -1 )
            call_result = result;

    /*
     * we need to protect this section since we do not want any of these grouped
     * IO requests to begin until we have them all on the queue.
     */
    for ( i = 0; i < uap->nent; i++ ) {
        aio_workq_entry     *entryp;

        /* NULL elements are legal so check for 'em */
        entryp = *(entryp_listp + i);
        if ( entryp == NULL )

        /* check our aio limits to throttle bad or rude user land behavior */
        if ( aio_get_all_queues_count( ) >= aio_max_requests ||
             aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
             is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
            my_map = entryp->aio_map;
            entryp->aio_map = VM_MAP_NULL;
            aio_free_request( entryp, my_map );

        /* place the request on the appropriate queue */
        if ( uap->mode == LIO_NOWAIT ) {
            TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
            aio_anchor.aio_async_workq_count++;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                          (int)p, (int)entryp->uaiocbp, 0, 0, 0 );

            TAILQ_INSERT_TAIL( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
            aio_anchor.lio_sync_workq_count++;

    if ( uap->mode == LIO_NOWAIT )
        /* caller does not want to wait so we'll fire off a worker thread and return */
        wakeup_one( &aio_anchor.aio_async_workq );

        aio_workq_entry     *entryp;

        /*
         * mode is LIO_WAIT - handle the IO requests now.
         */
        entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
        while ( entryp != NULL ) {
            if ( p == entryp->procp && group_tag == entryp->group_tag ) {
                boolean_t   funnel_state;

                TAILQ_REMOVE( &aio_anchor.lio_sync_workq, entryp, aio_workq_link );
                aio_anchor.lio_sync_workq_count--;

                // file system IO code path requires kernel funnel lock
                funnel_state = thread_funnel_set( kernel_flock, TRUE );
                if ( (entryp->flags & AIO_READ) != 0 ) {
                    error = do_aio_read( entryp );
                else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                    error = do_aio_write( entryp );
                else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                    error = do_aio_fsync( entryp );
                    printf( "%s - unknown aio request - flags 0x%02X \n",
                            __FUNCTION__, entryp->flags );

                entryp->errorval = error;
                if ( error != 0 && call_result == -1 )
                (void) thread_funnel_set( kernel_flock, funnel_state );

                /* we're done with the IO request so move it on the done queue */
                TAILQ_INSERT_TAIL( &p->aio_doneq, entryp, aio_workq_link );
                aio_anchor.aio_done_count++;
                p->aio_done_count++;

                /* need to start over since lio_sync_workq may have been changed while we */
                /* were away doing the IO. */
                entryp = TAILQ_FIRST( &aio_anchor.lio_sync_workq );
            } /* p == entryp->procp */

            entryp = TAILQ_NEXT( entryp, aio_workq_link );
        } /* while ( entryp != NULL ) */
    } /* uap->mode == LIO_WAIT */

    /* call_result == -1 means we had no trouble queueing up requests */
    if ( call_result == -1 ) {

    if ( entryp_listp != NULL )
        FREE( entryp_listp, M_TEMP );

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
                  (int)p, call_result, 0, 0, 0 );

    return( call_result );
/*
 * aio worker thread.  this is where all the real work gets done.
 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
 * after new work is queued up.
 */
aio_work_thread( void )
    aio_workq_entry     *entryp;
    struct uthread      *uthread = (struct uthread *)get_bsdthread_info(current_act());

        entryp = aio_get_some_work();
        if ( entryp == NULL ) {
            /*
             * aio worker threads wait for some work to get queued up
             * by aio_queue_async_request.  Once some work gets queued
             * it will wake up one of these worker threads just before
             * returning to our caller in user land.  We do not use
             * tsleep() here in order to avoid getting kernel funnel lock.
             */
            assert_wait( (event_t) &aio_anchor.aio_async_workq, THREAD_UNINT );
            thread_block( THREAD_CONTINUE_NULL );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_wake)) | DBG_FUNC_NONE,
                          0, 0, 0, 0, 0 );

            boolean_t   funnel_state;
            vm_map_t    currentmap;
            vm_map_t    oldmap = VM_MAP_NULL;
            task_t      oldaiotask = TASK_NULL;

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

            /*
             * Assume the target's address space identity for the duration
             * of the IO.
             */
            funnel_state = thread_funnel_set( kernel_flock, TRUE );

            currentmap = get_task_map( (current_proc())->task );
            if ( currentmap != entryp->aio_map ) {
                oldaiotask = uthread->uu_aio_task;
                uthread->uu_aio_task = entryp->procp->task;
                oldmap = vm_map_switch( entryp->aio_map );

            if ( (entryp->flags & AIO_READ) != 0 ) {
                error = do_aio_read( entryp );
            else if ( (entryp->flags & AIO_WRITE) != 0 ) {
                error = do_aio_write( entryp );
            else if ( (entryp->flags & AIO_FSYNC) != 0 ) {
                error = do_aio_fsync( entryp );
                printf( "%s - unknown aio request - flags 0x%02X \n",
                        __FUNCTION__, entryp->flags );

            entryp->errorval = error;
            if ( currentmap != entryp->aio_map ) {
                (void) vm_map_switch( oldmap );
                uthread->uu_aio_task = oldaiotask;

            /* we're done with the IO request so pop it off the active queue and */
            /* push it on the done queue */
            TAILQ_REMOVE( &entryp->procp->aio_activeq, entryp, aio_workq_link );
            aio_anchor.aio_active_count--;
            entryp->procp->aio_active_count--;
            TAILQ_INSERT_TAIL( &entryp->procp->aio_doneq, entryp, aio_workq_link );
            aio_anchor.aio_done_count++;
            entryp->procp->aio_done_count++;
            entryp->flags |= AIO_COMPLETION;

            /* remove our reference to the user land map. */
            if ( VM_MAP_NULL != entryp->aio_map ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                AIO_UNLOCK;     /* must unlock before calling vm_map_deallocate() */
                vm_map_deallocate( my_map );

            do_aio_completion( entryp );
            (void) thread_funnel_set( kernel_flock, funnel_state );

            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
                          (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
                          entryp->returnval, 0 );

            entryp->flags &= ~AIO_COMPLETION;
            if ( (entryp->flags & AIO_DO_FREE) != 0 ) {
                my_map = entryp->aio_map;
                entryp->aio_map = VM_MAP_NULL;
                aio_free_request( entryp, my_map );

} /* aio_work_thread */
/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 */
static aio_workq_entry *
aio_get_some_work( void )
    aio_workq_entry     *entryp;

    /* pop some work off the work queue and add to our active queue */
    for ( entryp = TAILQ_FIRST( &aio_anchor.aio_async_workq );
          entryp != NULL;
          entryp = TAILQ_NEXT( entryp, aio_workq_link ) ) {

        if ( (entryp->flags & AIO_FSYNC) != 0 ) {
            /* leave aio_fsync calls on the work queue if there are IO */
            /* requests on the active queue for the same file descriptor. */
            if ( aio_delay_fsync_request( entryp ) ) {
                KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
                              (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    if ( entryp != NULL ) {
        TAILQ_REMOVE( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
        aio_anchor.aio_async_workq_count--;
        TAILQ_INSERT_TAIL( &entryp->procp->aio_activeq, entryp, aio_workq_link );
        aio_anchor.aio_active_count++;
        entryp->procp->aio_active_count++;

} /* aio_get_some_work */
/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed at
 * this time.  Delay will happen when there are any active IOs for the same file
 * descriptor that were queued at the time the aio_fsync call was queued.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_delay_fsync_request( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->fsyncp != NULL &&
             entryp->uaiocbp == my_entryp->fsyncp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {

} /* aio_delay_fsync_request */
/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
aio_queue_async_request( struct proc *procp, struct aiocb *aiocbp, int kindOfIO )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= kindOfIO;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

    /* check our aio limits to throttle bad or rude user land behavior */
    if ( aio_get_all_queues_count( ) >= aio_max_requests ||
         aio_get_process_count( procp ) >= aio_max_requests_per_process ) {

    /*
     * aio_fsync calls sync up all async IO requests queued at the time
     * the aio_fsync call was made.  So we mark each currently queued async
     * IO with a matching file descriptor as must complete before we do the
     * fsync.  We set the fsyncp field of each matching async IO
     * request with the aiocb pointer passed in on the aio_fsync call to
     * know which IOs must complete before we process the aio_fsync call.
     */
    if ( (kindOfIO & AIO_FSYNC) != 0 )
        aio_mark_requests( entryp );

    /* queue up on our aio asynchronous work queue */
    TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
    aio_anchor.aio_async_workq_count++;

    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
                  (int)procp, (int)aiocbp, 0, 0, 0 );

    wakeup_one( &aio_anchor.aio_async_workq );

    if ( entryp != NULL ) {
        /* this entry has not been queued up so no worries about unlocked */
        /* state and aio_map */
        aio_free_request( entryp, entryp->aio_map );

} /* aio_queue_async_request */
/*
 * lio_create_async_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.  We get a reference to our caller's user land map in order to keep
 * it around while we are processing the request.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
lio_create_async_entry( struct proc *procp, struct aiocb *aiocbp,
                        struct sigevent *sigp, long group_tag,
                        aio_workq_entry **entrypp )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry. */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    /* use sigevent passed in to lio_listio for each of our calls, but only */
    /* do completion notification after the last request completes. */
    if ( sigp != NULL ) {
        result = copyin( sigp, &entryp->aiocb.aio_sigevent, sizeof(entryp->aiocb.aio_sigevent) );
        if ( result != 0 ) {

    /* do some more validation on the aiocb and embedded file descriptor */
    result = aio_validate( entryp );

    /* get a reference to the user land map in order to keep it around */
    entryp->aio_map = get_task_map( procp->task );
    vm_map_reference( entryp->aio_map );

    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* lio_create_async_entry */
/*
 * aio_mark_requests - aio_fsync calls synchronize file data for all queued async IO
 * requests at the moment the aio_fsync call is queued.  We use aio_workq_entry.fsyncp
 * to mark each async IO that must complete before the fsync is done.  We use the uaiocbp
 * field from the aio_fsync call as the aio_workq_entry.fsyncp in marked requests.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_mark_requests( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;

    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( entryp->procp == my_entryp->procp &&
             entryp->aiocb.aio_fildes == my_entryp->aiocb.aio_fildes ) {
            my_entryp->fsyncp = entryp->uaiocbp;

} /* aio_mark_requests */
/*
 * lio_create_sync_entry - allocate an aio_workq_entry and fill it in.
 * If all goes well return 0 and pass the aio_workq_entry pointer back to
 * our caller.
 * lio_listio calls behave differently at completion: they do completion notification
 * when all async IO requests have completed.  We use group_tag to tag IO requests
 * that behave in the delay notification manner.
 */
lio_create_sync_entry( struct proc *procp, struct aiocb *aiocbp,
                       long group_tag, aio_workq_entry **entrypp )
    aio_workq_entry     *entryp;

    entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
    if ( entryp == NULL ) {

    bzero( entryp, sizeof(*entryp) );

    /* fill in the rest of the aio_workq_entry */
    entryp->procp = procp;
    entryp->uaiocbp = aiocbp;
    entryp->flags |= AIO_LIO;
    entryp->group_tag = group_tag;
    entryp->aio_map = VM_MAP_NULL;
    result = copyin( aiocbp, &entryp->aiocb, sizeof(entryp->aiocb) );
    if ( result != 0 ) {

    /* look for lio_listio LIO_NOP requests and ignore them. */
    /* Not really an error, but we need to free our aio_workq_entry. */
    if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {

    result = aio_validate( entryp );
    if ( result != 0 ) {

    if ( entryp != NULL )
        zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* lio_create_sync_entry */
/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.
 * We are not holding the lock here; thus aio_map is passed in and was
 * zeroed while we did have the lock.
 */
aio_free_request( aio_workq_entry *entryp, vm_map_t the_map )
    /* remove our reference to the user land map. */
    if ( VM_MAP_NULL != the_map ) {
        vm_map_deallocate( the_map );

    zfree( aio_workq_zonep, (vm_offset_t) entryp );

} /* aio_free_request */
/*
 * aio_validate - validate the aiocb passed in by one of the aio syscalls.
 */
aio_validate( aio_workq_entry *entryp )
    boolean_t           funnel_state;

    if ( (entryp->flags & AIO_LIO) != 0 ) {
        if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
            entryp->flags |= AIO_READ;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
            entryp->flags |= AIO_WRITE;
        else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )

    if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC)) != 0 ) {

    if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
        if ( entryp->aiocb.aio_offset < 0 ||
             entryp->aiocb.aio_nbytes < 0 ||
             entryp->aiocb.aio_nbytes > INT_MAX ||
             entryp->aiocb.aio_buf == NULL )

    /* validate aiocb.aio_sigevent.  at this point we only support sigev_notify
     * equal to SIGEV_SIGNAL or SIGEV_NONE.  this means sigev_value,
     * sigev_notify_function, and sigev_notify_attributes are ignored.
     */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ) {
        /* make sure we have a valid signal number */
        signum = entryp->aiocb.aio_sigevent.sigev_signo;
        if ( signum <= 0 || signum >= NSIG ||
             signum == SIGKILL || signum == SIGSTOP )
    else if ( entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_NONE )

    /* validate the file descriptor and that the file was opened
     * for the appropriate read / write access.  This section requires
     * kernel funnel lock.
     */
    funnel_state = thread_funnel_set( kernel_flock, TRUE );

    result = fdgetf( entryp->procp, entryp->aiocb.aio_fildes, &fp );
    if ( result == 0 ) {
        if ( (fp->f_flag & flag) == 0 ) {
            /* we don't have read or write access */
        else if ( fp->f_type != DTYPE_VNODE ) {
            /* this is not a file */

    (void) thread_funnel_set( kernel_flock, funnel_state );

} /* aio_validate */
/*
 * aio_get_process_count - runs through our queues that hold outstanding
 * async IO requests and totals up the number of requests for the given
 * process.
 * NOTE - caller must hold aio lock!
 */
aio_get_process_count( struct proc *procp )
    aio_workq_entry     *entryp;

    /* begin with count of completed async IO requests for this process */
    count = procp->aio_done_count;

    /* add in count of active async IO requests for this process */
    count += procp->aio_active_count;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp ) {

} /* aio_get_process_count */

/*
 * aio_get_all_queues_count - get total number of entries on all aio work queues.
 * NOTE - caller must hold aio lock!
 */
aio_get_all_queues_count( void )
    count = aio_anchor.aio_async_workq_count;
    count += aio_anchor.lio_sync_workq_count;
    count += aio_anchor.aio_active_count;
    count += aio_anchor.aio_done_count;

} /* aio_get_all_queues_count */
/*
 * do_aio_completion.  Handle async IO completion.
 */
do_aio_completion( aio_workq_entry *entryp )
    /* signal user land process if appropriate */
    if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
         (entryp->flags & AIO_DISABLE) == 0 ) {
        /*
         * if group_tag is non zero then make sure this is the last IO request
         * in the group before we signal.
         */
        if ( entryp->group_tag == 0 ||
             (entryp->group_tag != 0 && aio_last_group_io( entryp )) ) {
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp,
                          entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

            psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );

    /*
     * need to handle case where a process is trying to exit, exec, or close
     * and is currently waiting for active aio requests to complete.  If
     * AIO_WAITING is set then we need to look to see if there are any
     * other requests in the active queue for this process.  If there are
     * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.  If
     * there are some still active then do nothing - we only want to wakeup
     * when all active aio requests for the process are complete.
     */
    if ( (entryp->flags & AIO_WAITING) != 0 ) {
        int     active_requests;

        KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
                      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

        active_requests = aio_active_requests_for_process( entryp->procp );
        if ( active_requests < 1 ) {
            /* no active aio requests for this process, continue exiting */
            KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
                          (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

            wakeup_one( &entryp->procp->AIO_CLEANUP_SLEEP_CHAN );

    /*
     * This wakeup covers the aio_suspend case when a signal was not requested.
     * In that scenario we are sleeping on the AIO_SUSPEND_SLEEP_CHAN channel.
     * NOTE - the assumption here is that this wakeup call is inexpensive.
     * we really only need to do this when an aio_suspend call is pending.
     * If we find the wakeup call should be avoided we could mark the
     * async IO requests given in the list provided by aio_suspend and only
     * call wakeup for them.  If we do mark them we should unmark them after
     * the aio_suspend wakes up.
     */
    KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
                  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

    wakeup_one( &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );

} /* do_aio_completion */
/*
 * aio_last_group_io - checks to see if this is the last unfinished IO request
 * for the given group_tag.  Returns TRUE if there are no other active IO
 * requests for this group or FALSE if there are active IO requests.
 * NOTE - AIO_LOCK must be held by caller
 */
aio_last_group_io( aio_workq_entry *entryp )
    aio_workq_entry     *my_entryp;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( my_entryp, &entryp->procp->aio_activeq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( my_entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( my_entryp->group_tag == entryp->group_tag )

} /* aio_last_group_io */

do_aio_read( aio_workq_entry *entryp )
    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FREAD );
    error = dofileread( entryp->procp, fp, entryp->aiocb.aio_fildes,
                        (void *)entryp->aiocb.aio_buf,
                        entryp->aiocb.aio_nbytes,
                        entryp->aiocb.aio_offset, FOF_OFFSET,
                        &entryp->returnval );

do_aio_write( aio_workq_entry *entryp )
    fp = holdfp( entryp->procp->p_fd, entryp->aiocb.aio_fildes, FWRITE );
    error = dofilewrite( entryp->procp, fp, entryp->aiocb.aio_fildes,
                         (const void *)entryp->aiocb.aio_buf,
                         entryp->aiocb.aio_nbytes,
                         entryp->aiocb.aio_offset, FOF_OFFSET,
                         &entryp->returnval );

} /* do_aio_write */

/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 * NOTE - caller must hold aio lock!
 */
aio_active_requests_for_process( struct proc *procp )
    return( procp->aio_active_count );

} /* aio_active_requests_for_process */
do_aio_fsync( aio_workq_entry *entryp )
    register struct vnode   *vp;

    /*
     * NOTE - we will not support AIO_DSYNC until fdatasync() is supported.
     * AIO_DSYNC is caught before we queue up a request and flagged as an error.
     * The following was shamelessly extracted from fsync() implementation.
     */
    error = getvnode( entryp->procp, entryp->aiocb.aio_fildes, &fp );
        vp = (struct vnode *)fp->f_data;
        vn_lock( vp, LK_EXCLUSIVE | LK_RETRY, entryp->procp );
        error = VOP_FSYNC( vp, fp->f_cred, MNT_WAIT, entryp->procp );
        VOP_UNLOCK( vp, 0, entryp->procp );

        entryp->returnval = -1;

} /* do_aio_fsync */
/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 * NOTE - callers must hold aio lock!
 */
is_already_queued( struct proc *procp,
                   struct aiocb *aiocbp )
    aio_workq_entry     *entryp;

    /* look for matches on our queue of async IO requests that have completed */
    TAILQ_FOREACH( entryp, &procp->aio_doneq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of active async IO requests */
    TAILQ_FOREACH( entryp, &procp->aio_activeq, aio_workq_link ) {
        if ( aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of asynchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.aio_async_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

    /* look for matches on our queue of synchronous todo work */
    TAILQ_FOREACH( entryp, &aio_anchor.lio_sync_workq, aio_workq_link ) {
        if ( procp == entryp->procp && aiocbp == entryp->uaiocbp ) {
            goto ExitThisRoutine;

} /* is_already_queued */
/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
    simple_lock_init( &aio_lock );

    TAILQ_INIT( &aio_anchor.aio_async_workq );
    TAILQ_INIT( &aio_anchor.lio_sync_workq );
    aio_anchor.aio_async_workq_count = 0;
    aio_anchor.lio_sync_workq_count = 0;
    aio_anchor.aio_active_count = 0;
    aio_anchor.aio_done_count = 0;

    i = sizeof( aio_workq_entry );
    aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

    _aio_create_worker_threads( aio_worker_threads );

/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
    /* create some worker threads to handle the async IO requests */
    for ( i = 0; i < num; i++ ) {
        myThread = kernel_thread( kernel_task, aio_work_thread );
        if ( THREAD_NULL == myThread ) {
            printf( "%s - failed to create a work thread \n", __FUNCTION__ );

} /* _aio_create_worker_threads */

/*
 * Return the current activation utask
 */
    return ((struct uthread *)get_bsdthread_info(current_act()))->uu_aio_task;